commit 62d03f5c115517b5a2e208f747c4799217a431bf
tree   fc0a334e068089d7e325307932893f8b87cdb6ee
parent 6e01373d0b3b75ecd4534e7f928acf009f7b8c8f
parent 928671055fde9469e68a5ef35686a244391b404d
Author: Andrew Lawson <andrewl@olney.nag.co.uk>
Date:   2017-10-20 11:26:27 +0100

Merge branch 'master' into nagfor-compatability
updating branch

 numpy/core/src/umath/loops.c.src | 93
 numpy/core/tests/test_umath.py   | 11
 numpy/linalg/linalg.py           | 15
 3 files changed, 59 insertions(+), 60 deletions(-)
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 670c39ea2..789717555 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1623,13 +1623,13 @@ NPY_NO_EXPORT void
  * when updating also update similar complex floats summation
  */
 static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
 {
     if (n < 8) {
         npy_intp i;
         @type@ res = 0.;
         for (i = 0; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@*)(a + i * stride)));
         }
         return res;
     }
@@ -1642,26 +1642,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = @trf@(a[0 * stride]);
-        r[1] = @trf@(a[1 * stride]);
-        r[2] = @trf@(a[2 * stride]);
-        r[3] = @trf@(a[3 * stride]);
-        r[4] = @trf@(a[4 * stride]);
-        r[5] = @trf@(a[5 * stride]);
-        r[6] = @trf@(a[6 * stride]);
-        r[7] = @trf@(a[7 * stride]);
+        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));

         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += @trf@(a[(i + 0) * stride]);
-            r[1] += @trf@(a[(i + 1) * stride]);
-            r[2] += @trf@(a[(i + 2) * stride]);
-            r[3] += @trf@(a[(i + 3) * stride]);
-            r[4] += @trf@(a[(i + 4) * stride]);
-            r[5] += @trf@(a[(i + 5) * stride]);
-            r[6] += @trf@(a[(i + 6) * stride]);
-            r[7] += @trf@(a[(i + 7) * stride]);
+            NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
         }

         /* accumulate now to avoid stack spills for single peel loop */
@@ -1670,7 +1670,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)

         /* do non multiple of 8 rest */
         for (; i < n; i++) {
-            res += @trf@(a[i * stride]);
+            res += @trf@(*((@dtype@ *)(a + i * stride)));
         }
         return res;
     }
@@ -1707,8 +1707,7 @@ NPY_NO_EXPORT void
         @type@ * iop1 = (@type@ *)args[0];
         npy_intp n = dimensions[0];

-        *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
-                                        steps[1] / (npy_intp)sizeof(@type@));
+        *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP(@type@) {
             io1 @OP@= *(@type@ *)ip2;
@@ -2065,8 +2064,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
 #if @PW@
         npy_intp n = dimensions[0];

-        io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
-                                    steps[1] / (npy_intp)sizeof(npy_half));
+        io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
 #else
         BINARY_REDUCE_LOOP_INNER {
             io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
@@ -2397,7 +2395,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN

 /* similar to pairwise sum of real floats */
 static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
                     npy_intp stride)
 {
     assert(n % 2 == 0);
@@ -2406,8 +2404,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
         *rr = 0.;
         *ri = 0.;
         for (i = 0; i < n; i += 2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2420,26 +2418,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
          * 8 times unroll reduces blocksize to 16 and allows vectorization with
          * avx without changing summation ordering
          */
-        r[0] = a[0 * stride];
-        r[1] = a[0 * stride + 1];
-        r[2] = a[2 * stride];
-        r[3] = a[2 * stride + 1];
-        r[4] = a[4 * stride];
-        r[5] = a[4 * stride + 1];
-        r[6] = a[6 * stride];
-        r[7] = a[6 * stride + 1];
+        r[0] = *((@ftype@ *)(a + 0 * stride));
+        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+        r[2] = *((@ftype@ *)(a + 2 * stride));
+        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+        r[4] = *((@ftype@ *)(a + 4 * stride));
+        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+        r[6] = *((@ftype@ *)(a + 6 * stride));
+        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));

         for (i = 8; i < n - (n % 8); i += 8) {
             /* small blocksizes seems to mess with hardware prefetch */
-            NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
-            r[0] += a[(i + 0) * stride];
-            r[1] += a[(i + 0) * stride + 1];
-            r[2] += a[(i + 2) * stride];
-            r[3] += a[(i + 2) * stride + 1];
-            r[4] += a[(i + 4) * stride];
-            r[5] += a[(i + 4) * stride + 1];
-            r[6] += a[(i + 6) * stride];
-            r[7] += a[(i + 6) * stride + 1];
+            NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
         }

         /* accumulate now to avoid stack spills for single peel loop */
@@ -2448,8 +2446,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,

         /* do non multiple of 8 rest */
         for (; i < n; i+=2) {
-            *rr += a[i * stride + 0];
-            *ri += a[i * stride + 1];
+            *rr += *((@ftype@ *)(a + i * stride + 0));
+            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
         }
         return;
     }
@@ -2481,8 +2479,7 @@ NPY_NO_EXPORT void
         @ftype@ * oi = ((@ftype@ *)args[0]) + 1;
         @ftype@ rr, ri;

-        pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
-                            steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+        pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
         *or @OP@= rr;
         *oi @OP@= ri;
         return;
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 5787a5183..bebeddc92 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -213,6 +213,17 @@ class TestComparisons(object):
         assert_equal(np.not_equal(a, a), [True])


+class TestAdd(object):
+    def test_reduce_alignment(self):
+        # gh-9876
+        # make sure arrays with weird strides work with the optimizations in
+        # pairwise_sum_@TYPE@. On x86, the 'b' field will count as aligned at a
+        # 4 byte offset, even though its itemsize is 8.
+        a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
+        a['a'] = -1
+        assert_equal(a['b'].sum(), 0)
+
+
 class TestDivision(object):
     def test_division_int(self):
         # int division should follow Python
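The loops.c.src changes above are the substance of the fix: pairwise_sum_@TYPE@ previously took a typed pointer plus an element stride computed as steps[1] / sizeof(@type@), and that integer division truncates whenever the byte stride is not a multiple of the itemsize. Passing the raw char * buffer and the byte stride through unchanged removes the division. The sketch below is plain Python mirroring the new test rather than the C internals; the printed values are what NumPy reports for this dtype:

    import numpy as np

    # 'b' (float64) is packed at byte offset 4, so the a['b'] view steps
    # through memory in 12-byte strides -- not a multiple of its 8-byte
    # itemsize.
    a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
    print(a.dtype.itemsize)  # 12
    print(a['b'].strides)    # (12,)

    # Before the fix, the reduce loop derived an element stride of
    # 12 // sizeof(double) == 1, reading the second element 8 bytes past
    # the first instead of 12 and picking up the -1 bits stored in the
    # neighbouring 'a' field.
    a['a'] = -1
    assert a['b'].sum() == 0.0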
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index bfd6ac28e..2f738c8a6 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1815,14 +1815,8 @@ def slogdet(a):
     real_t = _realType(result_t)
     signature = 'D->Dd' if isComplexType(t) else 'd->dd'
     sign, logdet = _umath_linalg.slogdet(a, signature=signature)
-    if isscalar(sign):
-        sign = sign.astype(result_t)
-    else:
-        sign = sign.astype(result_t, copy=False)
-    if isscalar(logdet):
-        logdet = logdet.astype(real_t)
-    else:
-        logdet = logdet.astype(real_t, copy=False)
+    sign = sign.astype(result_t, copy=False)
+    logdet = logdet.astype(real_t, copy=False)
     return sign, logdet

 def det(a):
@@ -1878,10 +1872,7 @@ def det(a):
     t, result_t = _commonType(a)
     signature = 'D->D' if isComplexType(t) else 'd->d'
     r = _umath_linalg.det(a, signature=signature)
-    if isscalar(r):
-        r = r.astype(result_t)
-    else:
-        r = r.astype(result_t, copy=False)
+    r = r.astype(result_t, copy=False)
     return r

 # Linear Least Squares
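The linalg.py hunks are an unrelated cleanup carried in by the merge: _umath_linalg.det and slogdet return a NumPy scalar for a single matrix but an ndarray for a stacked input, and the removed isscalar branches apparently predate scalar .astype accepting the copy keyword. A small sketch of the now-unified behaviour, assuming a NumPy recent enough that np.generic.astype accepts copy= (which the new code requires):

    import numpy as np

    single = np.linalg.det(np.eye(2))            # numpy.float64 scalar
    stacked = np.linalg.det(np.ones((3, 2, 2)))  # ndarray of shape (3,)

    # Scalars and arrays take the same astype signature, so the
    # isscalar() branching in slogdet()/det() is no longer needed.
    for r in (single, stacked):
        print(type(r.astype(np.float64, copy=False)))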