| | | |
|---|---|---|
| author | Julian Taylor <juliantaylor108@gmail.com> | 2018-12-16 10:36:53 +0100 |
| committer | Julian Taylor <juliantaylor108@gmail.com> | 2018-12-16 10:40:38 +0100 |
| commit | 81fe95c53fcde2fded5ab663a1dfc08184c4373a (patch) | |
| tree | f1b651d4c3d225e569467c50ae96040ec2fc1ed5 /numpy/core/src | |
| parent | f07a38da97a6a36eb12b203f6c1ffa4bf2b2cb87 (diff) | |
| download | numpy-81fe95c53fcde2fded5ab663a1dfc08184c4373a.tar.gz | |
BUG: only override vector size for avx code
Only a fraction of the code has AVX variants, so the size must not be
overridden globally.
Closes gh-12507
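
The shape of the fix, in miniature: previously the translation unit redefined `VECTOR_SIZE_BYTES` to 64 or 32 whenever AVX-512 or AVX2 was enabled, which also changed the blocking of routines that only have SSE2 bodies. The patch pins the global macro at 16 and introduces a function-local `vector_size_bytes` only in the functions that actually carry AVX variants (note that the dispatcher's `#else` fallback in the diff below uses 32). A minimal compilable sketch of the before/after pattern follows; `elements_per_block` is a hypothetical helper for illustration, not code from the patch:

```c
#include <stddef.h>

/* The global width stays at the SSE2 vector size: routines without
 * AVX variants keep using 16-byte blocking. */
#define VECTOR_SIZE_BYTES 16

/* A routine that does have AVX variants widens its block size locally,
 * mirroring the patch below. (Hypothetical helper, for illustration.) */
static size_t
elements_per_block(size_t element_size)
{
#if defined __AVX512F__
    const size_t vector_size_bytes = 64;   /* zmm width */
#elif defined __AVX2__
    const size_t vector_size_bytes = 32;   /* ymm width */
#else
    const size_t vector_size_bytes = VECTOR_SIZE_BYTES;   /* xmm width */
#endif
    return vector_size_bytes / element_size;
}
```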
Diffstat (limited to 'numpy/core/src')
| | | |
|---|---|---|
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 99 |

1 file changed, 53 insertions, 46 deletions
```diff
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index a3e00b5c1..4e0278bd4 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -32,13 +32,7 @@
 #include <float.h>
 #include <string.h> /* for memcpy */
 
-#if defined __AVX512F__
-#define VECTOR_SIZE_BYTES 64
-#elif defined __AVX2__
-#define VECTOR_SIZE_BYTES 32
-#else
 #define VECTOR_SIZE_BYTES 16
-#endif
 
 static NPY_INLINE npy_uintp
 abs_ptrdiff(char *a, char *b)
@@ -190,17 +184,24 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
     @type@ * ip2 = (@type@ *)args[1];
     @type@ * op = (@type@ *)args[2];
     npy_intp n = dimensions[0];
+#if defined __AVX512F__
+    const npy_intp vector_size_bytes = 64;
+#elif defined __AVX2__
+    const npy_intp vector_size_bytes = 32;
+#else
+    const npy_intp vector_size_bytes = 32;
+#endif
     /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
     /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
-    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
@@ -427,19 +428,20 @@ static void
 sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 64;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -447,16 +449,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -465,14 +467,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
         }
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -481,20 +483,21 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
             }
         }
     }
 #elif __AVX2__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 32;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
-        npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+        npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -502,16 +505,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -520,14 +523,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
         }
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -601,18 +604,19 @@ static void
 sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
@@ -621,18 +625,19 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
@@ -667,18 +672,19 @@ static void
 sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
@@ -686,18 +692,19 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
     }
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
```
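
The `LOOP_BLOCK_ALIGN_VAR` and `LOOP_BLOCKED` macros that take the new `vector_size_bytes` argument implement the file's usual peel/block pattern: scalar iterations until the output pointer is aligned, then full vector blocks with an aligned store, and a scalar remainder loop afterwards (handled by a separate macro in the real file). A rough self-contained sketch of that pattern — not the actual macro expansion — using a plain SSE2 float add in place of the templated `@OP@`:

```c
#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

/* Simplified shape of the peel/block/tail pattern, for float addition
 * with 16-byte (SSE2) blocks. */
static void
blocked_add_f32(float *op, const float *ip1, const float *ip2, size_t n)
{
    const size_t vector_size_bytes = 16;
    const size_t block = vector_size_bytes / sizeof(float);  /* 4 floats */
    size_t i = 0;

    /* Peel: scalar ops until the output pointer is aligned
     * (what LOOP_BLOCK_ALIGN_VAR arranges). */
    while (i < n && ((uintptr_t)&op[i] % vector_size_bytes) != 0) {
        op[i] = ip1[i] + ip2[i];
        i++;
    }
    /* Main blocked loop over full vectors (what LOOP_BLOCKED arranges);
     * the aligned store is safe because of the peel above. */
    for (; i + block <= n; i += block) {
        __m128 a = _mm_loadu_ps(&ip1[i]);   /* inputs may be unaligned */
        __m128 b = _mm_loadu_ps(&ip2[i]);
        _mm_store_ps(&op[i], _mm_add_ps(a, b));
    }
    /* Tail: remaining elements, done scalar. */
    for (; i < n; i++) {
        op[i] = ip1[i] + ip2[i];
    }
}
```

Seen through this sketch, the bug is clear: the peel and block sizes must match the vector width a function actually uses, so a routine with only SSE2 bodies must keep blocking at 16 bytes even when the file is compiled with AVX2 or AVX-512 enabled.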
