 numpy/core/src/umath/simd.inc.src | 99
 1 file changed, 53 insertions(+), 46 deletions(-)
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index ee7030855..4bb8569be 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -32,13 +32,7 @@
 #include <float.h>
 #include <string.h> /* for memcpy */
 
-#if defined __AVX512F__
-#define VECTOR_SIZE_BYTES 64
-#elif defined __AVX2__
-#define VECTOR_SIZE_BYTES 32
-#else
 #define VECTOR_SIZE_BYTES 16
-#endif
 
 static NPY_INLINE npy_uintp
 abs_ptrdiff(char *a, char *b)
@@ -190,17 +184,24 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
     @type@ * ip2 = (@type@ *)args[1];
     @type@ * op = (@type@ *)args[2];
     npy_intp n = dimensions[0];
+#if defined __AVX512F__
+    const npy_intp vector_size_bytes = 64;
+#elif defined __AVX2__
+    const npy_intp vector_size_bytes = 32;
+#else
+    const npy_intp vector_size_bytes = 16;
+#endif
     /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
     /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
-    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
@@ -427,19 +428,20 @@ static void
 sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 64;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -447,16 +449,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -465,14 +467,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -481,20 +483,21 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
         }
     }
 #elif __AVX2__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 32;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
-        npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+        npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -502,16 +505,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -520,14 +523,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -601,18 +604,19 @@ static void
 sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
@@ -621,18 +625,19 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
@@ -667,18 +672,19 @@ static void
 sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
@@ -686,18 +692,19 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
     }
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
         }
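
Taken together, the hunks apply one pattern: each kernel now pins its vector width in a function-local const npy_intp (64 bytes under __AVX512F__, 32 under __AVX2__, 16 for the SSE2 baseline) instead of reading the file-scope VECTOR_SIZE_BYTES macro, while the LOOP_BLOCKED bodies keep their specialization on npy_is_aligned(), using the aligned load intrinsic only when an input pointer is known to be vector-size aligned. The standalone sketch below restates both ideas for the SSE2 case; is_aligned() and add_f32() are illustrative names and do not appear in the numpy source.

/*
 * Minimal sketch of the pattern in this patch, not the numpy templates:
 * a function-local const selects the vector width (the patch picks it
 * with the same #if defined __AVX512F__ / __AVX2__ / else chain), and
 * the main loop is specialized on input alignment. SSE2 only, for
 * brevity.
 */
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int
is_aligned(const void *p, size_t alignment)
{
    /* plays the role npy_is_aligned() plays in the kernels above */
    return ((uintptr_t)p & (alignment - 1)) == 0;
}

static void
add_f32(float *out, const float *a, const float *b, size_t n)
{
    const size_t vector_size_bytes = 16;               /* SSE2 XMM width */
    const size_t step = vector_size_bytes / sizeof(float);
    size_t i = 0;

    if (is_aligned(a, vector_size_bytes) && is_aligned(b, vector_size_bytes)) {
        /* both inputs aligned: the faster aligned loads are legal */
        for (; i + step <= n; i += step) {
            __m128 va = _mm_load_ps(&a[i]);
            __m128 vb = _mm_load_ps(&b[i]);
            _mm_storeu_ps(&out[i], _mm_add_ps(va, vb));
        }
    }
    else {
        /* at least one input unaligned: fall back to unaligned loads */
        for (; i + step <= n; i += step) {
            __m128 va = _mm_loadu_ps(&a[i]);
            __m128 vb = _mm_loadu_ps(&b[i]);
            _mm_storeu_ps(&out[i], _mm_add_ps(va, vb));
        }
    }
    for (; i < n; i++) {                               /* scalar tail */
        out[i] = a[i] + b[i];
    }
}

int
main(void)
{
    float a[6] = {1, 2, 3, 4, 5, 6};
    float b[6] = {10, 20, 30, 40, 50, 60};
    float out[6];
    add_f32(out, a, b, 6);
    printf("%g %g %g\n", out[0], out[3], out[5]);      /* 11 44 66 */
    return 0;
}

The real kernels go further, also specializing the ip1 == ip2 case and the mixed aligned/unaligned combinations, but the dispatch structure is the same.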