Diffstat (limited to 'numpy')
 -rw-r--r--  numpy/core/src/umath/simd.inc.src | 251
 1 file changed, 134 insertions(+), 117 deletions(-)
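
The patch below replaces every hardcoded SIMD block size in simd.inc.src (16, 32, or 64 bytes, depending on the branch) with a single VECTOR_SIZE_BYTES macro, chosen once at compile time from the widest x86 extension being targeted and #undef'd at the end of the file. A minimal sketch of the idea follows; ELEMS_PER_VECTOR is a hypothetical helper for illustration, not part of this patch:

    /* Pick the vector register width once, at compile time; all
     * blocking and alignment checks then scale with it. */
    #if defined(__AVX512F__)
    #define VECTOR_SIZE_BYTES 64   /* 512-bit zmm registers */
    #elif defined(__AVX2__)
    #define VECTOR_SIZE_BYTES 32   /* 256-bit ymm registers */
    #else
    #define VECTOR_SIZE_BYTES 16   /* 128-bit xmm registers (SSE2 baseline) */
    #endif

    /* Hypothetical helper: elements of a given type per vector register. */
    #define ELEMS_PER_VECTOR(type) (VECTOR_SIZE_BYTES / sizeof(type))

With this in place, an unrolled loop such as LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) keeps processing four vector registers per iteration regardless of which instruction set is compiled in, instead of pinning the block to 64 bytes as the old literals did.
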
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index da0713b2b..a3e00b5c1 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -32,6 +32,14 @@
#include <float.h>
#include <string.h> /* for memcpy */
+#if defined __AVX512F__
+#define VECTOR_SIZE_BYTES 64
+#elif defined __AVX2__
+#define VECTOR_SIZE_BYTES 32
+#else
+#define VECTOR_SIZE_BYTES 16
+#endif
+
static NPY_INLINE npy_uintp
abs_ptrdiff(char *a, char *b)
{
@@ -144,7 +152,7 @@ static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (@check@(sizeof(@type@), 16)) {
+ if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
return 1;
}
@@ -183,16 +191,16 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
@type@ * op = (@type@ *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), 16)) {
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), 16)) {
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
- else if (IS_BLOCKABLE_BINARY(sizeof(@type@), 16)) {
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
@@ -232,16 +240,16 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp *dimensions, npy_intp *steps
npy_bool * op = (npy_bool *)args[2];
npy_intp n = dimensions[0];
/* argument one scalar */
- if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), 16)) {
+ if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
/* argument two scalar */
- else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), 16)) {
+ else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
- else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), 16)) {
+ else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
@@ -302,7 +310,8 @@ static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_BINARY(sizeof(npy_bool), 16)) {
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
(npy_bool*)args[1], dimensions[0]);
return 1;
@@ -316,7 +325,8 @@ static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) {
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
dimensions[0]);
return 1;
@@ -340,7 +350,8 @@ static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_UNARY(sizeof(npy_bool), 16)) {
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
return 1;
}
@@ -416,19 +427,19 @@ static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], 64) && npy_is_aligned(&ip2[i], 64)) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -436,16 +447,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
- else if (npy_is_aligned(&ip1[i], 64)) {
- LOOP_BLOCKED(@type@, 64) {
+ else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
- else if (npy_is_aligned(&ip2[i], 64)) {
- LOOP_BLOCKED(@type@, 64) {
+ else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -454,14 +465,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
else {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -470,19 +481,20 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
#elif __AVX2__
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], 32) && npy_is_aligned(&ip2[i], 32)) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
+ npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -490,16 +502,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
- else if (npy_is_aligned(&ip1[i], 32)) {
- LOOP_BLOCKED(@type@, 32) {
+ else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
- else if (npy_is_aligned(&ip2[i], 32)) {
- LOOP_BLOCKED(@type@, 32) {
+ else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -508,14 +520,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
else {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -524,19 +536,20 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
#else
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[i];
/* lots of specializations, to squeeze out max performance */
- if (npy_is_aligned(&ip1[i], 16) && npy_is_aligned(&ip2[i], 16)) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
+ npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@@ -544,16 +557,16 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
- else if (npy_is_aligned(&ip1[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
- else if (npy_is_aligned(&ip2[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@@ -562,14 +575,14 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
else {
if (ip1 == ip2) {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@@ -589,17 +602,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
{
#ifdef __AVX512F__
const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], 64)) {
- LOOP_BLOCKED(@type@, 64) {
+ if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
@@ -609,17 +622,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
#elif __AVX2__
const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], 32)) {
- LOOP_BLOCKED(@type@, 32) {
+ if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
@@ -627,17 +640,17 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
#else
const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[0] @OP@ ip2[i];
- if (npy_is_aligned(&ip2[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
@@ -655,17 +668,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
{
#ifdef __AVX512F__
const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 64)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], 64)) {
- LOOP_BLOCKED(@type@, 64) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 64) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
@vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@vpre512@_store_@vsuf@(&op[i], c);
@@ -674,17 +687,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
#elif __AVX2__
const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 32)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], 32)) {
- LOOP_BLOCKED(@type@, 32) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
@vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@vpre256@_store_@vsuf@(&op[i], c);
@@ -692,17 +705,17 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
#else
const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @OP@ ip2[0];
- if (npy_is_aligned(&ip1[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
@vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
@vpre@_store_@vsuf@(&op[i], c);
@@ -742,10 +755,10 @@ sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
static void
sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
op[i] = npy_signbit(ip1[i]) != 0;
}
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
int r = @vpre@_movemask_@vsuf@(a);
if (sizeof(@type@) == 8) {
@@ -783,14 +796,14 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
#endif
#endif
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
op[i] = npy_@kind@(ip1[i]) != 0;
}
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
@vtype@ r1, r2, r3, r4;
#if @var@ != 0 /* isinf/isfinite */
/* fabs via masking of sign bit */
@@ -853,18 +866,18 @@ sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
}
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
- @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
@@ -881,14 +894,14 @@ static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
- LOOP_BLOCK_ALIGN_VAR(ip2, @type@, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
}
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * 16 / sizeof(@type@)]);
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
@@ -905,14 +918,14 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
@vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
- LOOP_BLOCK_ALIGN_VAR(ip1, @type@, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
}
- LOOP_BLOCKED(@type@, 64) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * 16 / sizeof(@type@)]);
- @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * 16 / sizeof(@type@)]);
- @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * 16 / sizeof(@type@)]);
- @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * 16 / sizeof(@type@)]);
+ LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
+ @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
+ @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
@vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
@vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
@vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
@@ -928,19 +941,20 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
static void
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
- /* align output to 16 bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ /* align output to VECTOR_SIZE_BYTES bytes */
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
op[i] = @scalarf@(ip[i]);
}
- assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16));
- if (npy_is_aligned(&ip[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
+ npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
+ if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
}
@@ -979,19 +993,20 @@ sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
*/
const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
- /* align output to 16 bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16) {
+ /* align output to VECTOR_SIZE_BYTES bytes */
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
op[i] = @scalar@_@type@(ip[i]);
}
- assert(n < (16 / sizeof(@type@)) || npy_is_aligned(&op[i], 16));
- if (npy_is_aligned(&ip[i], 16)) {
- LOOP_BLOCKED(@type@, 16) {
+ assert(n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
+ npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
+ if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
}
}
else {
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
@vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
}
@@ -1012,11 +1027,11 @@ sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
static void
sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
{
- const npy_intp stride = 16 / (npy_intp)sizeof(@type@);
- LOOP_BLOCK_ALIGN_VAR(ip, @type@, 16) {
+ const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
+ LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
*op = (npy_isnan(*op) || *op @OP@ ip[i]) ? *op : ip[i];
}
- assert(n < (stride) || npy_is_aligned(&ip[i], 16));
+ assert(n < (stride) || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
if (i + 3 * stride <= n) {
/* load the first elements */
@vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@@ -1025,7 +1040,7 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
/* minps/minpd will set invalid flag if nan is encountered */
npy_clear_floatstatus_barrier((char*)&c1);
- LOOP_BLOCKED(@type@, 32) {
+ LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
@vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
@vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
@@ -1090,9 +1105,9 @@ static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = ip1[i] @op@ ip2[i];
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
@vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
#if @and@
@@ -1117,16 +1132,16 @@ static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
{
const @vtype@ zero = @vpre@_setzero_@vsuf@();
- LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, 16) {
+ LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
*op = *op @op@ ip[i];
if (*op @sc@ 0) {
return;
}
}
/* unrolled once to replace a slow movmsk with a fast pmaxb */
- LOOP_BLOCKED(npy_bool, 32) {
+ LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
@vtype@ v = @vload@((@vtype@*)&ip[i]);
- @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
+ @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
v = @vpre@_cmpeq_epi8(v, zero);
v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
@@ -1164,9 +1179,9 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
static void
sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
op[i] = (ip[i] @op@ 0);
- LOOP_BLOCKED(@type@, 16) {
+ LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
@vtype@ a = @vloadu@((@vtype@*)&ip[i]);
#if @not@
const @vtype@ zero = @vpre@_setzero_@vsuf@();
@@ -1187,6 +1202,8 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/
+#undef VECTOR_SIZE_BYTES
+
#endif /* NPY_HAVE_SSE2_INTRINSICS */
#endif
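
For context on the macros this patch parameterizes: LOOP_BLOCK_ALIGN_VAR peels scalar iterations until the named pointer is aligned to the given block size, LOOP_BLOCKED then walks the array one block at a time, and a scalar tail loop covers the remainder. Below is a self-contained sketch of that peel/blocked/tail pattern for float addition using SSE2 intrinsics; add_float and its exact layout are illustrative, not code from simd.inc.src:

    #include <emmintrin.h>  /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    #define VECTOR_SIZE_BYTES 16    /* SSE2 baseline width */

    static void
    add_float(float *out, const float *a, const float *b, size_t n)
    {
        const size_t stride = VECTOR_SIZE_BYTES / sizeof(float);
        size_t i = 0;

        /* peel: scalar iterations until the output pointer is aligned */
        for (; i < n && ((uintptr_t)&out[i] % VECTOR_SIZE_BYTES) != 0; i++) {
            out[i] = a[i] + b[i];
        }
        /* blocked: aligned stores to out; unaligned loads accept any input */
        for (; i + stride <= n; i += stride) {
            __m128 va = _mm_loadu_ps(&a[i]);
            __m128 vb = _mm_loadu_ps(&b[i]);
            _mm_store_ps(&out[i], _mm_add_ps(va, vb));
        }
        /* tail: leftover elements that do not fill a whole vector */
        for (; i < n; i++) {
            out[i] = a[i] + b[i];
        }
    }

The real sse2_binary_@kind@_@TYPE@ above goes further and dispatches between _load_ and _loadu_ variants per input pointer, since aligned loads are cheaper on many of the targeted CPUs; the "lots of specializations" comments in the diff mark exactly those branches.
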