Diffstat (limited to 'numpy')
 numpy/core/src/umath/simd.inc.src | 60
 1 file changed, 42 insertions(+), 18 deletions(-)
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 69f003473..0b14f5bfd 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,6 +33,7 @@
#include <string.h> /* for memcpy */
#define VECTOR_SIZE_BYTES 16
+#define MAX_STEP_SIZE 2097152
static NPY_INLINE npy_uintp
abs_ptrdiff(char *a, char *b)
@@ -52,13 +53,32 @@ abs_ptrdiff(char *a, char *b)
((abs_ptrdiff(args[1], args[0]) == 0))))
/*
+ * Avoid using SIMD for very large step sizes for two reasons:
+ * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
+ * in which case we need two i64gather instructions and an additional vinsertf32x8
+ * instruction to load a single zmm register (since one i64gather instruction
+ * loads into a ymm register). This hurts performance significantly.
+ * 2) Gather and scatter instructions can be slow when the loads/stores
+ * cross page boundaries.
+ *
+ * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
+ */
+#define IS_BINARY_SMALL_STEPS \
+ ((abs(steps[0]) < MAX_STEP_SIZE) && \
+ (abs(steps[1]) < MAX_STEP_SIZE) && \
+ (abs(steps[2]) < MAX_STEP_SIZE))
+
+/*
* output should be contiguous, can handle strided input data
+ * Input step should be smaller than MAX_STEP_SIZE for performance
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
(steps[1] == (esize) && \
- (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
- ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
- ((abs_ptrdiff(args[1], args[0]) == 0))))
+ (abs(steps[0]) < MAX_STEP_SIZE) && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
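
A minimal standalone sketch of the bound the new IS_BINARY_SMALL_STEPS macro enforces, assuming the 16-lane float case (one zmm register) and the MAX_STEP_SIZE value of 2097152 bytes (2^21) defined above; the check itself is illustrative and not part of the patch:

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    #define MAX_STEP_SIZE 2097152          /* same value as in the patch */

    int main(void)
    {
        /* Worst case for a 32-bit gather/scatter index: 4-byte floats,
         * 16 lanes per zmm register, byte step just under MAX_STEP_SIZE.
         * The per-lane index passed to i32gather_ps is lane * stride, and
         * the base pointer advances every iteration, so lane 15 sees the
         * largest index the kernel ever uses. */
        long long stride = (MAX_STEP_SIZE - 1) / (long long)sizeof(float);
        long long max_index = 15 * stride;

        assert(max_index < INT_MAX);       /* fits in a 32-bit index element */
        printf("worst-case index %lld < INT_MAX %d\n", max_index, INT_MAX);
        return 0;
    }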
@@ -148,8 +168,12 @@ static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- AVX512F_@func@_@TYPE@(args, dimensions, steps);
- return 1;
+ if (IS_BINARY_SMALL_STEPS) {
+ AVX512F_@func@_@TYPE@(args, dimensions, steps);
+ return 1;
+ }
+ else
+ return 0;
#endif
return 0;
}
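
The return value is what lets the generic strided loop take over when the steps are too large: 1 means the AVX-512F kernel handled the whole array, 0 means the caller must run its ordinary fallback. A hedged sketch of that calling pattern, written with the file's own @func@/@TYPE@ template placeholders and a hypothetical scalar_op() standing in for the scalar kernel (the real caller lives in the generated loops, not in this file):

    /* Try the AVX-512F kernel first; it declines (returns 0) when the build
     * lacks AVX-512F support or when IS_BINARY_SMALL_STEPS fails, i.e. some
     * |step| >= MAX_STEP_SIZE. */
    if (!run_binary_avx512f_@func@_@TYPE@(args, dimensions, steps)) {
        for (npy_intp i = 0; i < dimensions[0]; i++) {
            const @type@ in1 = *(@type@ *)(args[0] + i * steps[0]);
            const @type@ in2 = *(@type@ *)(args[1] + i * steps[1]);
            *(@type@ *)(args[2] + i * steps[2]) = scalar_op(in1, in2);
        }
    }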
@@ -1722,9 +1746,9 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
- const npy_intp stride_ip1 = steps[0]/sizeof(@type@);
- const npy_intp stride_ip2 = steps[1]/sizeof(@type@);
- const npy_intp stride_op = steps[2]/sizeof(@type@);
+ const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+ const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
+ const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
const npy_intp array_size = dimensions[0];
npy_intp num_remaining_elements = array_size;
@type@* ip1 = (@type@*) args[0];
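
The new (npy_intp) casts keep these divisions signed: sizeof evaluates to the unsigned size_t, so a negative step (a reverse-strided array) would be converted to unsigned before the division and yield a huge, meaningless stride. A small standalone demonstration of the difference, using ptrdiff_t in place of npy_intp:

    #include <stddef.h>
    #include <stdio.h>

    int main(void)
    {
        ptrdiff_t step = -8;   /* e.g. a reversed contiguous double array */

        /* step is converted to size_t first: prints a huge positive value */
        printf("unsigned division: %zu\n", step / sizeof(double));

        /* the cast keeps the division signed: prints -1 as intended */
        printf("signed division:   %td\n", step / (ptrdiff_t)sizeof(double));
        return 0;
    }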
@@ -1817,8 +1841,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_intp array_size,
const npy_intp steps)
{
- const npy_intp stride = steps/sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
npy_intp num_remaining_elements = array_size;
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
@mask@ load_mask = @isa@_get_full_load_mask_ps();
@@ -1896,8 +1920,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_intp array_size,
const npy_intp steps)
{
- const npy_intp stride = steps/sizeof(npy_double);
- const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+ const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
+ const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
npy_intp num_remaining_elements = array_size;
@mask@ load_mask = @isa@_get_full_load_mask_pd();
#if @replace_0_with_1@
@@ -1992,8 +2016,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_intp steps,
NPY_TRIG_OP my_trig_op)
{
- const npy_intp stride = steps/sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
npy_float large_number = 71476.0625f;
if (my_trig_op == npy_compute_sin) {
large_number = 117435.992f;
@@ -2135,8 +2159,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_intp array_size,
const npy_intp steps)
{
- const npy_intp stride = steps/sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
npy_float xmax = 88.72283935546875f;
npy_float xmin = -103.97208404541015625f;
npy_int indexarr[16];
@@ -2261,8 +2285,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_intp array_size,
const npy_intp steps)
{
- const npy_intp stride = steps/sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+ const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
npy_int indexarr[16];
for (npy_int ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;