Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/umath/simd.inc.src  86
1 file changed, 68 insertions(+), 18 deletions(-)
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0b14f5bfd..cd485034e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,6 +33,20 @@
#include <string.h> /* for memcpy */
#define VECTOR_SIZE_BYTES 16
+
+/*
+ * MAX_STEP_SIZE is used to decide whether to use the SIMD version of a ufunc:
+ * a very large step size can be as slow as scalar processing. The value of
+ * 2097152 (= 2MB) was chosen based on two considerations:
+ * 1) The typical Linux kernel page size is 4KB, but it can also be as large as
+ *    2MB (= 2097152 bytes). For a step size this large, every load/store of a
+ *    gather/scatter instruction falls on a different page (16 pages in total),
+ *    which is expected to slow these instructions down.
+ * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32, which
+ *    lets us use the i32 version of gather/scatter (as opposed to the i64
+ *    version) without problems (a step larger than NPY_MAX_INT32*esize/16 would
+ *    require i64gather/scatter). esize = element size = 4/8 bytes for float/double.
+ */
#define MAX_STEP_SIZE 2097152
static NPY_INLINE npy_uintp
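The i32-gather bound in point 2 above is easy to verify in isolation. The following standalone sketch (not part of the patch; MAX_STEP_SIZE_SKETCH simply mirrors the value defined above) spells out the arithmetic with static assertions:

#include <stdint.h>

#define MAX_STEP_SIZE_SKETCH 2097152  /* mirrors MAX_STEP_SIZE above */

/* With at most 16 lanes and esize = 4 (float) or 8 (double) bytes, the largest
 * element index generated for a gather/scatter stays below MAX_STEP_SIZE*16/esize:
 * 2097152*16/4 = 8388608 and 2097152*16/8 = 4194304, both well under
 * INT32_MAX (2147483647), so 32-bit indices are safe. */
_Static_assert((int64_t)MAX_STEP_SIZE_SKETCH * 16 / 4 < INT32_MAX,
               "float element indices fit in int32");
_Static_assert((int64_t)MAX_STEP_SIZE_SKETCH * 16 / 8 < INT32_MAX,
               "double element indices fit in int32");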
@@ -57,7 +71,7 @@ abs_ptrdiff(char *a, char *b)
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
* in which case we need two i64gather instructions and an additional vinsertf32x8
* instruction to load a single zmm register (since one i64gather instruction
- * loads into a ymm register). This hurts performance significantly.
+ * loads into a ymm register). This is not ideal for performance.
* 2) Gather and scatter instructions can be slow when the loads/stores
* cross page boundaries.
*
@@ -74,11 +88,10 @@ abs_ptrdiff(char *a, char *b)
* Input step should be smaller than MAX_STEP_SIZE for performance
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
- (steps[1] == (esize) && \
- (abs(steps[0]) < MAX_STEP_SIZE) && \
- (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
- ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
- ((abs_ptrdiff(args[1], args[0]) == 0))))
+ (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
+ (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+ ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+ ((abs_ptrdiff(args[1], args[0]) == 0))))
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
(steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
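Read as an ordinary function, the unary blockability check amounts to the following (a hypothetical helper shown only for readability, reusing the file's MAX_STEP_SIZE, abs_ptrdiff and npy_is_aligned; the patch itself keeps the macro form):

static NPY_INLINE int
is_output_blockable_unary(char **args, npy_intp const *steps,
                          npy_intp esize, npy_intp vsize)
{
    /* output contiguous, input step small enough for i32 gathers, both
     * pointers element-aligned, and the output either aliases the input
     * exactly or sits at least one vector width away from it */
    return steps[1] == esize &&
           abs(steps[0]) < MAX_STEP_SIZE &&
           npy_is_aligned(args[0], esize) &&
           npy_is_aligned(args[1], esize) &&
           (abs_ptrdiff(args[1], args[0]) >= vsize ||
            abs_ptrdiff(args[1], args[0]) == 0);
}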
@@ -1757,8 +1770,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
@mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
- npy_int index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
- for (npy_int ii = 0; ii < @num_lanes@; ii++) {
+ /*
+ * Note: although indices are generally npy_intp, the IS_BINARY_SMALL_STEPS
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+
+ npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+ for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
index_ip1[ii] = ii*stride_ip1;
index_ip2[ii] = ii*stride_ip2;
index_op[ii] = ii*stride_op;
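These int32 index arrays are what feed the 32-bit gather instructions on the AVX512F float path. A minimal self-contained sketch of that pattern (illustrative names only; the generated code additionally handles masking, the second operand, and the output scatter):

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical helper: gather 16 strided floats using 32-bit indices.
 * `stride` is in elements; an IS_BINARY_SMALL_STEPS-style precondition is
 * assumed, so ii*stride always fits in an int32. */
static __m512
gather16_strided_ps(const float *ip, int32_t stride)
{
    int32_t index[16];
    for (int32_t ii = 0; ii < 16; ii++) {
        index[ii] = ii * stride;               /* element offsets */
    }
    __m512i vindex = _mm512_loadu_si512((const void *)index);
    /* scale = 4 turns the element offsets into byte offsets */
    return _mm512_i32gather_ps(vindex, (const void *)ip, 4);
}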
@@ -1849,8 +1868,15 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
#if @replace_0_with_1@
@mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
#endif
- npy_int indexarr[16];
- for (npy_int ii = 0; ii < 16; ii++) {
+
+ /*
+ * Note: although indices are generally npy_intp, the IS_OUTPUT_BLOCKABLE_UNARY
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+
+ npy_int32 indexarr[16];
+ for (npy_int32 ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
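For reference, the templated load above expands roughly as follows for the two ISA variants (illustrative only; the actual substitution is done by the .src templating, and the 256-bit variant consumes only the first 8 of the 16 int32 indices):

__m256i vindex_avx2   = _mm256_loadu_si256((__m256i *)&indexarr[0]);  /* @vsize@ = 256 */
__m512i vindex_avx512 = _mm512_loadu_si512((__m512i *)&indexarr[0]);  /* @vsize@ = 512 */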
@@ -1928,8 +1954,14 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
#endif
@vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
- npy_int indexarr[8];
- for (npy_int ii = 0; ii < 8; ii++) {
+
+ /*
+ * Note: although indices are generally npy_intp, the IS_OUTPUT_BLOCKABLE_UNARY
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+ npy_int32 indexarr[8];
+ for (npy_int32 ii = 0; ii < 8; ii++) {
indexarr[ii] = ii*stride;
}
@vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
@@ -2047,8 +2079,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
@mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
- npy_int indexarr[16];
- for (npy_int ii = 0; ii < 16; ii++) {
+
+ /*
+ * Note: although indices are generally npy_intp, the IS_OUTPUT_BLOCKABLE_UNARY
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+ npy_int32 indexarr[16];
+ for (npy_int32 ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}
@vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@@ -2163,8 +2201,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
npy_float xmax = 88.72283935546875f;
npy_float xmin = -103.97208404541015625f;
- npy_int indexarr[16];
- for (npy_int ii = 0; ii < 16; ii++) {
+
+ /*
+ * Note: although indices are generally npy_intp, the IS_OUTPUT_BLOCKABLE_UNARY
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+ npy_int32 indexarr[16];
+ for (npy_int32 ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}
@@ -2287,8 +2331,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
{
const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
- npy_int indexarr[16];
- for (npy_int ii = 0; ii < 16; ii++) {
+
+ /*
+ * Note: although indices are generally npy_intp, the IS_OUTPUT_BLOCKABLE_UNARY
+ * precondition for this function guarantees that the maximum index fits
+ * in an int32.
+ */
+ npy_int32 indexarr[16];
+ for (npy_int32 ii = 0; ii < 16; ii++) {
indexarr[ii] = ii*stride;
}