Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 60
1 file changed, 42 insertions, 18 deletions
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 69f003473..0b14f5bfd 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,6 +33,7 @@
 #include <string.h> /* for memcpy */
 
 #define VECTOR_SIZE_BYTES 16
+#define MAX_STEP_SIZE 2097152
 
 static NPY_INLINE npy_uintp
 abs_ptrdiff(char *a, char *b)
@@ -52,13 +53,32 @@ abs_ptrdiff(char *a, char *b)
       ((abs_ptrdiff(args[1], args[0]) == 0))))
 
 /*
+ * Avoid using SIMD for very large step sizes for several reasons:
+ * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
+ *    in which case we need two i64gather instructions and an additional vinsertf32x8
+ *    instruction to load a single zmm register (since one i64gather instruction
+ *    loads into a ymm register). This hurts performance significantly.
+ * 2) Gather and scatter instructions can be slow when the loads/stores
+ *    cross page boundaries.
+ *
+ * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
+ */
+#define IS_BINARY_SMALL_STEPS \
+    ((abs(steps[0]) < MAX_STEP_SIZE)  && \
+     (abs(steps[1]) < MAX_STEP_SIZE)  && \
+     (abs(steps[2]) < MAX_STEP_SIZE))
+
+/*
  * output should be contiguous, can handle strided input data
+ * Input step should be smaller than MAX_STEP_SIZE for performance
  */
 #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
     (steps[1] == (esize) && \
-     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
-     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
-      ((abs_ptrdiff(args[1], args[0]) == 0))))
+     (abs(steps[0]) < MAX_STEP_SIZE) && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+      ((abs_ptrdiff(args[1], args[0]) == 0))))
 
 #define IS_BLOCKABLE_REDUCE(esize, vsize) \
     (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
@@ -148,8 +168,12 @@ static NPY_INLINE int
 run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
-    AVX512F_@func@_@TYPE@(args, dimensions, steps);
-    return 1;
+    if (IS_BINARY_SMALL_STEPS) {
+        AVX512F_@func@_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
 #endif
     return 0;
 }
@@ -1722,9 +1746,9 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 static NPY_INLINE NPY_GCC_TARGET_AVX512F void
 AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 {
-    const npy_intp stride_ip1 = steps[0]/sizeof(@type@);
-    const npy_intp stride_ip2 = steps[1]/sizeof(@type@);
-    const npy_intp stride_op = steps[2]/sizeof(@type@);
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
     const npy_intp array_size = dimensions[0];
     npy_intp num_remaining_elements = array_size;
     @type@* ip1 = (@type@*) args[0];
@@ -1817,8 +1841,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                        const npy_intp array_size,
                        const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_intp num_remaining_elements = array_size;
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
@@ -1896,8 +1920,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                        const npy_intp array_size,
                        const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_double);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
     npy_intp num_remaining_elements = array_size;
     @mask@ load_mask = @isa@_get_full_load_mask_pd();
 #if @replace_0_with_1@
@@ -1992,8 +2016,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                  const npy_intp steps,
                  NPY_TRIG_OP my_trig_op)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_float large_number = 71476.0625f;
     if (my_trig_op == npy_compute_sin) {
         large_number = 117435.992f;
@@ -2135,8 +2159,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                  const npy_intp array_size,
                  const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_float xmax = 88.72283935546875f;
     npy_float xmin = -103.97208404541015625f;
     npy_int indexarr[16];
@@ -2261,8 +2285,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                  const npy_intp array_size,
                  const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_int indexarr[16];
     for (npy_int ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
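
Note on why MAX_STEP_SIZE = 2097152 (2^21 bytes) is safe for a 32-bit gather index: the patch caps every abs(steps[i]) at MAX_STEP_SIZE, so for 4-byte floats the element stride is at most 2^21/4 = 524288, and the largest lane offset in a 16-wide zmm gather is 15*524288 = 7864320, far below INT_MAX. A minimal standalone check of that arithmetic (illustrative only, not part of the patch; it assumes the single-precision, 16-lane case):

    #include <stdio.h>
    #include <limits.h>

    #define MAX_STEP_SIZE 2097152          /* 2^21 bytes, as defined in the patch */

    int main(void)
    {
        /* steps[] are byte strides; the element stride for a 4-byte float is
         * therefore at most MAX_STEP_SIZE/4, and the highest lane of a 16-wide
         * zmm gather uses index 15*stride. */
        long long max_elem_stride = MAX_STEP_SIZE / (long long)sizeof(float);
        long long worst_index = 15 * max_elem_stride;

        printf("worst-case i32gather index = %lld, INT_MAX = %d\n",
               worst_index, INT_MAX);
        return (worst_index < INT_MAX) ? 0 : 1;   /* 7864320 < 2147483647 */
    }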
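For context, the strided AVX-512 loops being guarded here build a 32-bit index vector once (indexarr[ii] = ii*stride) and issue a single i32gather per 16-element block; supporting arbitrary 64-bit strides would instead need two i64gather loads plus a vinsertf32x8 per zmm register, which is what the comment in the patch warns about. A hedged sketch of that access pattern, assuming AVX512F and a stride already validated against MAX_STEP_SIZE (strided_load_ps is a hypothetical standalone helper, not a function from simd.inc.src):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Sketch only: gather 16 floats spaced 'stride' elements apart into one zmm.
     * Requires -mavx512f; the indices must fit in 32 bits, which the
     * abs(steps[i]) < MAX_STEP_SIZE guard in the patch guarantees. */
    static __m512
    strided_load_ps(const float *ip, ptrdiff_t stride)
    {
        int32_t indexarr[16];
        for (int ii = 0; ii < 16; ii++) {
            indexarr[ii] = (int32_t)(ii * stride);   /* lane offsets in elements */
        }
        __m512i vindex = _mm512_loadu_si512((const void *)indexarr);
        return _mm512_i32gather_ps(vindex, ip, 4);   /* scale = sizeof(float) */
    }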