author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2019-11-13 14:25:55 -0800 |
---|---|---|
committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-01-28 08:52:42 -0800 |
commit | 89618ad43b1fa867888fe8e18b7d85e264eb4f27 (patch) | |
tree | 1972272eabe71c06b667839ea12145a710671cdd | |
parent | 40ce133a2994c04ab5a96d4e69c78095ee4f5388 (diff) | |
download | numpy-89618ad43b1fa867888fe8e18b7d85e264eb4f27.tar.gz | |
MAINT: Use npy_int32 instead of npy_int, added some comments
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 86 |
1 file changed, 68 insertions, 18 deletions
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0b14f5bfd..cd485034e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -33,6 +33,20 @@
 #include <string.h> /* for memcpy */
 
 #define VECTOR_SIZE_BYTES 16
+
+/*
+ * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
+ * Very large step size can be as slow as processing it using scalar. The
+ * value of 2097152 ( = 2MB) was chosen using 2 considerations:
+ * 1) Typical linux kernel page size is 4Kb, but sometimes it could also be 2MB
+ *    which is == 2097152 Bytes. For a step size as large as this, surely all
+ *    the loads/stores of gather/scatter instructions falls on 16 different pages
+ *    which one would think would slow down gather/scatter instructions.
+ * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 which
+ *    allows us to use i32 version of gather/scatter (as opposed to the i64 version)
+ *    without problems (step larger than NPY_MAX_INT32*esize/16 would require use of
+ *    i64gather/scatter). esize = element size = 4/8 bytes for float/double.
+ */
 #define MAX_STEP_SIZE 2097152
 
 static NPY_INLINE npy_uintp
@@ -57,7 +71,7 @@ abs_ptrdiff(char *a, char *b)
  * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
  *    in which case we need two i64gather instructions and an additional vinsertf32x8
  *    instruction to load a single zmm register (since one i64gather instruction
- *    loads into a ymm register). This hurts performance significantly.
+ *    loads into a ymm register). This is not ideal for performance.
  * 2) Gather and scatter instructions can be slow when the loads/stores
  *    cross page boundaries.
  *
@@ -74,11 +88,10 @@ abs_ptrdiff(char *a, char *b)
  * Input step should be smaller than MAX_STEP_SIZE for performance
  */
 #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
-    (steps[1] == (esize) && \
-            (abs(steps[0]) < MAX_STEP_SIZE) && \
-     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
-     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
-      ((abs_ptrdiff(args[1], args[0]) == 0))))
+    (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
+     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
+     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
+      ((abs_ptrdiff(args[1], args[0]) == 0))))
 
 #define IS_BLOCKABLE_REDUCE(esize, vsize) \
     (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
@@ -1757,8 +1770,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
 
     @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
 
-    npy_int index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
-    for (npy_int ii = 0; ii < @num_lanes@; ii++) {
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
         index_ip1[ii] = ii*stride_ip1;
         index_ip2[ii] = ii*stride_ip2;
         index_op[ii] = ii*stride_op;
@@ -1849,8 +1868,15 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 #if @replace_0_with_1@
     @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
 #endif
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@@ -1928,8 +1954,14 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
 #endif
     @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
-    npy_int indexarr[8];
-    for (npy_int ii = 0; ii < 8; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
@@ -2047,8 +2079,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@@ -2163,8 +2201,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_float xmax = 88.72283935546875f;
    npy_float xmin = -103.97208404541015625f;
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
 
@@ -2287,8 +2331,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 {
     const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
     const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
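
As a sanity check on the arithmetic in the new MAX_STEP_SIZE comment, here is a small standalone C sketch. It is not part of the patch: it uses int32_t and INT32_MAX in place of npy_int32 and NPY_MAX_INT32, and the stride value is an arbitrary example. It verifies that MAX_STEP_SIZE*16/esize stays below INT32_MAX for float (esize = 4) and double (esize = 8), and mirrors the npy_int32 index-array pattern the patch switches to.

```c
/*
 * Standalone sketch, NOT NumPy code: int32_t/INT32_MAX stand in for
 * npy_int32/NPY_MAX_INT32, and the stride below is an arbitrary example.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_STEP_SIZE 2097152   /* 2MB cap, same value as in simd.inc.src */

int main(void)
{
    /* esize = element size: 4 bytes (float) or 8 bytes (double) */
    const int64_t esizes[2] = {4, 8};
    for (int k = 0; k < 2; k++) {
        const int64_t esize = esizes[k];
        /*
         * The index array holds ii*stride for ii < num_lanes (<= 16), and the
         * step (in bytes) is capped at MAX_STEP_SIZE, so stride < MAX_STEP_SIZE/esize
         * and the largest offset is bounded by MAX_STEP_SIZE*16/esize.
         */
        const int64_t max_offset = (int64_t)MAX_STEP_SIZE * 16 / esize;
        printf("esize=%lld: max gather offset bound %lld < INT32_MAX (%ld)\n",
               (long long)esize, (long long)max_offset, (long)INT32_MAX);
        assert(max_offset < INT32_MAX);  /* i32 gather/scatter indices are safe */
    }

    /* Index-array pattern the patch switches to (npy_int32 -> int32_t here),
     * i.e. the offsets that feed intrinsics such as _mm512_i32gather_ps. */
    const int32_t num_lanes = 16;
    const int32_t stride = 7;            /* example stride, in elements */
    int32_t indexarr[16];
    for (int32_t ii = 0; ii < num_lanes; ii++) {
        indexarr[ii] = ii * stride;
    }
    printf("indexarr[15] = %d\n", indexarr[15]);
    return 0;
}
```

With the 2MB cap, the bound works out to 8388608 element offsets for floats and 4194304 for doubles, comfortably inside the int32 range, which is why the i32 gather/scatter variants can be used once IS_OUTPUT_BLOCKABLE_UNARY or IS_BINARY_SMALL_STEPS holds.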