diff options
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 15 |
1 files changed, 9 insertions, 6 deletions
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 137fdaa71..4265476b5 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -114,22 +114,25 @@ nomemoverlap(char *ip, * cross page boundaries. * * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index - * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this. + * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE + * ensures this. The condition also requires that the input and output arrays + * should have no overlap in memory. */ #define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ ((abs(steps[0]) < MAX_STEP_SIZE) && \ (abs(steps[1]) < MAX_STEP_SIZE) && \ (abs(steps[2]) < MAX_STEP_SIZE) && \ - (nomemoverlap(args[0], steps[0]*dimensions[0], args[2], steps[2]*dimensions[0])) && \ - (nomemoverlap(args[1], steps[1]*dimensions[0], args[2], steps[2]*dimensions[0]))) + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) /* - * output should be contiguous, can handle strided input data - * Input step should be smaller than MAX_STEP_SIZE for performance + * 1) Output should be contiguous, can handle strided input data + * 2) Input step should be smaller than MAX_STEP_SIZE for performance + * 3) Input and output arrays should have no overlap in memory */ #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \ (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \ - (nomemoverlap(args[1], steps[1]*dimensions[0], args[0], steps[0]*dimensions[0]))) + (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) #define IS_BLOCKABLE_REDUCE(esize, vsize) \ (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ |