summary | refs | log | tree | commit | diff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/simd.inc.src | 41
1 files changed, 37 insertions, 4 deletions
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8db0f6ee6..61321445f 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -55,6 +55,37 @@ abs_ptrdiff(char *a, char *b)
return (a > b) ? (a - b) : (b - a);
}
+/*
+ * nomemoverlap - returns true if the two strided arrays do NOT share any
+ * region in memory, false if they may overlap (touching ranges count as overlap).
+ * ip_size/op_size = signed byte extent of each array; negative means negative steps.
+ */
+static NPY_INLINE npy_bool
+nomemoverlap(char *ip,
+ npy_intp ip_size,
+ char *op,
+ npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end; /* low/high bounds of each range, built from pointer + signed size only */
+ if (ip_size < 0) { /* negative extent: ip is the high end of the input range */
+ ip_start = ip + ip_size;
+ ip_end = ip; /* NOTE(review): no element width is added to the high end - confirm callers do not need it */
+ }
+ else { /* non-negative extent: ip is the low end of the input range */
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) { /* same normalisation for the output range */
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start > op_end) | (op_start > ip_end); /* disjoint only if one range lies strictly above the other */
+}
+
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
@@ -85,10 +116,12 @@ abs_ptrdiff(char *a, char *b)
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
*/
-#define IS_BINARY_SMALL_STEPS \
+#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP /* all three steps below MAX_STEP_SIZE in magnitude, and neither args[0] nor args[1] range overlaps the args[2] range */ \
((abs(steps[0]) < MAX_STEP_SIZE) && \
(abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE))
+ (abs(steps[2]) < MAX_STEP_SIZE) && /* NOTE(review): abs() takes int; if npy_intp is wider than int the step is truncated before the compare - confirm */ \
+ (nomemoverlap(args[0], steps[0]*dimensions[0], args[2], steps[2]*dimensions[0])) && /* args[0] range vs. args[2] range */ \
+ (nomemoverlap(args[1], steps[1]*dimensions[0], args[2], steps[2]*dimensions[0]))) /* args[1] range vs. args[2] range */
/*
* output should be contiguous, can handle strided input data
@@ -252,7 +285,7 @@ static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
- if (IS_BINARY_SMALL_STEPS) {
+ if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
AVX512F_@func@_@TYPE@(args, dimensions, steps);
return 1;
}
@@ -1942,7 +1975,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
/*
* Note: while generally indices are npy_intp, we ensure that our maximum index
* will fit in an int32 as a precondition for this function via
- * IS_BINARY_SMALL_STEPS
+ * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
*/
npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];