summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> 2022-09-26 09:54:30 -0700
committer Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> 2022-09-26 09:59:46 -0700
commit 7dfcd39f53e0fad9aa083f64112caa26c4406d7f (patch)
tree a08a393f8e871b065d027a4888ba580ae9c37920
parent a13006aca6079fdf128556aab854833c82bae796 (diff)
download numpy-7dfcd39f53e0fad9aa083f64112caa26c4406d7f.tar.gz
BUG: Add memoverlap check
-rw-r--r-- numpy/core/src/umath/loops_umath_fp.dispatch.c.src 13
1 files changed, 6 insertions, 7 deletions
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 1865c6b88..46ce51824 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -82,7 +82,7 @@ simd_@func@_f64(const double *src, npy_intp ssrc,
/**begin repeat1
* #func = pow, atan2#
*/
-
+
static void
simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
const npyv_lanetype_@sfx@ *src2, npy_intp ssrc2,
@@ -96,14 +96,14 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
} else {
x1 = npyv_loadn_till_@sfx@(src1, ssrc1, len, 1);
}
-
+
npyv_@sfx@ x2;
if (ssrc2 == 1) {
x2 = npyv_load_till_@sfx@(src2, len, 1);
} else {
x2 = npyv_loadn_till_@sfx@(src2, ssrc2, len, 1);
}
-
+
npyv_@sfx@ out = __svml_@func@@func_suffix@(x1, x2);
if (sdst == 1) {
npyv_store_till_@sfx@(dst, len, out);
@@ -115,9 +115,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src1, npy_intp ssrc1,
/**end repeat1**/
/**end repeat**/
-#endif
-
-#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
typedef __m256i npyvh_f16;
#define npyv_cvt_f16_f32 _mm512_cvtph_ps
#define npyv_cvt_f32_f16 _mm512_cvtps_ph
@@ -182,7 +179,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_@func@)
const npy_intp ssrc = steps[0] / lsize;
const npy_intp sdst = steps[1] / lsize;
const npy_intp len = dimensions[0];
- if ((ssrc == 1) && (sdst == 1)) {
+ if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+ (ssrc == 1) &&
+ (sdst == 1)) {
avx512_@intrin@_f16(src, dst, len);
return;
}