summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Qiyu8 <fangchunlin@huawei.com> 2020-11-19 11:04:30 +0800
committer Qiyu8 <fangchunlin@huawei.com> 2020-11-19 11:04:30 +0800
commit 95d6052902fc4763cbceee51ec08a3fff3dc6b1f (patch)
tree 80aee76f45cae3d88347df136274ae8dadfe6b83
parent 594dd5d97ec9989f19de96f064930a955478b9a4 (diff)
download numpy-95d6052902fc4763cbceee51ec08a3fff3dc6b1f.tar.gz
optimize the remaining elements using npyv_load_tillz
-rw-r--r-- numpy/core/src/multiarray/einsum_sumprod.c.src 27
1 files changed, 8 insertions, 19 deletions
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index c9ab71e28..efe9a59db 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -49,21 +49,6 @@
#define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
#endif
-/**
- * This macro is used to enable a scalar loop which advances 4 elements at a
- * time, which appears after a main SIMD loop gated by `CHK` that unrolls by
- * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop
- * that finishes up all the remaining scalars. The purpose of the unrolled loop
- * is to enable auto-vectorization in cases when all of the following are true:
- *
- * - optimization is allowed
- * - either:
- * - we did not run the SIMD loop at all, due to NPV being disabled.
- * - the SIMD loop was larger than 128bit, so there are likely to be many
- * elements left to process.
- */
-#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
-
/**********************************************/
/**begin repeat
@@ -318,10 +303,14 @@ static void
}
}
/**end repeat2**/
+ for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+ npyv_@sfx@ c = npyv_load_tillz_@sfx@(data_out, count);
+ npyv_store_till_@sfx@(data_out, count, npyv_muladd_@sfx@(a, b, c));
+ }
npyv_cleanup();
-#endif // NPYV check for @type@
-
-#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+#else
for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
/**begin repeat2
* #i = 0, 1, 2, 3#
@@ -341,7 +330,7 @@ static void
data_out[@i@] = @to@(abc@i@);
/**end repeat2**/
}
-#endif
+#endif // NPYV check for @type@
for (; count > 0; --count, ++data0, ++data1, ++data_out) {
const @type@ a = @from@(*data0);
const @type@ b = @from@(*data1);