Optimize handling of the remaining (tail) elements using npyv_load_tillz and npyv_store_till

author: Qiyu8 <fangchunlin@huawei.com> 2020-11-19 11:04:30 +0800
committer: Qiyu8 <fangchunlin@huawei.com> 2020-11-19 11:04:30 +0800
commit: 95d6052902fc4763cbceee51ec08a3fff3dc6b1f (patch)
tree: 80aee76f45cae3d88347df136274ae8dadfe6b83
parent: 594dd5d97ec9989f19de96f064930a955478b9a4 (diff)
download: numpy-95d6052902fc4763cbceee51ec08a3fff3dc6b1f.tar.gz
1 files changed, 8 insertions, 19 deletions
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index c9ab71e28..efe9a59db 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -49,21 +49,6 @@
     #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
 #endif
 
-/**
- * This macro is used to enable a scalar loop which advances 4 elements at a
- * time, which appears after a main SIMD loop gated by `CHK` that unrolls by
- * `NPY_SIMD_WIDTH * unroll_by` elements, and before a non-unrolled scalar loop
- * that finishes up all the remaining scalars. The purpose of the unrolled loop
- * is to enable auto-vectorization in cases when all of the following are true:
- *
- *  - optimization is allowed
- *  - either:
- *    - we did not run the SIMD loop at all, due to NPV being disabled.
- *    - the SIMD loop was larger than 128bit, so there are likely to be many
- *      elements left to process.
- */
-#define EINSUM_UNROLL_4_SCALARS(CHK) (!defined(NPY_DISABLE_OPTIMIZATION) && (!(CHK) || NPY_SIMD > 128))
-
 /**********************************************/
 
 /**begin repeat
@@ -318,10 +303,14 @@ static void
         }
     }
     /**end repeat2**/
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+        npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+        npyv_@sfx@ c = npyv_load_tillz_@sfx@(data_out, count);
+        npyv_store_till_@sfx@(data_out, count, npyv_muladd_@sfx@(a, b, c));
+    }
     npyv_cleanup();
-#endif // NPYV check for @type@
-
-#if EINSUM_UNROLL_4_SCALARS(@NPYV_CHK@)
+#else
     for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -341,7 +330,7 @@ static void
         data_out[@i@] = @to@(abc@i@);
         /**end repeat2**/
     }
-#endif
+#endif // NPYV check for @type@
     for (; count > 0; --count, ++data0, ++data1, ++data_out) {
         const @type@ a = @from@(*data0);
         const @type@ b = @from@(*data1);
author	Qiyu8 <fangchunlin@huawei.com>	2020-11-19 11:04:30 +0800
committer	Qiyu8 <fangchunlin@huawei.com>	2020-11-19 11:04:30 +0800
commit	95d6052902fc4763cbceee51ec08a3fff3dc6b1f (patch)
tree	80aee76f45cae3d88347df136274ae8dadfe6b83
parent	594dd5d97ec9989f19de96f064930a955478b9a4 (diff)
download	numpy-95d6052902fc4763cbceee51ec08a3fff3dc6b1f.tar.gz