summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorQiyu8 <fangchunlin@huawei.com>2020-12-14 10:16:58 +0800
committerQiyu8 <fangchunlin@huawei.com>2020-12-14 10:16:58 +0800
commitc32f60e38376e438e1d357d03c5699f4fe9c6649 (patch)
tree089d3b8e97c099674a2e8c629b9e63b84a6ba270
parent9e26d1d2be7a961a16f8fa9ff7820c33b25415e2 (diff)
downloadnumpy-c32f60e38376e438e1d357d03c5699f4fe9c6649.tar.gz
Optimize the performance of einsum's submodule dot
-rw-r--r--numpy/core/src/common/simd/simd.h6
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.c.src184
2 files changed, 49 insertions, 141 deletions
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 8804223c9..7705a48ce 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -10,7 +10,11 @@
#include "numpy/npy_common.h"
#include "npy_cpu_dispatch.h"
#include "simd_utils.h"
-
+#ifndef NPY_HAVE_AVX2
+ #include <immintrin.h>
+ #define NPY_HAVE_AVX
+ #define NPY_HAVE_AVX2
+#endif
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index caba0e00a..2ef0ab13b 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -597,156 +597,60 @@ static void
@type@ *data1 = (@type@ *)dataptr[1];
@temptype@ accum = 0;
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
(int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
+#if @NPYV_CHK@ // NPYV check for @type@
/* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
+ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+ const int vstep = npyv_nlanes_@sfx@;
+ npyv_@sfx@ vaccum = npyv_zero_@sfx@();
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
+ /**begin repeat2
+ * #cond = if(is_aligned), else#
+ * #ld = loada, load#
+ * #st = storea, store#
+ */
+ @cond@ {
+ const npy_intp vstepx4 = vstep * 4;
+ for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
*/
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
+ npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+ npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+ /**end repeat3**/
+ npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum);
+ npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3);
+ npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2);
+ vaccum = npyv_muladd_@sfx@(a0, b0, ab1);
}
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
}
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+ /**end repeat2**/
+ for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+ vaccum = npyv_muladd_@sfx@(a, b, vaccum);
}
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
+ accum = npyv_sum_@sfx@(vaccum);
+ npyv_cleanup();
#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
+#ifndef NPY_DISABLE_OPTIMIZATION
+ for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+ /**end repeat2**/
+ accum += ab0 + ab1 + ab2 + ab3;
}
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+#endif // !NPY_DISABLE_OPTIMIZATION
+ for (; count > 0; --count, ++data0, ++data1) {
+ const @type@ a = @from@(*data0);
+ const @type@ b = @from@(*data1);
+ accum += a * b;
+ }
+#endif // NPYV check for @type@
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
}
static void