summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorQiyu8 <fangchunlin@huawei.com>2020-12-17 14:53:43 +0800
committerQiyu8 <fangchunlin@huawei.com>2020-12-17 14:53:43 +0800
commitb80411d2eaf6eda8075764359ea32b59fcc6c468 (patch)
tree93c554c5c3f92bccfbb9a2de5f89da796f1aead7
parentd7a75e8e8fefc433cf6e5305807d5f3180954273 (diff)
downloadnumpy-b80411d2eaf6eda8075764359ea32b59fcc6c468.tar.gz
Optimize the performance of einsum's submodule sum.
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.c.src504
1 files changed, 78 insertions, 426 deletions
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 86d5b82fc..03d2d614c 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -94,6 +94,58 @@
* 0*3#
*/
+#if !@complex@
+static @temptype@ @name@_sum_of_arr(@type@ **data, npy_intp count)
+{
+ @temptype@ accum = 0;
+#if @NPYV_CHK@ // NPYV check for @type@
+ /* Use aligned instructions if possible */
+ const int is_aligned = EINSUM_IS_ALIGNED(*data);
+ const int vstep = npyv_nlanes_@sfx@;
+ npyv_@sfx@ vaccum = npyv_zero_@sfx@();
+ const npy_intp vstepx4 = vstep * 4;
+
+ /**begin repeat1
+ * #cond = if(is_aligned), else#
+ * #ld = loada, load#
+ * #st = storea, store#
+ */
+ @cond@ {
+ for (; count >= vstepx4; count -= vstepx4, *data += vstepx4) {
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(*data + vstep * @i@);
+ /**end repeat2**/
+ npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1);
+ npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3);
+ npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23);
+ vaccum = npyv_add_@sfx@(a0123, vaccum);
+ }
+ }
+ /**end repeat1**/
+ for (; count > 0; count -= vstep, *data += vstep) {
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@(*data, count);
+ vaccum = npyv_add_@sfx@(a, vaccum);
+ }
+ accum = npyv_sum_@sfx@(vaccum);
+ npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+ for (; count > 4; count -= 4, *data += 4) {
+ const @temptype@ a01 = @from@(**data) + @from@(*(*data + 1));
+ const @temptype@ a23 = @from@(*(*data + 2)) + @from@(*(*data + 3));
+ accum += a01 + a23;
+ }
+#endif // !NPY_DISABLE_OPTIMIZATION
+ for (; count > 0; --count, *data += 1) {
+ accum += @from@(**data);
+ }
+#endif // NPYV check for @type@
+ return accum;
+}
+#endif
+
/**begin repeat1
* #nop = 1, 2, 3, 1000#
* #noplabel = one, two, three, any#
@@ -657,139 +709,10 @@ static void
@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
@type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @temptype@ accum = @name@_sum_of_arr(&data1, count);
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
}
static void
@@ -798,135 +721,8 @@ static void
{
@type@ *data0 = (@type@ *)dataptr[0];
@temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+ @temptype@ accum = @name@_sum_of_arr(&data0, count);
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value1 * accum);
}
#elif @nop@ == 3 && !@complex@
@@ -1032,175 +828,31 @@ static void
@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
npy_intp const *strides, npy_intp count)
{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
- @temptype@ *data0 = (@temptype@ *)dataptr[0];
-#else
- @temptype@ accum = 0;
- @type@ *data0 = (@type@ *)dataptr[0];
-#endif
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !@complex@
- accum += @from@(data0[@i@]);
-#else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
- case 0:
-#if @complex@
- ((@temptype@ *)dataptr[1])[0] += accum_re;
- ((@temptype@ *)dataptr[1])[1] += accum_im;
+ @type@ *data = (@type@ *)dataptr[0];
+ @temptype@ accum = @name@_sum_of_arr(&data, count);
+ *((@type@ *)dataptr[1]) = @to@(accum + @from@(*((@type@ *)dataptr[1])));
#else
- *((@type@ *)dataptr[1]) = @to@(accum +
- @from@(*((@type@ *)dataptr[1])));
-#endif
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+ @temptype@ accum_re = 0, accum_im = 0;
+ @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+ for (; count > 4; count -= 4, data0 += 4*2) {
+ const @temptype@ re01 = data0[0] + data0[2];
+ const @temptype@ re23 = data0[4] + data0[6];
+ const @temptype@ im13 = data0[1] + data0[3];
+ const @temptype@ im57 = data0[5] + data0[7];
+ accum_re += re01 + re23;
+ accum_im += im13 + im57;
}
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-# if !@complex@
- accum += @from@(data0[@i@]);
-# else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-# endif
-/**end repeat2**/
-#endif
-
-#if !@complex@
- data0 += 8;
-#else
- data0 += 8*2;
-#endif
+#endif // !NPY_DISABLE_OPTIMIZATION
+ for (; count > 0; --count, data0 += 2) {
+ accum_re += data0[0];
+ accum_im += data0[1];
}
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
+ ((@temptype@ *)dataptr[1])[0] += accum_re;
+ ((@temptype@ *)dataptr[1])[1] += accum_im;
+#endif // !@complex@
}
#endif /* @nop@ == 1 */