diff options
Diffstat (limited to 'numpy/core')
| -rw-r--r-- | numpy/core/src/multiarray/einsum.c.src | 159 |
1 files changed, 150 insertions, 9 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index b914e5bb3..2538e05c6 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -31,9 +31,6 @@ #define EINSUM_USE_SSE1 0 #endif -/* - * TODO: Only some SSE2 for float64 is implemented. - */ #ifdef NPY_HAVE_SSE2_INTRINSICS #define EINSUM_USE_SSE2 1 #else @@ -276,6 +273,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", @@ -319,6 +318,29 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && + EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } #endif /* Unroll the loop by 8 */ @@ -333,6 +355,14 @@ finish_after_unrolled_loop: b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); _mm_storeu_ps(data_out+@i@, b); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -491,6 +521,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b, value1_sse; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b, value1_sse; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", @@ -534,6 +566,29 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#elif EINSUM_USE_SSE2 && @float64@ + value1_sse = _mm_set1_pd(value1); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } #endif /* Unroll the loop by 8 */ @@ -548,6 +603,14 @@ finish_after_unrolled_loop: b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); _mm_storeu_ps(data_out+@i@, b); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -735,6 +798,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", @@ -772,15 +837,38 @@ finish_after_unrolled_loop: /**end repeat2**/ data1 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); +/**end repeat2**/ + data1 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; @@ -801,6 +889,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -818,6 +916,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ @@ -834,6 +937,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", @@ -871,16 +976,37 @@ finish_after_unrolled_loop: /**end repeat2**/ data0 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; } @@ -900,6 +1026,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -917,6 +1053,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ |
