diff options
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/multiarray/einsum.c.src | 95 |
1 file changed, 86 insertions, 9 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 1eca7d751..5a5dd7236 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -546,7 +546,7 @@ finish_after_unrolled_loop: return; } -#if EINSUM_USE_SSE3 && @float32@ +#if EINSUM_USE_SSE1 && @float32@ value1_sse = _mm_set_ps1(value1); /* Use aligned instructions if possible */ @@ -570,7 +570,6 @@ finish_after_unrolled_loop: goto finish_after_unrolled_loop; } #elif EINSUM_USE_SSE2 && @float64@ - printf("using sse2\n"); value1_sse = _mm_set1_pd(value1); /* Use aligned instructions if possible */ @@ -599,7 +598,7 @@ finish_after_unrolled_loop: while (count >= 8) { count -= 8; -#if EINSUM_USE_SSE3 && @float32@ +#if EINSUM_USE_SSE1 && @float32@ /**begin repeat2 * #i = 0, 4# */ @@ -802,6 +801,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", @@ -839,15 +840,38 @@ finish_after_unrolled_loop: /**end repeat2**/ data1 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. 
+ */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); +/**end repeat2**/ + data1 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; @@ -868,6 +892,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -885,6 +919,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ @@ -901,6 +940,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", @@ -938,16 +979,37 @@ finish_after_unrolled_loop: /**end repeat2**/ data0 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } 
+#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; } @@ -967,6 +1029,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -984,6 +1056,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ |
