diff options
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/multiarray/einsum.c.src | 46 |
1 files changed, 40 insertions, 6 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 198191db5..160ecf673 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -277,7 +277,7 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b; #elif EINSUM_USE_SSE2 && @float64@ -__m128d a, b; + __m128d a, b; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", @@ -321,11 +321,6 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } -#endif - - /* Unroll the loop by 8 */ - while (count >= 8) { - count -= 8; #elif EINSUM_USE_SSE2 && @float64@ /* Use aligned instructions if possible */ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && @@ -349,6 +344,12 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#endif + + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + #if EINSUM_USE_SSE1 && @float32@ /**begin repeat2 * #i = 0, 4# @@ -523,6 +524,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b, value1_sse; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b, value1_sse; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", @@ -566,6 +569,29 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#elif EINSUM_USE_SSE2 && @float64@ + value1_sse = _mm_set1_pd(value1); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } #endif /* Unroll the loop by 8 */ @@ -580,6 +606,14 @@ finish_after_unrolled_loop: b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); _mm_storeu_ps(data_out+@i@, b); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# |
