diff options
author | Matti Picus <matti.picus@gmail.com> | 2020-06-15 14:21:14 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-15 14:21:14 +0300 |
commit | 56315089b048160d7c013f257ebf7ea9fea70d91 (patch) | |
tree | 2436c8d6a84ba5e30e480f4ae8e07a4009049868 | |
parent | f3ff68e5fe874f9626fcbe85000549083f4f222a (diff) | |
parent | 2f93818563c810d366ecd115af42efa3f5705bf2 (diff) | |
download | numpy-56315089b048160d7c013f257ebf7ea9fea70d91.tar.gz |
Merge pull request #16540 from Qiyu8/enisum_sse2
SIMD: SSE2 intrinsic implementation for float64 input of np.enisum
-rw-r--r-- | benchmarks/benchmarks/bench_linalg.py | 26 | ||||
-rw-r--r-- | numpy/core/src/multiarray/einsum.c.src | 159 |
2 files changed, 176 insertions, 9 deletions
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py index 3abbe3670..dc2849d58 100644 --- a/benchmarks/benchmarks/bench_linalg.py +++ b/benchmarks/benchmarks/bench_linalg.py @@ -105,3 +105,29 @@ class Lstsq(Benchmark): def time_numpy_linalg_lstsq_a__b_float64(self): np.linalg.lstsq(self.a, self.b, rcond=-1) + +class Einsum(Benchmark): + param_names = ['dtype'] + params = [[np.float64]] + def setup(self, dtype): + self.a = np.arange(2900, dtype=dtype) + self.b = np.arange(3000, dtype=dtype) + self.c = np.arange(24000, dtype=dtype).reshape(20, 30, 40) + self.c1 = np.arange(1200, dtype=dtype).reshape(30, 40) + self.d = np.arange(10000, dtype=dtype).reshape(10,100,10) + + #outer(a,b): trigger sum_of_products_contig_stride0_outcontig_two + def time_einsum_outer(self, dtype): + np.einsum("i,j", self.a, self.b, optimize=True) + + # multiply(a, b):trigger sum_of_products_contig_two + def time_einsum_multiply(self, dtype): + np.einsum("..., ...", self.c1, self.c , optimize=True) + + # sum and multiply:trigger sum_of_products_contig_stride0_outstride0_two + def time_einsum_sum_mul(self, dtype): + np.einsum(",i...->", 300, self.d, optimize=True) + + # sum and multiply:trigger sum_of_products_stride0_contig_outstride0_two + def time_einsum_sum_mul2(self, dtype): + np.einsum("i...,->", self.d, 300, optimize=True)
\ No newline at end of file diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index b914e5bb3..2538e05c6 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -31,9 +31,6 @@ #define EINSUM_USE_SSE1 0 #endif -/* - * TODO: Only some SSE2 for float64 is implemented. - */ #ifdef NPY_HAVE_SSE2_INTRINSICS #define EINSUM_USE_SSE2 1 #else @@ -276,6 +273,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n", @@ -319,6 +318,29 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) && + EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data1 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } #endif /* Unroll the loop by 8 */ @@ -333,6 +355,14 @@ finish_after_unrolled_loop: b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); _mm_storeu_ps(data_out+@i@, b); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@)); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -491,6 +521,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, b, value1_sse; +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, b, value1_sse; #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n", @@ -534,6 +566,29 @@ finish_after_unrolled_loop: /* Finish off the loop */ goto finish_after_unrolled_loop; } +#elif EINSUM_USE_SSE2 && @float64@ + value1_sse = _mm_set1_pd(value1); + + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_load_pd(data_out+@i@)); + _mm_store_pd(data_out+@i@, b); +/**end repeat2**/ + data0 += 8; + data_out += 8; + } + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } #endif /* Unroll the loop by 8 */ @@ -548,6 +603,14 @@ finish_after_unrolled_loop: b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@)); _mm_storeu_ps(data_out+@i@, b); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse); + b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@)); + _mm_storeu_pd(data_out+@i@, b); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -735,6 +798,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n", @@ -772,15 +837,38 @@ finish_after_unrolled_loop: /**end repeat2**/ data1 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data1)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; + +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@)); +/**end repeat2**/ + data1 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; @@ -801,6 +889,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -818,6 +916,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ @@ -834,6 +937,8 @@ static void #if EINSUM_USE_SSE1 && @float32@ __m128 a, accum_sse = _mm_setzero_ps(); +#elif EINSUM_USE_SSE2 && @float64@ + __m128d a, accum_sse = _mm_setzero_pd(); #endif NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n", @@ -871,16 +976,37 @@ finish_after_unrolled_loop: /**end repeat2**/ data0 += 8; } - -#if EINSUM_USE_SSE1 && @float32@ /* Add the four SSE values and put in accum */ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1)); accum_sse = _mm_add_ps(a, accum_sse); a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); -#endif + /* Finish off the loop */ + goto finish_after_unrolled_loop; + } +#elif EINSUM_USE_SSE2 && @float64@ + /* Use aligned instructions if possible */ + if (EINSUM_IS_SSE_ALIGNED(data0)) { + /* Unroll the loop by 8 */ + while (count >= 8) { + count -= 8; +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@)); +/**end repeat2**/ + data0 += 8; + } + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); /* Finish off the loop */ goto finish_after_unrolled_loop; } @@ -900,6 +1026,16 @@ finish_after_unrolled_loop: */ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@)); /**end repeat2**/ +#elif EINSUM_USE_SSE2 && @float64@ +/**begin repeat2 + * #i = 0, 2, 4, 6# + */ + /* + * NOTE: This accumulation changes the order, so will likely + * produce slightly different results. + */ + accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@)); +/**end repeat2**/ #else /**begin repeat2 * #i = 0, 1, 2, 3, 4, 5, 6, 7# @@ -917,6 +1053,11 @@ finish_after_unrolled_loop: a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2)); accum_sse = _mm_add_ps(a, accum_sse); _mm_store_ss(&accum, accum_sse); +#elif EINSUM_USE_SSE2 && @float64@ + /* Add the two SSE2 values and put in accum */ + a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1)); + accum_sse = _mm_add_pd(a, accum_sse); + _mm_store_sd(&accum, accum_sse); #endif /* Finish off the loop */ |