summary refs log tree commit diff
diff options
context:
space:
mode:
author qiyu8 <qiyu8@foxmail.com> 2020-06-05 15:23:52 +0800
committer qiyu8 <qiyu8@foxmail.com> 2020-06-05 15:23:52 +0800
commit 98170965e63ccfcc83be2f91e4b3d26397949f70 (patch)
tree 00d63bc014f5f63e99809645640ddfd2a2f29f7a
parent a5d021a1b6f439a19812926bc4d796ef5f346c44 (diff)
download numpy-98170965e63ccfcc83be2f91e4b3d26397949f70.tar.gz
optimize using sse2
-rw-r--r-- numpy/core/src/multiarray/einsum.c.src 32
1 files changed, 32 insertions, 0 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index b914e5bb3..198191db5 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -276,6 +276,8 @@ static void
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b;
+#elif EINSUM_USE_SSE2 && @float64@
+__m128d a, b;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
@@ -324,7 +326,29 @@ finish_after_unrolled_loop:
/* Unroll the loop by 8 */
while (count >= 8) {
count -= 8;
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+ EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
@@ -333,6 +357,14 @@ finish_after_unrolled_loop:
b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
_mm_storeu_ps(data_out+@i@, b);
/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#