summary refs log tree commit diff
path: root/numpy
diff options
context:
space:
mode:
author qiyu8 <qiyu8@foxmail.com> 2020-06-05 15:50:23 +0800
committer qiyu8 <qiyu8@foxmail.com> 2020-06-05 15:50:23 +0800
commit 02066f6215da9b0408cd6650506ec5bf6a6d2f8c (patch)
tree 78061f65df23d18fca3740439bc8aaa16deffd32 /numpy
parent 98170965e63ccfcc83be2f91e4b3d26397949f70 (diff)
download numpy-02066f6215da9b0408cd6650506ec5bf6a6d2f8c.tar.gz
simd optimize contig_stride0_outcontig_two
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/multiarray/einsum.c.src 46
1 files changed, 40 insertions, 6 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 198191db5..160ecf673 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -277,7 +277,7 @@ static void
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b;
#elif EINSUM_USE_SSE2 && @float64@
-__m128d a, b;
+ __m128d a, b;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
@@ -321,11 +321,6 @@ finish_after_unrolled_loop:
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
#elif EINSUM_USE_SSE2 && @float64@
/* Use aligned instructions if possible */
if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
@@ -349,6 +344,12 @@ finish_after_unrolled_loop:
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
@@ -523,6 +524,8 @@ static void
#if EINSUM_USE_SSE1 && @float32@
__m128 a, b, value1_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value1_sse;
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
@@ -566,6 +569,29 @@ finish_after_unrolled_loop:
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
+#elif EINSUM_USE_SSE2 && @float64@
+ value1_sse = _mm_set1_pd(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
#endif
/* Unroll the loop by 8 */
@@ -580,6 +606,14 @@ finish_after_unrolled_loop:
b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
_mm_storeu_ps(data_out+@i@, b);
/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#