summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r--numpy/core/src/multiarray/einsum.c.src95
1 files changed, 86 insertions, 9 deletions
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 1eca7d751..5a5dd7236 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -546,7 +546,7 @@ finish_after_unrolled_loop:
return;
}
-#if EINSUM_USE_SSE3 && @float32@
+#if EINSUM_USE_SSE1 && @float32@
value1_sse = _mm_set_ps1(value1);
/* Use aligned instructions if possible */
@@ -570,7 +570,6 @@ finish_after_unrolled_loop:
goto finish_after_unrolled_loop;
}
#elif EINSUM_USE_SSE2 && @float64@
- printf("using sse2\n");
value1_sse = _mm_set1_pd(value1);
/* Use aligned instructions if possible */
@@ -599,7 +598,7 @@ finish_after_unrolled_loop:
while (count >= 8) {
count -= 8;
-#if EINSUM_USE_SSE3 && @float32@
+#if EINSUM_USE_SSE1 && @float32@
/**begin repeat2
* #i = 0, 4#
*/
@@ -802,6 +801,8 @@ static void
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
@@ -839,15 +840,38 @@ finish_after_unrolled_loop:
/**end repeat2**/
data1 += 8;
}
-
-#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
-#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
@@ -868,6 +892,16 @@ finish_after_unrolled_loop:
*/
accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
+/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -885,6 +919,11 @@ finish_after_unrolled_loop:
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
#endif
/* Finish off the loop */
@@ -901,6 +940,8 @@ static void
#if EINSUM_USE_SSE1 && @float32@
__m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
#endif
NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
@@ -938,16 +979,37 @@ finish_after_unrolled_loop:
/**end repeat2**/
data0 += 8;
}
-
-#if EINSUM_USE_SSE1 && @float32@
/* Add the four SSE values and put in accum */
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
accum_sse = _mm_add_ps(a, accum_sse);
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
-#endif
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
/* Finish off the loop */
goto finish_after_unrolled_loop;
}
@@ -967,6 +1029,16 @@ finish_after_unrolled_loop:
*/
accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
#else
/**begin repeat2
* #i = 0, 1, 2, 3, 4, 5, 6, 7#
@@ -984,6 +1056,11 @@ finish_after_unrolled_loop:
a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
accum_sse = _mm_add_ps(a, accum_sse);
_mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
#endif
/* Finish off the loop */