Diffstat (limited to 'numpy/core/src'):

 numpy/core/src/_simd/_simd.dispatch.c.src      | 10 +
 numpy/core/src/common/simd/avx2/arithmetic.h   | 23 ++
 numpy/core/src/common/simd/avx512/arithmetic.h | 43 ++++
 numpy/core/src/common/simd/neon/arithmetic.h   | 13 +
 numpy/core/src/common/simd/sse/arithmetic.h    | 27 ++
 numpy/core/src/common/simd/vsx/arithmetic.h    | 12 +
 6 files changed, 128 insertions(+), 0 deletions(-)
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 2d89b9df0..3d7af2333 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -20,6 +20,7 @@
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -351,6 +352,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif // fused_sup
+#if @sum_sup@
+SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
+#endif // sum_sup
+
#endif // simd_sup
/**end repeat**/
/***************************
@@ -370,6 +375,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -484,6 +490,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // fused_sup
+#if @sum_sup@
+SIMD_INTRIN_DEF(sum_@sfx@)
+#endif // sum_sup
+
#endif // simd_sup
/**end repeat**/
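
The template above only wires the new intrinsic into the _simd testing module; the semantics it has to implement are simply an across-lane sum. As a scalar reference model (illustrative only, not part of the patch; the helper name is hypothetical):

    // Reference semantics for npyv_sum_f32/npyv_sum_f64: the arithmetic sum
    // of all vector lanes. Any lane-pairing order is acceptable up to
    // floating-point rounding.
    static float sum_f32_ref(const float *lanes, int nlanes)
    {
        float acc = 0.0f;
        for (int i = 0; i < nlanes; ++i) {
            acc += lanes[i];
        }
        return acc;
    }
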
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 4af9e4d17..3a6dc9535 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -116,4 +116,27 @@
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+    __m256 sum_halves = _mm256_hadd_ps(a, a);            // pairwise sums within each 128-bit lane
+    sum_halves = _mm256_hadd_ps(sum_halves, sum_halves); // each lane now holds its 4-element sum
+    __m128 lo  = _mm256_castps256_ps128(sum_halves);     // sum of the low  128-bit lane
+    __m128 hi  = _mm256_extractf128_ps(sum_halves, 1);   // sum of the high 128-bit lane
+    __m128 sum = _mm_add_ps(lo, hi);                     // grand total in element 0
+    return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+    __m256d sum_halves = _mm256_hadd_pd(a, a);           // {a0+a1, a0+a1, a2+a3, a2+a3}
+    __m128d lo  = _mm256_castpd256_pd128(sum_halves);    // a0+a1
+    __m128d hi  = _mm256_extractf128_pd(sum_halves, 1);  // a2+a3
+    __m128d sum = _mm_add_pd(lo, hi);                    // grand total in element 0
+    return _mm_cvtsd_f64(sum);
+}
+
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
+
+
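
A typical use of the new intrinsic keeps all accumulation vertical inside the loop and performs a single horizontal reduction at the end. A minimal sketch, assuming the usual universal-intrinsic helpers (npyv_zero_f32, npyv_load_f32, npyv_add_f32, npyv_nlanes_f32) and a length that is a multiple of the vector width; the function name is hypothetical:

    float sum_buffer_f32(const float *src, npy_intp len)
    {
        npyv_f32 acc = npyv_zero_f32();
        for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
            acc = npyv_add_f32(acc, npyv_load_f32(src + i)); // vertical partial sums
        }
        return npyv_sum_f32(acc); // one horizontal reduction at the end
    }

Note that this changes the order of floating-point additions relative to a scalar loop, so the result may differ from left-to-right summation in the last bits.
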
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 824ae818e..7372ca29e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -129,4 +129,47 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+/***************************
+ * Reduce Sum
+ * There are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects.
+ *
+ * The second one is used by the Intel compiler, maybe because the
+ * latency of hadd increased by (2-3) starting from Skylake-X, which makes the
+ * two extra non-cross shuffles cheaper. Check https://godbolt.org/z/s3G9Er for more info.
+ *
+ * The third one generates almost the same sequence as the second one, but it is
+ * only available with the Intel compiler, GCC >= 7.1, and Clang >= 4; we still
+ * need to support older GCC versions.
+ ***************************/
+#ifdef NPY_HAVE_AVX512F_REDUCE
+ #define npyv_sum_f32 _mm512_reduce_add_ps
+ #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        __m512 h64   = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));         // bring the high 256 bits down
+        __m512 sum32 = _mm512_add_ps(a, h64);                                       // 16 -> 8 partial sums
+        __m512 h32   = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent 128-bit lanes
+        __m512 sum16 = _mm512_add_ps(sum32, h32);                                   // 8 -> 4 partial sums
+        __m512 h16   = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));           // swap 64-bit halves within the lane
+        __m512 sum8  = _mm512_add_ps(sum16, h16);                                   // 4 -> 2
+        __m512 h4    = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));            // swap adjacent elements
+        __m512 sum4  = _mm512_add_ps(sum8, h4);                                     // grand total in element 0
+        return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+    }
+    NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+    {
+        __m512d h64   = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); // bring the high 256 bits down
+        __m512d sum32 = _mm512_add_pd(a, h64);                               // 8 -> 4 partial sums
+        __m512d h32   = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));  // swap 128-bit halves within each 256
+        __m512d sum16 = _mm512_add_pd(sum32, h32);                           // 4 -> 2 partial sums
+        __m512d h16   = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));   // swap adjacent elements
+        __m512d sum8  = _mm512_add_pd(sum16, h16);                           // grand total in element 0
+        return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+    }
+#endif
+
#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
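
For comparison, strategy (1) from the comment above would look roughly like the following for f32. This is a sketch written only to make the comparison concrete, not part of the patch; it sticks to AVX512F-only extraction by going through __m512d, and the function name is hypothetical:

    NPY_FINLINE float avx512_sum_f32_split(__m512 a)
    {
        __m256 lo = _mm512_castps512_ps256(a); // low 256 bits, no instruction needed
        __m256 hi = _mm256_castpd_ps(
            _mm512_extractf64x4_pd(_mm512_castps_pd(a), 1)); // high 256 bits
        __m256 sum256 = _mm256_add_ps(lo, hi);               // 16 -> 8 partial sums
        __m128 sum128 = _mm_add_ps(
            _mm256_castps256_ps128(sum256),
            _mm256_extractf128_ps(sum256, 1));               // 8 -> 4
        sum128 = _mm_hadd_ps(sum128, sum128);                // 4 -> 2
        sum128 = _mm_hadd_ps(sum128, sum128);                // 2 -> 1
        return _mm_cvtss_f32(sum128);
    }
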
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 5eeee1bb6..bc14ffb75 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -118,4 +118,17 @@
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
#endif // NPY_SIMD_F64
+
+// Horizontal add: Calculates the sum of all vector elements.
+#if NPY_SIMD_F64
+    // AArch64 provides across-vector addition in a single instruction
+    #define npyv_sum_f32 vaddvq_f32
+    #define npyv_sum_f64 vaddvq_f64
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        // fold the high half onto the low half, then pairwise-add the remaining two lanes
+        float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+        return vget_lane_f32(vpadd_f32(r, r), 0);
+    }
+#endif
+
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
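
On AArch64 (where NPY_SIMD_F64 is set), vaddvq_f32/vaddvq_f64 reduce the whole vector in one instruction. The AArch32 fallback reduces the four lanes in two steps; as a scalar model of its addition order (illustrative only, with a hypothetical name):

    // vadd_f32(high, low) yields {a0+a2, a1+a3}; vpadd_f32 then adds the pair.
    static float neon_sum_f32_model(const float a[4])
    {
        return (a[0] + a[2]) + (a[1] + a[3]);
    }
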
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 717dacd39..8440cc52e 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -147,4 +147,31 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+#ifdef NPY_HAVE_SSE3
+    __m128 sum_halves = _mm_hadd_ps(a, a);                      // {a0+a1, a2+a3, a0+a1, a2+a3}
+    return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); // grand total in element 0
+#else
+    __m128 t1 = _mm_movehl_ps(a, a);       // move the high pair down: {a2, a3, ...}
+    __m128 t2 = _mm_add_ps(a, t1);         // {a0+a2, a1+a3, ...}
+    __m128 t3 = _mm_shuffle_ps(t2, t2, 1); // bring element 1 down to element 0
+    __m128 t4 = _mm_add_ss(t2, t3);        // grand total in element 0
+    return _mm_cvtss_f32(t4);
+#endif
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+#ifdef NPY_HAVE_SSE3
+    return _mm_cvtsd_f64(_mm_hadd_pd(a, a));                    // a0 + a1
+#else
+    return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); // a0 + a1
+#endif
+}
+
#endif // _NPY_SIMD_SSE_ARITHMETIC_H
+
+
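
A quick self-check of the SSE paths against plain scalar addition (a hypothetical test snippet, assuming this header is included and the translation unit is built with SSE2 support):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float expect = data[0] + data[1] + data[2] + data[3];
        float got = npyv_sum_f32(_mm_loadu_ps(data)); // 10.0f
        printf("expect=%g got=%g\n", expect, got);
        return got == expect ? 0 : 1;
    }
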
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 6ef007676..2f6762e63 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -116,4 +116,16 @@
#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
#define npyv_nmulsub_f64 vec_nmadd
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+    // fold the high half onto the low half: {a0+a2, a1+a3, ...}
+    npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
+    return vec_extract(sum, 0) + vec_extract(sum, 1);
+}
+
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+    // only two lanes; extract both and add them directly
+    return vec_extract(a, 0) + vec_extract(a, 1);
+}
+
#endif // _NPY_SIMD_VSX_ARITHMETIC_H
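
As with the other backends, the VSX reduction pairs lanes rather than adding left to right; a scalar model of npyv_sum_f32 above (illustrative only, with a hypothetical name):

    // npyv_combineh_f32(a, a) forms {a2, a3, a2, a3}, so vec_add leaves
    // {a0+a2, a1+a3} in the first two lanes; the result is their sum.
    static float vsx_sum_f32_model(const float a[4])
    {
        return (a[0] + a[2]) + (a[1] + a[3]);
    }
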