author | Matti Picus <matti.picus@gmail.com> | 2020-11-03 13:29:03 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-03 13:29:03 +0200 |
commit | 671e8a02aeed0e823e9380557cad3c20ba30a3cb (patch) | |
tree | 2d277bf940f9afc414e914c4f726f3d4d8a63108 /numpy/core/src | |
parent | 1a12887866d06bd8b24d07f7109cb08e7e82e12d (diff) | |
parent | 1f0298d62853e5233b0b829b08a11c160f0b6597 (diff) | |
download | numpy-671e8a02aeed0e823e9380557cad3c20ba30a3cb.tar.gz |
Merge pull request #17681 from Qiyu8/sum_intrinsic
SIMD: Add sum intrinsics for float/double.
Diffstat (limited to 'numpy/core/src')
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 10
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 23
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 43
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 13
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 27
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 12
6 files changed, 128 insertions, 0 deletions
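
The new npyv_sum_f32/npyv_sum_f64 intrinsics collapse all lanes of a vector into one scalar, which is the usual final step of a vectorized accumulation. A minimal caller-side sketch (not part of this patch), assuming the existing universal-intrinsic helpers npyv_zero_f32, npyv_load_f32, npyv_add_f32 and the lane count npyv_nlanes_f32, with loop-remainder handling omitted:

    #include "simd/simd.h"  // NumPy universal intrinsics (npyv_*); path assumed relative to common/

    // Hypothetical helper: sum `len` floats, `len` assumed to be a multiple of npyv_nlanes_f32.
    static float sum_contig_f32(const float *src, npy_intp len)
    {
        npyv_f32 acc = npyv_zero_f32();                       // all lanes start at 0.0f
        for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
            acc = npyv_add_f32(acc, npyv_load_f32(src + i));  // vertical (per-lane) adds
        }
        return npyv_sum_f32(acc);                             // new horizontal reduce-sum
    }
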
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 2d89b9df0..3d7af2333 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -20,6 +20,7 @@
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -351,6 +352,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
 #endif // fused_sup
 
+#if @sum_sup@
+SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
+#endif // sum_sup
+
 #endif // simd_sup
 /**end repeat**/
 /***************************
@@ -370,6 +375,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
  * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -484,6 +490,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
 #endif // fused_sup
 
+#if @sum_sup@
+SIMD_INTRIN_DEF(sum_@sfx@)
+#endif // sum_sup
+
 #endif // simd_sup
 /**end repeat**/
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 4af9e4d17..3a6dc9535 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -116,4 +116,27 @@
     return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
 }
 #endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+    __m256 sum_halves = _mm256_hadd_ps(a, a);
+    sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
+    __m128 lo = _mm256_castps256_ps128(sum_halves);
+    __m128 hi = _mm256_extractf128_ps(sum_halves, 1);
+    __m128 sum = _mm_add_ps(lo, hi);
+    return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+    __m256d sum_halves = _mm256_hadd_pd(a, a);
+    __m128d lo = _mm256_castpd256_pd128(sum_halves);
+    __m128d hi = _mm256_extractf128_pd(sum_halves, 1);
+    __m128d sum = _mm_add_pd(lo, hi);
+    return _mm_cvtsd_f64(sum);
+}
+
 #endif // _NPY_SIMD_AVX2_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 824ae818e..7372ca29e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -129,4 +129,47 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 #define npyv_nmulsub_f32 _mm512_fnmsub_ps
 #define npyv_nmulsub_f64 _mm512_fnmsub_pd
 
+/***************************
+ * Reduce Sum
+ * There are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects.
+ *
+ * The second one is used by the Intel Compiler, probably because the
+ * latency of hadd increased by (2-3) starting from Skylake-X, which makes the two
+ * extra (non-cross) shuffles cheaper. Check https://godbolt.org/z/s3G9Er for more info.
+ *
+ * The third one is almost the same as the second one, but it only works with the
+ * Intel Compiler / GCC 7.1 / Clang 4, and we still need to support older GCC.
+ ***************************/
+#ifdef NPY_HAVE_AVX512F_REDUCE
+    #define npyv_sum_f32 _mm512_reduce_add_ps
+    #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512 sum32 = _mm512_add_ps(a, h64);
+        __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum16 = _mm512_add_ps(sum32, h32);
+        __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512 sum8 = _mm512_add_ps(sum16, h16);
+        __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512 sum4 = _mm512_add_ps(sum8, h4);
+        return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+    }
+    NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+    {
+        __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+        __m512d sum32 = _mm512_add_pd(a, h64);
+        __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
+        __m512d sum16 = _mm512_add_pd(sum32, h32);
+        __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
+        __m512d sum8 = _mm512_add_pd(sum16, h16);
+        return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+    }
+#endif
+
 #endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 5eeee1bb6..bc14ffb75 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -118,4 +118,17 @@ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
 { return vfmsq_f64(vnegq_f64(c), a, b); }
 #endif // NPY_SIMD_F64
+
+// Horizontal add: Calculates the sum of all vector elements.
+#if NPY_SIMD_F64
+    #define npyv_sum_f32 vaddvq_f32
+    #define npyv_sum_f64 vaddvq_f64
+#else
+    NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+    {
+        float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+        return vget_lane_f32(vpadd_f32(r, r), 0);
+    }
+#endif
+
 #endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 717dacd39..8440cc52e 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -147,4 +147,31 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
     return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
 }
 #endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+#ifdef NPY_HAVE_SSE3
+    __m128 sum_halves = _mm_hadd_ps(a, a);
+    return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves));
+#else
+    __m128 t1 = _mm_movehl_ps(a, a);
+    __m128 t2 = _mm_add_ps(a, t1);
+    __m128 t3 = _mm_shuffle_ps(t2, t2, 1);
+    __m128 t4 = _mm_add_ss(t2, t3);
+    return _mm_cvtss_f32(t4);
+#endif
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+#ifdef NPY_HAVE_SSE3
+    return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
+#else
+    return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
+#endif
+}
+
 #endif // _NPY_SIMD_SSE_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 6ef007676..2f6762e63 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -116,4 +116,16 @@
 #define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
 #define npyv_nmulsub_f64 vec_nmadd
 
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+    npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
+    return vec_extract(sum, 0) + vec_extract(sum, 1);
+}
+
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+    return vec_extract(a, 0) + vec_extract(a, 1);
+}
+
 #endif // _NPY_SIMD_VSX_ARITHMETIC_H
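
The shuffle-based paths above (the AVX512 fallback and the non-SSE3 SSE path) all follow the same log2(lanes) pattern: add the upper half of the vector onto the lower half, then repeat on the surviving half until one lane remains. A plain-C sketch of that reduction order, purely for illustration and not taken from the patch:

    // Scalar model of the pairwise-halving reduction used by the shuffle-based
    // npyv_sum_* fallbacks; `n` is the lane count and must be a power of two.
    static float halving_sum_f32(float lanes[], int n)
    {
        for (int half = n / 2; half > 0; half /= 2) {   // e.g. 16 -> 8 -> 4 -> 2 -> 1 for AVX512 f32
            for (int i = 0; i < half; i++) {
                lanes[i] += lanes[i + half];            // corresponds to one shuffle + add step
            }
        }
        return lanes[0];
    }

Note that this pairwise order differs from a straight left-to-right scalar sum, and since float addition is not associative the SIMD result can differ from a scalar sum in the last bits.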