diff options
author | Qiyu8 <fangchunlin@huawei.com> | 2020-11-03 17:04:35 +0800 |
---|---|---|
committer | Qiyu8 <fangchunlin@huawei.com> | 2020-11-03 17:04:35 +0800 |
commit | 1f0298d62853e5233b0b829b08a11c160f0b6597 (patch) | |
tree | 7de626d4dda5c183d8d879bbf2b44296c622095f /numpy/core/src | |
parent | e17cdf56c4d40bcebb0a272af573be94e7286fa0 (diff) | |
download | numpy-1f0298d62853e5233b0b829b08a11c160f0b6597.tar.gz |
improve intrinsics and add sum intrinsic test
Diffstat (limited to 'numpy/core/src')
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 10 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 5 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 4 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 18 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 5 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 8 |
6 files changed, 33 insertions, 17 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 2d89b9df0..3d7af2333 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -20,6 +20,7 @@ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# @@ -351,6 +352,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ #endif // fused_sup +#if @sum_sup@ +SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@) +#endif // sum_sup + #endif // simd_sup /**end repeat**/ /*************************** @@ -370,6 +375,7 @@ static PyMethodDef simd__intrinsics_methods[] = { * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1# * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# + * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1# * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1# * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0# * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0# @@ -484,6 +490,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ #endif // fused_sup +#if @sum_sup@ +SIMD_INTRIN_DEF(sum_@sfx@) +#endif // sum_sup + #endif // simd_sup /**end repeat**/ diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index 8b98394e0..3a6dc9535 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -116,7 +116,6 @@ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); } #endif // !NPY_HAVE_FMA3 -#endif // _NPY_SIMD_AVX2_ARITHMETIC_H // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m256 a) @@ -137,3 +136,7 @@ NPY_FINLINE double npyv_sum_f64(__m256d a) __m128d sum = _mm_add_pd(lo, hi); return _mm_cvtsd_f64(sum); } + +#endif // _NPY_SIMD_AVX2_ARITHMETIC_H + + diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 989e10f39..7372ca29e 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -129,8 +129,6 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) #define npyv_nmulsub_f32 _mm512_fnmsub_ps #define npyv_nmulsub_f64 _mm512_fnmsub_pd -#endif // _NPY_SIMD_AVX512_ARITHMETIC_H - /*************************** * Reduce Sum * there are three ways to implement reduce sum for AVX512: @@ -173,3 +171,5 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); } #endif + +#endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 2d56adfbd..bc14ffb75 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -118,17 +118,17 @@ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(vnegq_f64(c), a, b); } #endif // NPY_SIMD_F64 -#endif // _NPY_SIMD_NEON_ARITHMETIC_H // Horizontal add: Calculates the sum of all vector elements. -NPY_FINLINE float npyv_sum_f32(float32x4_t a) -{ - float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); - return vget_lane_f32(vpadd_f32(r, r), 0); -} -#ifdef __aarch64__ - NPY_FINLINE double npyv_sum_f64(float64x2_t a) +#if NPY_SIMD_F64 + #define npyv_sum_f32 vaddvq_f32 + #define npyv_sum_f64 vaddvq_f64 +#else + NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { - return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); } #endif + +#endif // _NPY_SIMD_NEON_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h index cf33349fc..8440cc52e 100644 --- a/numpy/core/src/common/simd/sse/arithmetic.h +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -147,7 +147,6 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); } #endif // !NPY_HAVE_FMA3 -#endif // _NPY_SIMD_SSE_ARITHMETIC_H // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(__m128 a) @@ -172,3 +171,7 @@ NPY_FINLINE double npyv_sum_f64(__m128d a) return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); #endif } + +#endif // _NPY_SIMD_SSE_ARITHMETIC_H + + diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h index 755aa1ca3..2f6762e63 100644 --- a/numpy/core/src/common/simd/vsx/arithmetic.h +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -116,16 +116,16 @@ #define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c) #define npyv_nmulsub_f64 vec_nmadd -#endif // _NPY_SIMD_VSX_ARITHMETIC_H - // Horizontal add: Calculates the sum of all vector elements. NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { - return vec_extract(a, 0) + vec_extract(a, 1) + - vec_extract(a, 2) + vec_extract(a, 3); + npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); + return vec_extract(sum, 0) + vec_extract(sum, 1); } NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { return vec_extract(a, 0) + vec_extract(a, 1); } + +#endif // _NPY_SIMD_VSX_ARITHMETIC_H |