| author | Matti Picus <matti.picus@gmail.com> | 2021-02-03 00:10:16 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-02-03 00:10:16 +0200 |
| commit | 78cf92c19a7cfe9a07b970a2156d177a9ba627b0 (patch) | |
| tree | 54b6c69c0bc8d6acd4f76caead14c4ae2e7b088e | |
| parent | 80ffa42124541d5a82f5af67375f14efcf548110 (diff) | |
| parent | 9b2a8dee2185bcecb27ae06172082a8046b66527 (diff) | |
| download | numpy-78cf92c19a7cfe9a07b970a2156d177a9ba627b0.tar.gz | |
Merge pull request #18200 from Qiyu8/intrin-sum
ENH: Add new intrinsics sum_u8/u16/u64.
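Concretely, the patch adds `npyv_sum_u64` plus the widening variants `npyv_sumup_u8` and `npyv_sumup_u16` across the SSE, AVX2, AVX512, NEON, and VSX back-ends. `npyv_sum_##sfx` reduces all lanes of a vector to one scalar of the same width; the `sumup` variants widen as they reduce, so a full vector of u8 or u16 lanes cannot overflow the result. A minimal scalar sketch of the intended semantics (plain C; the `scalar_*` helper names and the `nlanes` parameter are illustrative, not part of the patch):

```c
#include <stdint.h>

/* Illustrative scalar reference for the new reductions; `nlanes`
 * stands in for the target's vector width (npyv_nlanes_u8, etc.). */
static inline uint64_t scalar_sum_u64(const uint64_t *lanes, int nlanes)
{
    uint64_t acc = 0;                  /* same-width accumulator */
    for (int i = 0; i < nlanes; ++i)
        acc += lanes[i];
    return acc;
}

static inline uint16_t scalar_sumup_u8(const uint8_t *lanes, int nlanes)
{
    uint16_t acc = 0;                  /* widened: u8 lanes, u16 result */
    for (int i = 0; i < nlanes; ++i)
        acc += (uint16_t)lanes[i];     /* even 64 lanes * 255 fits in u16 */
    return acc;
}
```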
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src     | 13
-rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h   | 37
-rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 58
-rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h   | 31
-rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h    | 35
-rw-r--r-- | numpy/core/src/common/simd/sse/sse.h           |  1
-rw-r--r-- | numpy/core/src/common/simd/sse/utils.h         | 19
-rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h    | 27
-rw-r--r-- | numpy/core/tests/test_simd.py                  | 21
9 files changed, 214 insertions, 28 deletions
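The x86 implementations of `npyv_sumup_u8` in the diff below all lean on the same instruction: `psadbw` (`_mm_sad_epu8`) against a zero vector sums each group of eight unsigned bytes into a 64-bit lane in one step. A self-contained SSE2 illustration of that step (the demo function name is invented for this example; only the intrinsic sequence mirrors the patch):

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Sketch: reduce 16 unsigned bytes to one 16-bit sum via SAD-against-zero,
 * mirroring the npyv_sumup_u8 implementations in the diff below. */
static inline uint16_t sumup_u8_demo(__m128i a)
{
    /* |x - 0| summed per 8-byte half -> two 64-bit partial sums */
    __m128i halves = _mm_sad_epu8(a, _mm_setzero_si128());
    /* fold the high partial sum onto the low one */
    __m128i total = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
    /* the sum of 16 bytes is at most 4080, so the low 32 bits suffice */
    return (uint16_t)_mm_cvtsi128_si32(total);
}
```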
```diff
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index af42192a9..e5b58a8d2 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -23,7 +23,8 @@
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
  * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -365,6 +366,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
 #endif // sum_sup
 
+#if @sumup_sup@
+SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
+#endif // sumup_sup
+
 /***************************
  * Math
  ***************************/
@@ -452,7 +457,8 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 0, 0, 0, 0, 1, 0, 0, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
  * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -574,6 +580,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(sum_@sfx@)
 #endif // sum_sup
 
+#if @sumup_sup@
+SIMD_INTRIN_DEF(sumup_@sfx@)
+#endif // sumup_sup
 /***************************
  * Math
  ***************************/
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 3a3a82798..4b8258759 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -5,6 +5,7 @@
 #ifndef _NPY_SIMD_AVX2_ARITHMETIC_H
 #define _NPY_SIMD_AVX2_ARITHMETIC_H
 
+#include "../sse/utils.h"
 /***************************
  * Addition
  ***************************/
@@ -117,8 +118,11 @@
 }
 #endif // !NPY_HAVE_FMA3
 
-// Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
+/***************************
+ * Summation
+ ***************************/
+// reduce sum across vector
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
 {
     __m256i s0 = _mm256_hadd_epi32(a, a);
     s0 = _mm256_hadd_epi32(s0, s0);
@@ -127,7 +131,14 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
     return _mm_cvtsi128_si32(s1);
 }
 
-NPY_FINLINE float npyv_sum_f32(__m256 a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
+{
+    __m256i two = _mm256_add_epi64(a, _mm256_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128i one = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+    return (npy_uint64)npyv128_cvtsi128_si64(one);
+}
+
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
     __m256 sum_halves = _mm256_hadd_ps(a, a);
     sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
@@ -137,7 +148,7 @@ NPY_FINLINE float npyv_sum_f32(__m256 a)
     return _mm_cvtss_f32(sum);
 }
 
-NPY_FINLINE double npyv_sum_f64(__m256d a)
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
 {
     __m256d sum_halves = _mm256_hadd_pd(a, a);
     __m128d lo = _mm256_castpd256_pd128(sum_halves);
@@ -146,6 +157,24 @@ NPY_FINLINE double npyv_sum_f64(__m256d a)
     return _mm_cvtsd_f64(sum);
 }
 
+// expand the source vector and perform sum reduce
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
+{
+    __m256i four = _mm256_sad_epu8(a, _mm256_setzero_si256());
+    __m128i two  = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+    __m128i one  = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+    return (npy_uint16)_mm_cvtsi128_si32(one);
+}
+
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
+{
+    const npyv_u16 even_mask = _mm256_set1_epi32(0x0000FFFF);
+    __m256i even  = _mm256_and_si256(a, even_mask);
+    __m256i odd   = _mm256_srli_epi32(a, 16);
+    __m256i eight = _mm256_add_epi32(even, odd);
+    return npyv_sum_u32(eight);
+}
+
 #endif // _NPY_SIMD_AVX2_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 6f668f439..450da7ea5 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -6,7 +6,7 @@
 #define _NPY_SIMD_AVX512_ARITHMETIC_H
 
 #include "../avx2/utils.h"
-
+#include "../sse/utils.h"
 /***************************
  * Addition
  ***************************/
@@ -130,7 +130,7 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
 #define npyv_nmulsub_f64 _mm512_fnmsub_pd
 
 /***************************
- * Reduce Sum: Calculates the sum of all vector elements.
+ * Summation: Calculates the sum of all vector elements.
  * there are three ways to implement reduce sum for AVX512:
  * 1- split(256) /add /split(128) /add /hadd /hadd /extract
  * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
@@ -144,19 +144,29 @@
  * The third one is almost the same as the second one but only works for
  * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
  ***************************/
-
-NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
-{
-    __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
-    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
-    quarter = _mm_hadd_epi32(quarter, quarter);
-    return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
-}
-
+// reduce sum across vector
 #ifdef NPY_HAVE_AVX512F_REDUCE
+    #define npyv_sum_u32 _mm512_reduce_add_epi32
+    #define npyv_sum_u64 _mm512_reduce_add_epi64
     #define npyv_sum_f32 _mm512_reduce_add_ps
     #define npyv_sum_f64 _mm512_reduce_add_pd
 #else
+    NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
+    {
+        __m256i half = _mm256_add_epi32(npyv512_lower_si256(a), npyv512_higher_si256(a));
+        __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
+        quarter = _mm_hadd_epi32(quarter, quarter);
+        return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
+    }
+
+    NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
+    {
+        __m256i four = _mm256_add_epi64(npyv512_lower_si256(a), npyv512_higher_si256(a));
+        __m256i two  = _mm256_add_epi64(four, _mm256_shuffle_epi32(four, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i one  = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+        return (npy_uint64)npyv128_cvtsi128_si64(one);
+    }
+
     NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
     {
         __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
@@ -169,6 +179,7 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
         __m512 sum4 = _mm512_add_ps(sum8, h4);
         return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
     }
+
     NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
     {
         __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
@@ -181,4 +192,29 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
     }
 #endif
 
+// expand the source vector and perform sum reduce
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+    __m512i eight = _mm512_sad_epu8(a, _mm512_setzero_si512());
+    __m256i four  = _mm256_add_epi16(npyv512_lower_si256(eight), npyv512_higher_si256(eight));
+#else
+    __m256i lo_four = _mm256_sad_epu8(npyv512_lower_si256(a), _mm256_setzero_si256());
+    __m256i hi_four = _mm256_sad_epu8(npyv512_higher_si256(a), _mm256_setzero_si256());
+    __m256i four    = _mm256_add_epi16(lo_four, hi_four);
+#endif
+    __m128i two = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+    __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+    return (npy_uint16)_mm_cvtsi128_si32(one);
+}
+
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
+{
+    const npyv_u16 even_mask = _mm512_set1_epi32(0x0000FFFF);
+    __m512i even = _mm512_and_si512(a, even_mask);
+    __m512i odd  = _mm512_srli_epi32(a, 16);
+    __m512i ff   = _mm512_add_epi32(even, odd);
+    return npyv_sum_u32(ff);
+}
+
 #endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 1c8bde15a..69a49f571 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -131,12 +131,21 @@
 { return vfmsq_f64(vnegq_f64(c), a, b); }
 #endif // NPY_SIMD_F64
 
-// Horizontal add: Calculates the sum of all vector elements.
+/***************************
+ * Summation
+ ***************************/
+// reduce sum across vector
 #if NPY_SIMD_F64
     #define npyv_sum_u32 vaddvq_u32
+    #define npyv_sum_u64 vaddvq_u64
     #define npyv_sum_f32 vaddvq_f32
     #define npyv_sum_f64 vaddvq_f64
 #else
+    NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
+    {
+        return vget_lane_u64(vadd_u64(vget_low_u64(a), vget_high_u64(a)), 0);
+    }
+
     NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
     {
         uint32x2_t a0 = vpadd_u32(vget_low_u32(a), vget_high_u32(a));
@@ -150,4 +159,24 @@
     }
 #endif
 
+// expand the source vector and perform sum reduce
+#if NPY_SIMD_F64
+    #define npyv_sumup_u8  vaddlvq_u8
+    #define npyv_sumup_u16 vaddlvq_u16
+#else
+    NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
+    {
+        uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a));
+        uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+        return vget_lane_u32(vpadd_u32(t1, t1), 0);
+    }
+
+    NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
+    {
+        uint32x4_t t0 = vpaddlq_u16(a);
+        uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
+        return vget_lane_u32(vpadd_u32(t1, t1), 0);
+    }
+#endif
+
 #endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index faf5685d9..c21b7da2d 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -148,16 +148,24 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
 }
 #endif // !NPY_HAVE_FMA3
 
-// Horizontal add: Calculates the sum of all vector elements.
-
-NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a)
+/***************************
+ * Summation
+ ***************************/
+// reduce sum across vector
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
 {
     __m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8));
     t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
     return (unsigned)_mm_cvtsi128_si32(t);
 }
 
-NPY_FINLINE float npyv_sum_f32(__m128 a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
+{
+    __m128i one = _mm_add_epi64(a, _mm_unpackhi_epi64(a, a));
+    return (npy_uint64)npyv128_cvtsi128_si64(one);
+}
+
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
 #ifdef NPY_HAVE_SSE3
     __m128 sum_halves = _mm_hadd_ps(a, a);
@@ -171,7 +179,7 @@ NPY_FINLINE float npyv_sum_f32(__m128 a)
 #endif
 }
 
-NPY_FINLINE double npyv_sum_f64(__m128d a)
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
 {
 #ifdef NPY_HAVE_SSE3
     return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
@@ -180,6 +188,23 @@ NPY_FINLINE double npyv_sum_f64(__m128d a)
 #endif
 }
 
+// expand the source vector and perform sum reduce
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
+{
+    __m128i two = _mm_sad_epu8(a, _mm_setzero_si128());
+    __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+    return (npy_uint16)_mm_cvtsi128_si32(one);
+}
+
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
+{
+    const __m128i even_mask = _mm_set1_epi32(0x0000FFFF);
+    __m128i even = _mm_and_si128(a, even_mask);
+    __m128i odd  = _mm_srli_epi32(a, 16);
+    __m128i four = _mm_add_epi32(even, odd);
+    return npyv_sum_u32(four);
+}
+
 #endif // _NPY_SIMD_SSE_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index dc0b62f73..0bb404312 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -62,6 +62,7 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
 #define npyv_nlanes_f32 4
 #define npyv_nlanes_f64 2
 
+#include "utils.h"
 #include "memory.h"
 #include "misc.h"
 #include "reorder.h"
diff --git a/numpy/core/src/common/simd/sse/utils.h b/numpy/core/src/common/simd/sse/utils.h
new file mode 100644
index 000000000..c23def11d
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/utils.h
@@ -0,0 +1,19 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_UTILS_H
+#define _NPY_SIMD_SSE_UTILS_H
+
+#if !defined(__x86_64__) && !defined(_M_X64)
+NPY_FINLINE npy_int64 npyv128_cvtsi128_si64(__m128i a)
+{
+    npy_int64 NPY_DECL_ALIGNED(16) idx[2];
+    _mm_store_si128((__m128i *)idx, a);
+    return idx[0];
+}
+#else
+    #define npyv128_cvtsi128_si64 _mm_cvtsi128_si64
+#endif
+
+#endif // _NPY_SIMD_SSE_UTILS_H
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 1288a52a7..7c4e32f27 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -116,7 +116,14 @@
 #define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
 #define npyv_nmulsub_f64 vec_nmadd
 
-// Horizontal add: Calculates the sum of all vector elements.
+/***************************
+ * Summation
+ ***************************/
+// reduce sum across vector
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
+{
+    return vec_extract(vec_add(a, vec_mergel(a, a)), 0);
+}
 
 NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
 {
@@ -135,4 +142,22 @@ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
     return vec_extract(a, 0) + vec_extract(a, 1);
 }
 
+// expand the source vector and perform sum reduce
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
+{
+    const npyv_u32 zero = npyv_zero_u32();
+    npyv_u32 four = vec_sum4s(a, zero);
+    npyv_s32 one  = vec_sums((npyv_s32)four, (npyv_s32)zero);
+    return (npy_uint16)vec_extract(one, 3);
+}
+
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
+{
+    const npyv_s32 zero = npyv_zero_s32();
+    npyv_u32x2 eight = npyv_expand_u32_u16(a);
+    npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
+    npyv_s32 one  = vec_sums((npyv_s32)four, zero);
+    return (npy_uint32)vec_extract(one, 3);
+}
+
 #endif // _NPY_SIMD_VSX_ARITHMETIC_H
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 23a5bb6c3..1d1a111be 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -736,11 +736,9 @@ class _SIMD_ALL(_Test_Utility):
     def test_arithmetic_reduce_sum(self):
         """
         Test reduce sum intrinsics:
-            npyv_sum_u32
-            npyv_sum_f32
-            npyv_sum_f64
+            npyv_sum_##sfx
         """
-        if self.sfx not in ("u32", "f32", "f64"):
+        if self.sfx not in ("u32", "u64", "f32", "f64"):
             return
         # reduce sum
         data = self._data()
@@ -750,6 +748,21 @@ class _SIMD_ALL(_Test_Utility):
         vsum = self.sum(vdata)
         assert vsum == data_sum
 
+    def test_arithmetic_reduce_sumup(self):
+        """
+        Test extended reduce sum intrinsics:
+            npyv_sumup_##sfx
+        """
+        if self.sfx not in ("u8", "u16"):
+            return
+        rdata = (0, self.nlanes, self._int_min(), self._int_max() - self.nlanes)
+        for r in rdata:
+            data = self._data(r)
+            vdata = self.load(data)
+            data_sum = sum(data)
+            vsum = self.sumup(vdata)
+            assert vsum == data_sum
+
     def test_mask_conditional(self):
         """
         Conditional addition and subtraction for all supported data types.
```
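For u16 lanes there is no SAD shortcut, so each port widens by splitting the even and odd 16-bit lanes into 32-bit lanes and then reuses the existing `npyv_sum_u32`. A self-contained SSE2 sketch of that step (the demo function name is invented; the mask/shift/add sequence follows the diff above):

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Sketch: reduce eight u16 lanes to one u32 sum, as npyv_sumup_u16 does. */
static inline uint32_t sumup_u16_demo(__m128i a)
{
    const __m128i even_mask = _mm_set1_epi32(0x0000FFFF);
    __m128i even = _mm_and_si128(a, even_mask); /* u16 lanes 0,2,4,6 as u32 */
    __m128i odd  = _mm_srli_epi32(a, 16);       /* u16 lanes 1,3,5,7 as u32 */
    __m128i four = _mm_add_epi32(even, odd);    /* four u32 partial sums */
    /* horizontal reduce of the four u32 lanes (same steps as npyv_sum_u32) */
    __m128i t = _mm_add_epi32(four, _mm_srli_si128(four, 8));
    t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
    return (uint32_t)_mm_cvtsi128_si32(t);
}
```

Splitting even and odd lanes keeps every partial sum in a 32-bit lane, so no pair of u16 values can overflow before the final reduction. The new `sse/utils.h` covers a related detail: `_mm_cvtsi128_si64` is only available on 64-bit x86 targets, so `npyv128_cvtsi128_si64` falls back to an aligned store plus scalar load on 32-bit builds, and the SSE/AVX2 `npyv_sum_u64` implementations extract their final result through it.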