| author | Qiyu8 <fangchunlin@huawei.com> | 2021-01-22 16:30:21 +0800 |
|---|---|---|
| committer | Qiyu8 <fangchunlin@huawei.com> | 2021-01-22 16:30:21 +0800 |
| commit | 617193440550cd50b7426a2c2008fdb60944c226 (patch) | |
| tree | 7d0421ea40c4a8ffdeef2310fab7abbdb9f1e615 /numpy | |
| parent | 5a528df2e2c3e8195e75253f75f9d98a64e6bc58 (diff) | |
| download | numpy-617193440550cd50b7426a2c2008fdb60944c226.tar.gz | |
treat u8/u16 as overflow protection.
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 15 |
| -rw-r--r-- | numpy/core/src/common/simd/avx2/arithmetic.h | 29 |
| -rw-r--r-- | numpy/core/src/common/simd/avx512/arithmetic.h | 41 |
| -rw-r--r-- | numpy/core/src/common/simd/neon/arithmetic.h | 8 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/arithmetic.h | 8 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/sse.h | 1 |
| -rw-r--r-- | numpy/core/src/common/simd/sse/utils.h | 19 |
| -rw-r--r-- | numpy/core/src/common/simd/vsx/arithmetic.h | 21 |
| -rw-r--r-- | numpy/core/tests/test_simd.py | 20 |
9 files changed, 110 insertions, 52 deletions
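The commit message is terse, so a note on what "overflow protection" means here: the existing `npyv_sum_*` reductions keep their natural result width, while the renamed `npyv_sumup_u8`/`npyv_sumup_u16` intrinsics accumulate into a wider type (u8 lanes into a u16 result, u16 lanes into a u32 result, per the `#sum_ret` row in the dispatch template below), so even a full vector of maximal lane values cannot wrap. A minimal scalar sketch of the intended semantics, not NumPy code:

```c
/* Scalar model of the sumup semantics; illustrative only. The real
 * intrinsics reduce a whole SIMD register per call. */
#include <stdint.h>
#include <stddef.h>

/* u8 lanes -> u16 result: even 64 lanes of 255 (AVX-512) give 16320 < 65535 */
static uint16_t scalar_sumup_u8(const uint8_t *lanes, size_t nlanes)
{
    uint16_t acc = 0;   /* wider than the lane type, so no wrap-around */
    for (size_t i = 0; i < nlanes; i++)
        acc = (uint16_t)(acc + lanes[i]);
    return acc;
}

/* u16 lanes -> u32 result: 32 lanes of 65535 give 2097120, well below 2^32 */
static uint32_t scalar_sumup_u16(const uint16_t *lanes, size_t nlanes)
{
    uint32_t acc = 0;
    for (size_t i = 0; i < nlanes; i++)
        acc += lanes[i];
    return acc;
}
```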
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index a861cd944..49378b518 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -16,7 +16,7 @@
  * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
  * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
  * #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
- * #sum_ret = u32, 0, u32, 0, u32, 0, u64, 0, f32, f64#
+ * #sum_ret = u16, 0, u32, 0, u32, 0, u64, 0, f32, f64#
  * #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
  * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
  * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -24,7 +24,8 @@
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 1, 0, 1, 0, 1, 0, 1, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
  * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -366,6 +367,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
 SIMD_IMPL_INTRIN_1(sum_@sfx@, @sum_ret@, v@sfx@)
 #endif // sum_sup
 
+#if @sumup_sup@
+SIMD_IMPL_INTRIN_1(sumup_@sfx@, @sum_ret@, v@sfx@)
+#endif // sumup_sup
+
 /***************************
  * Math
  ***************************/
@@ -453,7 +458,8 @@ static PyMethodDef simd__intrinsics_methods[] = {
  * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
  * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
  * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 1, 0, 1, 0, 1, 0, 1, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
  * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
  * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
  * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -575,6 +581,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
 SIMD_INTRIN_DEF(sum_@sfx@)
 #endif // sum_sup
 
+#if @sumup_sup@
+SIMD_INTRIN_DEF(sumup_@sfx@)
+#endif // sumup_sup
 /***************************
  * Math
  ***************************/
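With the template rows above, the generated `_simd` module exposes `sumup_u8`/`sumup_u16` only for the unsigned 8/16-bit suffixes, while `sum_` keeps the wider suffixes. As a hedged usage sketch (not part of this commit), a kernel built inside NumPy's universal-intrinsics framework could reduce a byte buffer as below; `sum_bytes` is a hypothetical helper name:

```c
/* Hypothetical consumer of the new intrinsic; assumes it is compiled within
 * NumPy's SIMD framework so "simd/simd.h" and the npy_*/npyv_* types are
 * available. */
#include "simd/simd.h"

static npy_uint64 sum_bytes(const npy_uint8 *src, npy_intp len)
{
    npy_uint64 total = 0;
#if NPY_SIMD
    const int vstep = npyv_nlanes_u8;
    for (; len >= vstep; len -= vstep, src += vstep) {
        npyv_u8 v = npyv_load_u8(src);
        /* per-vector reduction returns npy_uint16, so it cannot wrap */
        total += npyv_sumup_u8(v);
    }
#endif
    for (; len > 0; --len, ++src) {
        total += *src;   /* scalar tail */
    }
    return total;
}
```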
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index c2153718d..770d4230c 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -5,6 +5,7 @@
 #ifndef _NPY_SIMD_AVX2_ARITHMETIC_H
 #define _NPY_SIMD_AVX2_ARITHMETIC_H
 
+#include "../sse/utils.h"
 /***************************
  * Addition
  ***************************/
@@ -119,11 +120,12 @@
 // Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m256i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
 {
-    __m256i half = _mm256_sad_epu8(a, _mm256_setzero_si256());
-    __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
-    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+    __m256i four = _mm256_sad_epu8(a, _mm256_setzero_si256());
+    __m128i two = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+    __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+    return (npy_uint16)_mm_cvtsi128_si32(one);
 }
 
 NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
@@ -135,20 +137,23 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
     return _mm_cvtsi128_si32(s1);
 }
 
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m256i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
 {
-    npyv_u32x2 res = npyv_expand_u32_u16(a);
-    return (unsigned)npyv_sum_u32(_mm256_add_epi32(res.val[0], res.val[1]));
+    const npyv_u16 even_mask = _mm256_set1_epi32(0x0000FFFF);
+    __m256i even = _mm256_and_si256(a, even_mask);
+    __m256i odd = _mm256_srli_epi32(a, 16);
+    __m256i eight = _mm256_add_epi32(even, odd);
+    return npyv_sum_u32(eight);
 }
 
-NPY_FINLINE npy_uint64 npyv_sum_u64(__m256i a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
 {
-    npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
-    _mm_store_si128((__m128i*)idx, _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)));
-    return idx[0] + idx[1];
+    __m256i two = _mm256_add_epi64(a, _mm256_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+    __m128i one = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+    return (npy_uint64)npyv128_cvtsi128_si64(one);
 }
 
-NPY_FINLINE float npyv_sum_f32(__m256 a)
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
 {
     __m256 sum_halves = _mm256_hadd_ps(a, a);
     sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
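Two tricks carry the AVX2 (and AVX-512) paths above: `_mm256_sad_epu8` against an all-zero vector leaves, in each 64-bit lane, the sum of that group's eight unsigned bytes, so only a couple of cross-lane adds remain; and the u16 path splits even/odd 16-bit lanes into 32-bit lanes (mask and shift) before handing off to `npyv_sum_u32`. A scalar model of both, for illustration only:

```c
/* Scalar model of the two reduction tricks; illustrative only. */
#include <stdint.h>

/* SAD against zero: |b - 0| == b for unsigned bytes, so each 8-byte group
 * collapses to the sum of its bytes (max 8 * 255 = 2040, far below 2^16). */
static uint16_t sad_group_vs_zero(const uint8_t group[8])
{
    uint16_t s = 0;
    for (int i = 0; i < 8; i++)
        s = (uint16_t)(s + group[i]);
    return s;
}

/* Even/odd widening: every 32-bit lane holds two u16 values; the even one is
 * masked out (x & 0x0000FFFF) and the odd one shifted down (x >> 16), then
 * both are summed as u32 so no 16-bit overflow can occur. */
static uint32_t even_odd_sum_u16(const uint16_t *lanes, int nlanes)
{
    uint32_t sum = 0;
    for (int i = 0; i < nlanes; i += 2)
        sum += (uint32_t)lanes[i] + (uint32_t)lanes[i + 1];
    return sum;
}
```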
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 69c3caf93..ea7dc0c3c 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -6,7 +6,7 @@
 #define _NPY_SIMD_AVX512_ARITHMETIC_H
 
 #include "../avx2/utils.h"
-
+#include "../sse/utils.h"
 /***************************
  * Addition
  ***************************/
@@ -145,15 +145,19 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
  * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
  ***************************/
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
 {
-    npyv_u16x2 res = npyv_expand_u16_u8(a);
-    __m512i a16 = npyv_add_u16(res.val[0], res.val[1]);
-    a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(npyv512_lower_si256(a16), npyv512_higher_si256(a16)));
-    __m256i a8 = _mm256_add_epi32(npyv512_lower_si256(a16), npyv512_higher_si256(a16));
-    __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1));
-    a4 = _mm_hadd_epi32(a4, a4);
-    return (npy_uint32)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4));
+#ifdef NPY_HAVE_AVX512BW
+    __m512i eight = _mm512_sad_epu8(a, _mm512_setzero_si512());
+    __m256i four = _mm256_add_epi16(npyv512_lower_si256(eight), npyv512_higher_si256(eight));
+#else
+    __m256i lo_four = _mm256_sad_epu8(npyv512_lower_si256(a), _mm256_setzero_si256());
+    __m256i hi_four = _mm256_sad_epu8(npyv512_higher_si256(a), _mm256_setzero_si256());
+    __m256i four = _mm256_add_epi16(lo_four, hi_four);
+#endif
+    __m128i two = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+    __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+    return (npy_uint16)_mm_cvtsi128_si32(one);
 }
 
 #ifdef NPY_HAVE_AVX512F_REDUCE
@@ -171,12 +175,12 @@ NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
         return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
     }
 
-    NPY_FINLINE npy_uint64 npyv_sum_u64(__m512i a)
+    NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
     {
-        npy_uint64 NPY_DECL_ALIGNED(64) idx[2];
-        __m256i half = _mm256_add_epi64(npyv512_lower_si256(a), npyv512_higher_si256(a));
-        _mm_store_si128((__m128i*)idx, _mm_add_epi64(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)));
-        return idx[0] + idx[1];
+        __m256i four = _mm256_add_epi64(npyv512_lower_si256(a), npyv512_higher_si256(a));
+        __m256i two = _mm256_add_epi64(four, _mm256_shuffle_epi32(four, _MM_SHUFFLE(1, 0, 3, 2)));
+        __m128i one = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+        return (npy_uint64)npyv128_cvtsi128_si64(one);
     }
 
     NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
@@ -204,10 +208,13 @@ NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
 }
 #endif
 
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m512i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
 {
-    npyv_u32x2 res = npyv_expand_u32_u16(a);
-    return (unsigned)npyv_sum_u32(_mm512_add_epi32(res.val[0], res.val[1]));
+    const npyv_u16 even_mask = _mm512_set1_epi32(0x0000FFFF);
+    __m512i even = _mm512_and_si512(a, even_mask);
+    __m512i odd = _mm512_srli_epi32(a, 16);
+    __m512i ff = _mm512_add_epi32(even, odd);
+    return npyv_sum_u32(ff);
 }
 
 #endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 90f62d063..81207ea5e 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -133,22 +133,22 @@
 // Horizontal add: Calculates the sum of all vector elements.
 #if NPY_SIMD_F64
-    #define npyv_sum_u8 vaddvq_u8
-    #define npyv_sum_u16 vaddvq_u16
+    #define npyv_sumup_u8 vaddlvq_u8
+    #define npyv_sumup_u16 vaddlvq_u16
     #define npyv_sum_u32 vaddvq_u32
     #define npyv_sum_u64 vaddvq_u64
     #define npyv_sum_f32 vaddvq_f32
     #define npyv_sum_f64 vaddvq_f64
 #else
-    NPY_FINLINE npy_uint32 npyv_sum_u8(npyv_u8 a)
+    NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
     {
         uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a));
         uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
         return vget_lane_u32(vpadd_u32(t1, t1), 0);
     }
 
-    NPY_FINLINE npy_uint32 npyv_sum_u16(npyv_u16 a)
+    NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
     {
         uint32x4_t t0 = vpaddlq_u16(a);
         uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 7dba6ea8c..968de7545 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -150,26 +150,26 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
 // Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m128i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
 {
     __m128i half = _mm_sad_epu8(a, _mm_setzero_si128());
     return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
 }
 
-NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a)
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
 {
     __m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8));
     t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
     return (unsigned)_mm_cvtsi128_si32(t);
 }
 
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m128i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
 {
     npyv_u32x2 res = npyv_expand_u32_u16(a);
     return (unsigned)npyv_sum_u32(_mm_add_epi32(res.val[0], res.val[1]));
 }
 
-NPY_FINLINE npy_uint64 npyv_sum_u64(__m128i a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
 {
     npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
     npyv_storea_u64(idx, a);
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index dc0b62f73..0bb404312 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -62,6 +62,7 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
 #define npyv_nlanes_f32 4
 #define npyv_nlanes_f64 2
 
+#include "utils.h"
 #include "memory.h"
 #include "misc.h"
 #include "reorder.h"
diff --git a/numpy/core/src/common/simd/sse/utils.h b/numpy/core/src/common/simd/sse/utils.h
new file mode 100644
index 000000000..fbb969377
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/utils.h
@@ -0,0 +1,19 @@
+#ifndef NPY_SIMD
+    #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_UTILS_H
+#define _NPY_SIMD_SSE_UTILS_H
+
+#if !defined(__x86_64__) && !defined(_M_X64)
+NPY_FINLINE npy_uint64 npyv128_cvtsi128_si64(npyv_u64 a)
+{
+    npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
+    npyv_storea_u64(idx, a);
+    return idx[0];
+}
+#else
+    #define npyv128_cvtsi128_si64 _mm_cvtsi128_si64
+#endif
+
+#endif // _NPY_SIMD_SSE_UTILS_H
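The new `sse/utils.h` exists because `_mm_cvtsi128_si64` is only available when targeting x86-64; 32-bit builds have to bounce the low 64-bit lane through memory instead, which is what `npyv128_cvtsi128_si64` does above. A standalone sketch of the same idea using plain SSE2 (the function name here is illustrative, not NumPy's):

```c
/* Standalone illustration of the 64-bit extract fallback; compile with SSE2. */
#include <emmintrin.h>
#include <stdint.h>

static uint64_t extract_low_u64(__m128i a)
{
#if defined(__x86_64__) || defined(_M_X64)
    return (uint64_t)_mm_cvtsi128_si64(a);   /* direct register move on x86-64 */
#else
    uint64_t lanes[2];
    _mm_storeu_si128((__m128i *)lanes, a);   /* spill, then read the low lane */
    return lanes[0];
#endif
}
```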
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index fa114389a..30d77c6fe 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -118,23 +118,26 @@
 // Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(npyv_u8 a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
 {
-    const npyv_u32 zero4 = npyv_zero_u32();
-    npyv_u32 sum4 = vec_sum4s(a, zero4);
-    return (npy_uint32)vec_extract(vec_sums(sum4, zero4), 3);
+    const npyv_u32 zero = npyv_zero_u32();
+    npyv_u32 four = vec_sum4s(a, zero);
+    npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
+    return (npy_uint16)vec_extract(one, 3);
 }
 
-NPY_FINLINE npy_uint32 npyv_sum_u16(npyv_u16 a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
 {
-    const npyv_u32 zero4 = npyv_zero_u32();
-    const npyv_u32 v4 = vec_mergeh(vec_adds(a, vec_sld(a, a, 8)), zero4);
-    return vec_extract(vec_sums(v4, zero4), 3);
+    const npyv_s32 zero = npyv_zero_s32();
+    npyv_u32x2 eight = npyv_expand_u32_u16(a);
+    npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
+    npyv_s32 one = vec_sums((npyv_s32)four, zero);
+    return (npy_uint32)vec_extract(one, 3);
 }
 
 NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
 {
-    return vec_extract(vec_add(a, vec_permi(a, a, 3)), 0);
+    return vec_extract(vec_add(a, vec_mergel(a, a)), 0);
 }
 
 NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index ae98c47f7..c67e44fa7 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -736,14 +736,12 @@ class _SIMD_ALL(_Test_Utility):
     def test_arithmetic_reduce_sum(self):
         """
         Test reduce sum intrinics:
-            npyv_sum_u8
-            npyv_sum_u16
             npyv_sum_u32
             npyv_sum_u64
             npyv_sum_f32
             npyv_sum_f64
         """
-        if self.sfx not in ("u8", "u16", "u32", "u64", "f32", "f64"):
+        if self.sfx not in ("u32", "u64", "f32", "f64"):
             return
         # reduce sum
         data = self._data()
@@ -753,6 +751,22 @@ class _SIMD_ALL(_Test_Utility):
         vsum = self.sum(vdata)
         assert vsum == data_sum
 
+    def test_arithmetic_reduce_sumup(self):
+        """
+        Test overflow protect reduce sumup intrinics:
+            npyv_sumup_u8
+            npyv_sumup_u16
+        """
+        if self.sfx not in ("u8", "u16"):
+            return
+        rdata = (0, self.nlanes, self._int_min(), self._int_max()-self.nlanes)
+        for r in rdata:
+            data = self._data(r)
+            vdata = self.load(data)
+            data_sum = sum(data)
+            vsum = self.sumup(vdata)
+            assert vsum == data_sum
+
     def test_mask_conditional(self):
         """
         Conditional addition and subtraction for all supported data types.
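The new test deliberately includes a ramp starting at `_int_max() - nlanes`, where every lane is close to the type's maximum; a plain element-width accumulator would wrap there, while the widened `sumup` result stays exact. A worked check of that arithmetic for a 16-lane u8 vector (SSE width), under the assumption that `_data(start)` yields `start, start+1, ..., start+nlanes-1`:

```c
/* Worked check of the worst-case ramp the test feeds to npyv_sumup_u8. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    enum { NLANES = 16 };               /* SSE: 16 u8 lanes */
    uint32_t exact = 0;
    uint8_t  wrapped = 0;               /* what an 8-bit accumulator would do */
    for (int i = 0; i < NLANES; i++) {
        uint8_t lane = (uint8_t)(255 - NLANES + i);   /* 239 .. 254 */
        exact  += lane;
        wrapped = (uint8_t)(wrapped + lane);
    }
    /* exact == 3944 fits the u16 return of npyv_sumup_u8; wrapped == 104. */
    printf("exact=%u wrapped=%u\n", (unsigned)exact, (unsigned)wrapped);
    return 0;
}
```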