author    Qiyu8 <fangchunlin@huawei.com>    2021-01-22 16:30:21 +0800
committer Qiyu8 <fangchunlin@huawei.com>    2021-01-22 16:30:21 +0800
commit    617193440550cd50b7426a2c2008fdb60944c226 (patch)
tree      7d0421ea40c4a8ffdeef2310fab7abbdb9f1e615 /numpy
parent    5a528df2e2c3e8195e75253f75f9d98a64e6bc58 (diff)
download  numpy-617193440550cd50b7426a2c2008fdb60944c226.tar.gz
treat u8/u16 horizontal sums as overflow-protected (npyv_sumup_u8/npyv_sumup_u16).
Diffstat (limited to 'numpy')
-rw-r--r--   numpy/core/src/_simd/_simd.dispatch.c.src       15
-rw-r--r--   numpy/core/src/common/simd/avx2/arithmetic.h     29
-rw-r--r--   numpy/core/src/common/simd/avx512/arithmetic.h   41
-rw-r--r--   numpy/core/src/common/simd/neon/arithmetic.h      8
-rw-r--r--   numpy/core/src/common/simd/sse/arithmetic.h       8
-rw-r--r--   numpy/core/src/common/simd/sse/sse.h              1
-rw-r--r--   numpy/core/src/common/simd/sse/utils.h           19
-rw-r--r--   numpy/core/src/common/simd/vsx/arithmetic.h      21
-rw-r--r--   numpy/core/tests/test_simd.py                    20
9 files changed, 110 insertions, 52 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index a861cd944..49378b518 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -16,7 +16,7 @@
* #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
* #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
* #esfx = u16, s8, u32, s16, u32, s32, u64, s64, f32, f64#
- * #sum_ret = u32, 0, u32, 0, u32, 0, u64, 0, f32, f64#
+ * #sum_ret = u16, 0, u32, 0, u32, 0, u64, 0, f32, f64#
* #expand_sup =1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
* #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
* #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
@@ -24,7 +24,8 @@
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 1, 0, 1, 0, 1, 0, 1, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -366,6 +367,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
SIMD_IMPL_INTRIN_1(sum_@sfx@, @sum_ret@, v@sfx@)
#endif // sum_sup
+#if @sumup_sup@
+SIMD_IMPL_INTRIN_1(sumup_@sfx@, @sum_ret@, v@sfx@)
+#endif // sumup_sup
+
/***************************
* Math
***************************/
@@ -453,7 +458,8 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
- * #sum_sup = 1, 0, 1, 0, 1, 0, 1, 0, 1, 1#
+ * #sumup_sup = 1, 0, 1, 0, 0, 0, 0, 0, 0, 0#
+ * #sum_sup = 0, 0, 0, 0, 1, 0, 1, 0, 1, 1#
* #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
@@ -575,6 +581,9 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(sum_@sfx@)
#endif // sum_sup
+#if @sumup_sup@
+SIMD_INTRIN_DEF(sumup_@sfx@)
+#endif // sumup_sup
/***************************
* Math
***************************/
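The template constants above split the horizontal sums into two families: npyv_sum_### keeps the element width (u32, u64, f32, f64), while the new npyv_sumup_### intrinsics for u8/u16 accumulate into a wider type, which is why #sum_ret maps them to u16/u32. A minimal scalar sketch of the intended semantics, assuming the vector is modeled as a plain array; the helper names below are illustrative and not part of NumPy's API:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the two reduce-sum families touched by this patch.
     * npyv_sum_u32/u64 keep the lane width, so a large input can wrap;
     * npyv_sumup_u8/u16 accumulate into a wider scalar, matching the
     * u16/u32 return types in the dispatch table above. */
    static uint16_t scalar_sumup_u8(const uint8_t *lanes, size_t n)
    {
        uint16_t acc = 0;                       /* wider than the lane type */
        for (size_t i = 0; i < n; ++i)
            acc = (uint16_t)(acc + lanes[i]);
        return acc;
    }

    static uint32_t scalar_sumup_u16(const uint16_t *lanes, size_t n)
    {
        uint32_t acc = 0;
        for (size_t i = 0; i < n; ++i)
            acc += lanes[i];
        return acc;
    }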
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index c2153718d..770d4230c 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -5,6 +5,7 @@
#ifndef _NPY_SIMD_AVX2_ARITHMETIC_H
#define _NPY_SIMD_AVX2_ARITHMETIC_H
+#include "../sse/utils.h"
/***************************
* Addition
***************************/
@@ -119,11 +120,12 @@
// Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m256i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
{
- __m256i half = _mm256_sad_epu8(a, _mm256_setzero_si256());
- __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1));
- return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+ __m256i four = _mm256_sad_epu8(a, _mm256_setzero_si256());
+ __m128i two = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+ __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+ return (npy_uint16)_mm_cvtsi128_si32(one);
}
NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
@@ -135,20 +137,23 @@ NPY_FINLINE npy_uint32 npyv_sum_u32(__m256i a)
return _mm_cvtsi128_si32(s1);
}
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m256i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
{
- npyv_u32x2 res = npyv_expand_u32_u16(a);
- return (unsigned)npyv_sum_u32(_mm256_add_epi32(res.val[0], res.val[1]));
+ const npyv_u16 even_mask = _mm256_set1_epi32(0x0000FFFF);
+ __m256i even = _mm256_and_si256(a, even_mask);
+ __m256i odd = _mm256_srli_epi32(a, 16);
+ __m256i eight = _mm256_add_epi32(even, odd);
+ return npyv_sum_u32(eight);
}
-NPY_FINLINE npy_uint64 npyv_sum_u64(__m256i a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
{
- npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
- _mm_store_si128((__m128i*)idx, _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)));
- return idx[0] + idx[1];
+ __m256i two = _mm256_add_epi64(a, _mm256_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
+ __m128i one = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+ return (npy_uint64)npyv128_cvtsi128_si64(one);
}
-NPY_FINLINE float npyv_sum_f32(__m256 a)
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
__m256 sum_halves = _mm256_hadd_ps(a, a);
sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
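In the AVX2 npyv_sumup_u8 above, _mm256_sad_epu8 against a zero register leaves one 64-bit lane per 8-byte group holding the sum of those bytes; since 8 * 255 = 2040 and the whole 32-byte register sums to at most 8160, the remaining folds can use 16-bit adds and the result fits the u16 return type. A scalar sketch of what one SAD group computes (illustrative, not the intrinsic itself):

    #include <stdint.h>

    /* Per-group model of _mm256_sad_epu8(a, zero): each 64-bit lane receives
     * the sum of absolute differences of 8 byte pairs, which for a zero
     * second operand is simply the sum of the 8 unsigned bytes. */
    static uint64_t sad_group_vs_zero(const uint8_t bytes[8])
    {
        uint64_t acc = 0;
        for (int i = 0; i < 8; ++i)
            acc += bytes[i];        /* |b - 0| == b for unsigned bytes */
        return acc;
    }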
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 69c3caf93..ea7dc0c3c 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -6,7 +6,7 @@
#define _NPY_SIMD_AVX512_ARITHMETIC_H
#include "../avx2/utils.h"
-
+#include "../sse/utils.h"
/***************************
* Addition
***************************/
@@ -145,15 +145,19 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
* intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
***************************/
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
{
- npyv_u16x2 res = npyv_expand_u16_u8(a);
- __m512i a16 = npyv_add_u16(res.val[0], res.val[1]);
- a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(npyv512_lower_si256(a16), npyv512_higher_si256(a16)));
- __m256i a8 = _mm256_add_epi32(npyv512_lower_si256(a16), npyv512_higher_si256(a16));
- __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1));
- a4 = _mm_hadd_epi32(a4, a4);
- return (npy_uint32)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4));
+#ifdef NPY_HAVE_AVX512BW
+ __m512i eight = _mm512_sad_epu8(a, _mm512_setzero_si512());
+ __m256i four = _mm256_add_epi16(npyv512_lower_si256(eight), npyv512_higher_si256(eight));
+#else
+ __m256i lo_four = _mm256_sad_epu8(npyv512_lower_si256(a), _mm256_setzero_si256());
+ __m256i hi_four = _mm256_sad_epu8(npyv512_higher_si256(a), _mm256_setzero_si256());
+ __m256i four = _mm256_add_epi16(lo_four, hi_four);
+#endif
+ __m128i two = _mm_add_epi16(_mm256_castsi256_si128(four), _mm256_extracti128_si256(four, 1));
+ __m128i one = _mm_add_epi16(two, _mm_unpackhi_epi64(two, two));
+ return (npy_uint16)_mm_cvtsi128_si32(one);
}
#ifdef NPY_HAVE_AVX512F_REDUCE
@@ -171,12 +175,12 @@ NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter));
}
- NPY_FINLINE npy_uint64 npyv_sum_u64(__m512i a)
+ NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
{
- npy_uint64 NPY_DECL_ALIGNED(64) idx[2];
- __m256i half = _mm256_add_epi64(npyv512_lower_si256(a), npyv512_higher_si256(a));
- _mm_store_si128((__m128i*)idx, _mm_add_epi64(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)));
- return idx[0] + idx[1];
+ __m256i four = _mm256_add_epi64(npyv512_lower_si256(a), npyv512_higher_si256(a));
+ __m256i two = _mm256_add_epi64(four, _mm256_shuffle_epi32(four, _MM_SHUFFLE(1, 0, 3, 2)));
+ __m128i one = _mm_add_epi64(_mm256_castsi256_si128(two), _mm256_extracti128_si256(two, 1));
+ return (npy_uint64)npyv128_cvtsi128_si64(one);
}
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
@@ -204,10 +208,13 @@ NPY_FINLINE npy_uint32 npyv_sum_u8(__m512i a)
}
#endif
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m512i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
{
- npyv_u32x2 res = npyv_expand_u32_u16(a);
- return (unsigned)npyv_sum_u32(_mm512_add_epi32(res.val[0], res.val[1]));
+ const npyv_u16 even_mask = _mm512_set1_epi32(0x0000FFFF);
+ __m512i even = _mm512_and_si512(a, even_mask);
+ __m512i odd = _mm512_srli_epi32(a, 16);
+ __m512i ff = _mm512_add_epi32(even, odd);
+ return npyv_sum_u32(ff);
}
#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
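Both the AVX2 and AVX512 npyv_sumup_u16 use the same even/odd split: mask the register to keep the low 16-bit lane of every 32-bit word, shift right by 16 to isolate the high lane, then add the two as 32-bit lanes so no partial sum can wrap before the final npyv_sum_u32. A scalar sketch over a uint32 view of the register (illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* Each 32-bit word holds two u16 lanes; 2 * 65535 < 2^32, so neither the
     * per-word partials nor the running total can overflow for one register. */
    static uint32_t sumup_u16_words(const uint32_t *words, size_t nwords)
    {
        uint32_t total = 0;
        for (size_t i = 0; i < nwords; ++i) {
            uint32_t even = words[i] & 0x0000FFFFu;   /* low lane  */
            uint32_t odd  = words[i] >> 16;           /* high lane */
            total += even + odd;    /* stands in for the npyv_sum_u32 reduce */
        }
        return total;
    }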
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 90f62d063..81207ea5e 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -133,22 +133,22 @@
// Horizontal add: Calculates the sum of all vector elements.
#if NPY_SIMD_F64
- #define npyv_sum_u8 vaddvq_u8
- #define npyv_sum_u16 vaddvq_u16
+ #define npyv_sumup_u8 vaddlvq_u8
+ #define npyv_sumup_u16 vaddlvq_u16
#define npyv_sum_u32 vaddvq_u32
#define npyv_sum_u64 vaddvq_u64
#define npyv_sum_f32 vaddvq_f32
#define npyv_sum_f64 vaddvq_f64
#else
- NPY_FINLINE npy_uint32 npyv_sum_u8(npyv_u8 a)
+ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
- NPY_FINLINE npy_uint32 npyv_sum_u16(npyv_u16 a)
+ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
{
uint32x4_t t0 = vpaddlq_u16(a);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
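On AArch64 the NEON path can lean on the widening across-vector adds: vaddlvq_u8 and vaddlvq_u16 reduce the whole register while promoting to u16/u32, which directly delivers the sumup contract; the pairwise vpaddlq ladder above is the AArch32 fallback. A minimal standalone sketch of the AArch64 case, assuming an AArch64 toolchain (the function name and buffer are illustrative):

    #include <arm_neon.h>
    #include <stdint.h>

    /* 16 u8 lanes sum to at most 16 * 255 = 4080, which fits the u16 result
     * that vaddlvq_u8 (add long across vector) produces. */
    uint16_t example_sumup_u8_neon(const uint8_t data[16])
    {
        uint8x16_t v = vld1q_u8(data);   /* load 16 unsigned bytes */
        return vaddlvq_u8(v);            /* widening reduce across all lanes */
    }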
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 7dba6ea8c..968de7545 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -150,26 +150,26 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(__m128i a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
{
__m128i half = _mm_sad_epu8(a, _mm_setzero_si128());
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
-NPY_FINLINE npy_uint32 npyv_sum_u32(__m128i a)
+NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
{
__m128i t = _mm_add_epi32(a, _mm_srli_si128(a, 8));
t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
return (unsigned)_mm_cvtsi128_si32(t);
}
-NPY_FINLINE npy_uint32 npyv_sum_u16(__m128i a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
{
npyv_u32x2 res = npyv_expand_u32_u16(a);
return (unsigned)npyv_sum_u32(_mm_add_epi32(res.val[0], res.val[1]));
}
-NPY_FINLINE npy_uint64 npyv_sum_u64(__m128i a)
+NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
{
npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
npyv_storea_u64(idx, a);
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index dc0b62f73..0bb404312 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -62,6 +62,7 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#define npyv_nlanes_f32 4
#define npyv_nlanes_f64 2
+#include "utils.h"
#include "memory.h"
#include "misc.h"
#include "reorder.h"
diff --git a/numpy/core/src/common/simd/sse/utils.h b/numpy/core/src/common/simd/sse/utils.h
new file mode 100644
index 000000000..fbb969377
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/utils.h
@@ -0,0 +1,19 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_UTILS_H
+#define _NPY_SIMD_SSE_UTILS_H
+
+#if !defined(__x86_64__) && !defined(_M_X64)
+NPY_FINLINE npy_uint64 npyv128_cvtsi128_si64(__m128i a)
+{
+    npy_uint64 NPY_DECL_ALIGNED(32) idx[2];
+    _mm_store_si128((__m128i*)idx, a);
+ return idx[0];
+}
+#else
+ #define npyv128_cvtsi128_si64 _mm_cvtsi128_si64
+#endif
+
+#endif // _NPY_SIMD_SSE_UTILS_H
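The new header exists because _mm_cvtsi128_si64 (move the low 64-bit lane of a __m128i into a scalar) is only available on x86-64; on 32-bit x86 the value has to go through memory instead. A standalone sketch of the same idea, independent of the NumPy wrappers (illustrative helper name):

    #include <emmintrin.h>   /* SSE2 */
    #include <stdint.h>

    /* Read the low 64-bit lane of a 128-bit vector as a scalar. */
    static uint64_t low_lane_u64(__m128i v)
    {
    #if defined(__x86_64__) || defined(_M_X64)
        return (uint64_t)_mm_cvtsi128_si64(v);      /* direct register move */
    #else
        uint64_t lanes[2];
        _mm_storeu_si128((__m128i *)lanes, v);      /* spill, then read lane 0 */
        return lanes[0];
    #endif
    }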
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index fa114389a..30d77c6fe 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -118,23 +118,26 @@
// Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE npy_uint32 npyv_sum_u8(npyv_u8 a)
+NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a)
{
- const npyv_u32 zero4 = npyv_zero_u32();
- npyv_u32 sum4 = vec_sum4s(a, zero4);
- return (npy_uint32)vec_extract(vec_sums(sum4, zero4), 3);
+ const npyv_u32 zero = npyv_zero_u32();
+ npyv_u32 four = vec_sum4s(a, zero);
+    npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero);
+ return (npy_uint16)vec_extract(one, 3);
}
-NPY_FINLINE npy_uint32 npyv_sum_u16(npyv_u16 a)
+NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a)
{
- const npyv_u32 zero4 = npyv_zero_u32();
- const npyv_u32 v4 = vec_mergeh(vec_adds(a, vec_sld(a, a, 8)), zero4);
- return vec_extract(vec_sums(v4, zero4), 3);
+ const npyv_s32 zero = npyv_zero_s32();
+ npyv_u32x2 eight = npyv_expand_u32_u16(a);
+ npyv_u32 four = vec_add(eight.val[0], eight.val[1]);
+ npyv_s32 one = vec_sums((npyv_s32)four, zero);
+ return (npy_uint32)vec_extract(one, 3);
}
NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a)
{
- return vec_extract(vec_add(a, vec_permi(a, a, 3)), 0);
+ return vec_extract(vec_add(a, vec_mergel(a, a)), 0);
}
NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a)
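For the VSX npyv_sumup_u8, vec_sum4s collapses every 4-byte group into one 32-bit partial sum and vec_sums then folds the four partials (with signed saturation) into element 3, which vec_extract reads out. A scalar model of that two-step reduction, assuming a 16-byte register (illustrative names):

    #include <stdint.h>

    /* 16 * 255 = 4080, so the signed saturating add in vec_sums never
     * actually saturates for this input range. */
    static uint16_t model_vsx_sumup_u8(const uint8_t bytes[16])
    {
        int32_t partial[4] = {0, 0, 0, 0};
        for (int i = 0; i < 16; ++i)
            partial[i / 4] += bytes[i];              /* vec_sum4s step */
        int32_t total = partial[0] + partial[1]
                      + partial[2] + partial[3];     /* vec_sums step */
        return (uint16_t)total;                      /* vec_extract(one, 3) */
    }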
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index ae98c47f7..c67e44fa7 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -736,14 +736,12 @@ class _SIMD_ALL(_Test_Utility):
def test_arithmetic_reduce_sum(self):
"""
Test reduce sum intrinsics:
- npyv_sum_u8
- npyv_sum_u16
npyv_sum_u32
npyv_sum_u64
npyv_sum_f32
npyv_sum_f64
"""
- if self.sfx not in ("u8", "u16", "u32", "u64", "f32", "f64"):
+ if self.sfx not in ("u32", "u64", "f32", "f64"):
return
# reduce sum
data = self._data()
@@ -753,6 +751,22 @@ class _SIMD_ALL(_Test_Utility):
vsum = self.sum(vdata)
assert vsum == data_sum
+ def test_arithmetic_reduce_sumup(self):
+ """
+    Test overflow-protected reduce sumup intrinsics:
+ npyv_sumup_u8
+ npyv_sumup_u16
+ """
+ if self.sfx not in ("u8", "u16"):
+ return
+ rdata = (0, self.nlanes, self._int_min(), self._int_max()-self.nlanes)
+ for r in rdata:
+ data = self._data(r)
+ vdata = self.load(data)
+ data_sum = sum(data)
+ vsum = self.sumup(vdata)
+ assert vsum == data_sum
+
def test_mask_conditional(self):
"""
Conditional addition and subtraction for all supported data types.
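The new test_arithmetic_reduce_sumup starts the input at self._int_max() - self.nlanes, which is exactly the region where a same-width accumulator would wrap. A small self-contained demonstration of that worst case, assuming a 16-lane u8 register (SSE/NEON width); the numbers are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t  wrap  = 0;   /* lane-width accumulator, free to wrap  */
        uint16_t exact = 0;   /* widened accumulator, as npyv_sumup_u8 */
        for (uint8_t v = 239; v != 255; ++v) {   /* 255 - 16 ... 254 */
            wrap  = (uint8_t)(wrap + v);
            exact = (uint16_t)(exact + v);
        }
        /* exact == 3944, wrap == 3944 % 256 == 104 */
        printf("lane-width sum: %u, widened sum: %u\n", wrap, exact);
        return 0;
    }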