summary | refs | log | tree | commit | diff
path: root/numpy/core/src
diff options
context:
space:
mode:
author: Qiyu8 <fangchunlin@huawei.com> 2020-11-03 17:04:35 +0800
committer: Qiyu8 <fangchunlin@huawei.com> 2020-11-03 17:04:35 +0800
commit: 1f0298d62853e5233b0b829b08a11c160f0b6597 (patch)
tree: 7de626d4dda5c183d8d879bbf2b44296c622095f /numpy/core/src
parent: e17cdf56c4d40bcebb0a272af573be94e7286fa0 (diff)
download: numpy-1f0298d62853e5233b0b829b08a11c160f0b6597.tar.gz
improve intrinsics and add sum intrinsic test
Diffstat (limited to 'numpy/core/src')
-rw-r--r-- numpy/core/src/_simd/_simd.dispatch.c.src | 10
-rw-r--r-- numpy/core/src/common/simd/avx2/arithmetic.h | 5
-rw-r--r-- numpy/core/src/common/simd/avx512/arithmetic.h | 4
-rw-r--r-- numpy/core/src/common/simd/neon/arithmetic.h | 18
-rw-r--r-- numpy/core/src/common/simd/sse/arithmetic.h | 5
-rw-r--r-- numpy/core/src/common/simd/vsx/arithmetic.h | 8
6 files changed, 33 insertions, 17 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 2d89b9df0..3d7af2333 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -20,6 +20,7 @@
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -351,6 +352,10 @@ SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
#endif // fused_sup
+#if @sum_sup@
+SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
+#endif // sum_sup
+
#endif // simd_sup
/**end repeat**/
/***************************
@@ -370,6 +375,7 @@ static PyMethodDef simd__intrinsics_methods[] = {
* #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
* #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
* #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
* #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
* #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
@@ -484,6 +490,10 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
#endif // fused_sup
+#if @sum_sup@
+SIMD_INTRIN_DEF(sum_@sfx@)
+#endif // sum_sup
+
#endif // simd_sup
/**end repeat**/
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 8b98394e0..3a6dc9535 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -116,7 +116,6 @@
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
-#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(__m256 a)
@@ -137,3 +136,7 @@ NPY_FINLINE double npyv_sum_f64(__m256d a)
__m128d sum = _mm_add_pd(lo, hi);
return _mm_cvtsd_f64(sum);
}
+
+#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index 989e10f39..7372ca29e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -129,8 +129,6 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_nmulsub_f32 _mm512_fnmsub_ps
#define npyv_nmulsub_f64 _mm512_fnmsub_pd
-#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
-
/***************************
* Reduce Sum
* there are three ways to implement reduce sum for AVX512:
@@ -173,3 +171,5 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
}
#endif
+
+#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index 2d56adfbd..bc14ffb75 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -118,17 +118,17 @@
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{ return vfmsq_f64(vnegq_f64(c), a, b); }
#endif // NPY_SIMD_F64
-#endif // _NPY_SIMD_NEON_ARITHMETIC_H
// Horizontal add: Calculates the sum of all vector elements.
-NPY_FINLINE float npyv_sum_f32(float32x4_t a)
-{
- float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
- return vget_lane_f32(vpadd_f32(r, r), 0);
-}
-#ifdef __aarch64__
- NPY_FINLINE double npyv_sum_f64(float64x2_t a)
+#if NPY_SIMD_F64
+ #define npyv_sum_f32 vaddvq_f32
+ #define npyv_sum_f64 vaddvq_f64
+#else
+ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
- return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0);
+ float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+ return vget_lane_f32(vpadd_f32(r, r), 0);
}
#endif
+
+#endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index cf33349fc..8440cc52e 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -147,7 +147,6 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
}
#endif // !NPY_HAVE_FMA3
-#endif // _NPY_SIMD_SSE_ARITHMETIC_H
// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(__m128 a)
@@ -172,3 +171,7 @@ NPY_FINLINE double npyv_sum_f64(__m128d a)
return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
#endif
}
+
+#endif // _NPY_SIMD_SSE_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index 755aa1ca3..2f6762e63 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -116,16 +116,16 @@
#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
#define npyv_nmulsub_f64 vec_nmadd
-#endif // _NPY_SIMD_VSX_ARITHMETIC_H
-
// Horizontal add: Calculates the sum of all vector elements.
NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
{
- return vec_extract(a, 0) + vec_extract(a, 1) +
- vec_extract(a, 2) + vec_extract(a, 3);
+ npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
+ return vec_extract(sum, 0) + vec_extract(sum, 1);
}
NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
{
return vec_extract(a, 0) + vec_extract(a, 1);
}
+
+#endif // _NPY_SIMD_VSX_ARITHMETIC_H