diff options
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 4 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/math.h | 8 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/math.h | 8 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 35 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/math.h | 8 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 30 |
6 files changed, 69 insertions, 24 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 003ef7ffd..fabec069c 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc, floor# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc, floor# + * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index 78608d51b..deaf4ad11 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm256_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -76,7 +76,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm256_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q); @@ -105,6 +105,10 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// round to nearest intger even +#define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32 _mm256_ceil_ps #define npyv_ceil_f64 _mm256_ceil_pd diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index 0d6e17993..5a6cb6dcd 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -51,7 +51,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm512_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -84,7 +84,7 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_f64 _mm512_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q); @@ -112,6 +112,10 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +// round to nearest integer even +#define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT) +#define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT) + // ceil #define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF) #define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF) diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 8c4788b3d..4607d6f27 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,6 +153,33 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_ASIMD + return vrndnq_f32(a); +#else + // ARMv7 NEON only supports fp to int truncate conversion. + // a magic trick of adding 1.5 * 2**23 is used for rounding + // to nearest even and then substract this magic number to get + // the integer. + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 + npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); + npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); + round = vbslq_f32(overflow, round, a); + // signed zero + round = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(round), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + return round; +#endif +} +#if NPY_SIMD_F64 + #define npyv_rint_f64 vrndnq_f64 +#endif // NPY_SIMD_F64 + // ceil #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 @@ -238,13 +265,17 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( vandq_u32(vcgtq_f32(round, a), one) )); - + // respect signed zero + npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(floor), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); npyv_u32 nnan = npyv_notnan_f32(a); npyv_u32 overflow = vorrq_u32( vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) ); - return vbslq_f32(vbicq_u32(nnan, overflow), floor, a); + return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); } #endif // NPY_HAVE_ASIMD #if NPY_SIMD_F64 diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index 94f1233f1..444bc9e54 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -38,7 +38,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 vec_max // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #define npyv_maxp_f32 vec_max #define npyv_maxp_f64 vec_max // Maximum, integer operations @@ -56,7 +56,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_f64 vec_min // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #define npyv_minp_f32 vec_min #define npyv_minp_f64 vec_min // Minimum, integer operations @@ -69,6 +69,10 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +// round to nearest int even +#define npyv_rint_f32 vec_rint +#define npyv_rint_f64 vec_rint + // ceil #define npyv_ceil_f32 vec_ceil #define npyv_ceil_f64 vec_ceil diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 548d89574..605baefe6 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -330,17 +330,18 @@ class _SIMD_FP(_Test_Utility): square = self.square(vdata) assert square == data_square - @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil), - ("self.trunc", math.trunc), ("self.floor", math.floor)]) + @pytest.mark.parametrize("intrin, func", [("ceil", math.ceil), + ("trunc", math.trunc), ("floor", math.floor), ("rint", round)]) def test_rounding(self, intrin, func): """ Test intrinsics: + npyv_rint_##SFX npyv_ceil_##SFX npyv_trunc_##SFX npyv_floor##SFX """ intrin_name = intrin - intrin = eval(intrin) + intrin = getattr(self, intrin) pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() # special cases round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf)) @@ -348,24 +349,25 @@ class _SIMD_FP(_Test_Utility): data_round = [desired]*self.nlanes _round = intrin(self.setall(case)) assert _round == pytest.approx(data_round, nan_ok=True) + for x in range(0, 2**20, 256**2): for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15): - data = [x*w+a for a in range(self.nlanes)] - vdata = self.load(data) + data = self.load([(x+a)*w for a in range(self.nlanes)]) data_round = [func(x) for x in data] - _round = intrin(vdata) + _round = intrin(data) assert _round == data_round + # signed zero - if "ceil" in intrin_name or "trunc" in intrin_name: - for w in (-0.25, -0.30, -0.45): - _round = self._to_unsigned(intrin(self.setall(w))) - data_round = self._to_unsigned(self.setall(-0.0)) - assert _round == data_round - if "floor" in intrin_name: - _round = self._to_unsigned(intrin(self.setall(-0.0))) + if intrin_name == "floor": + data_szero = (-0.0,) + else: + data_szero = (-0.0, -0.25, -0.30, -0.45, -0.5) + + for w in data_szero: + _round = self._to_unsigned(intrin(self.setall(w))) data_round = self._to_unsigned(self.setall(-0.0)) assert _round == data_round - + def test_max(self): """ Test intrinsics: |