diff options
| -rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 4 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx2/math.h | 4 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/math.h | 4 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/avx512/utils.h | 22 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 49 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/sse/math.h | 31 | ||||
| -rw-r--r-- | numpy/core/src/common/simd/vsx/math.h | 4 | ||||
| -rw-r--r-- | numpy/core/tests/test_simd.py | 27 |
8 files changed, 136 insertions, 9 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 54770959c..5c494ae7a 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square# + * #intrin = sqrt, recip, abs, square, ceil# */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square# + * #intrin = sqrt, recip, abs, square, ceil# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index 9460183df..b1f3915a6 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -105,4 +105,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b)); } +// ceil +#define npyv_ceil_f32 _mm256_ceil_ps +#define npyv_ceil_f64 _mm256_ceil_pd + #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index 0949b2b06..c4f8d3410 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -112,4 +112,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_min_u64 _mm512_min_epu64 #define npyv_min_s64 _mm512_min_epi64 +// ceil +#define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF) +#define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF) + #endif // _NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h index 8066283c6..c3079283f 100644 --- a/numpy/core/src/common/simd/avx512/utils.h +++ b/numpy/core/src/common/simd/avx512/utils.h @@ -26,7 +26,7 @@ #define npyv512_combine_ps256(A, B) _mm512_insertf32x8(_mm512_castps256_ps512(A), B, 1) #else #define npyv512_combine_ps256(A, B) \ - _mm512_castsi512_ps(npyv512_combine_si256(_mm512_castps_si512(A), _mm512_castps_si512(B))) + _mm512_castsi512_ps(npyv512_combine_si256(_mm256_castps_si256(A), _mm256_castps_si256(B))) #endif #define NPYV_IMPL_AVX512_FROM_AVX2_1ARG(FN_NAME, INTRIN) \ @@ -39,6 +39,26 @@ return npyv512_combine_si256(l_a, h_a); \ } +#define NPYV_IMPL_AVX512_FROM_AVX2_PS_1ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512 FN_NAME(__m512 a) \ + { \ + __m256 l_a = npyv512_lower_ps256(a); \ + __m256 h_a = npyv512_higher_ps256(a); \ + l_a = INTRIN(l_a); \ + h_a = INTRIN(h_a); \ + return npyv512_combine_ps256(l_a, h_a); \ + } + +#define NPYV_IMPL_AVX512_FROM_AVX2_PD_1ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512d FN_NAME(__m512d a) \ + { \ + __m256d l_a = npyv512_lower_pd256(a); \ + __m256d h_a = npyv512_higher_pd256(a); \ + l_a = INTRIN(l_a); \ + h_a = INTRIN(h_a); \ + return npyv512_combine_pd256(l_a, h_a); \ + } + #define NPYV_IMPL_AVX512_FROM_AVX2_2ARG(FN_NAME, INTRIN) \ NPY_FINLINE __m512i FN_NAME(__m512i a, __m512i b) \ { \ diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19ea6f22f..38c3899e4 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -88,16 +88,16 @@ NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) #define npyv_max_f64 vmaxq_f64 // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #ifdef NPY_HAVE_ASIMD #define npyv_maxp_f32 vmaxnmq_f32 #else NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) - { + { npyv_u32 nn_a = vceqq_f32(a, a); npyv_u32 nn_b = vceqq_f32(b, b); return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); - } + } #endif #if NPY_SIMD_F64 #define npyv_maxp_f64 vmaxnmq_f64 @@ -123,16 +123,16 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 vminq_f64 // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #ifdef NPY_HAVE_ASIMD #define npyv_minp_f32 vminnmq_f32 #else NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) - { + { npyv_u32 nn_a = vceqq_f32(a, a); npyv_u32 nn_b = vceqq_f32(b, b); return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); - } + } #endif #if NPY_SIMD_F64 #define npyv_minp_f64 vminnmq_f64 @@ -153,4 +153,41 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } +// ceil +#ifdef NPY_HAVE_ASIMD + #define npyv_ceil_f32 vrndpq_f32 +#else + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); + const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + /** + * On armv7, vcvtq.f32 handles special cases as follows: + * NaN return 0 + * +inf or +outrange return 0x80000000(-0.0f) + * -inf or -outrange return 0x7fffffff(nan) + */ + npyv_s32 roundi = vcvtq_s32_f32(a); + npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( + vandq_u32(vcltq_f32(round, a), one)) + ); + // respect signed zero, e.g. -0.5 -> -0.0 + npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(ceil), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + // if nan or overflow return a + npyv_u32 nnan = npyv_notnan_f32(a); + npyv_u32 overflow = vorrq_u32( + vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + ); + return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + } +#endif +#if NPY_SIMD_F64 + #define npyv_ceil_f64 vrndpq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 97d35afc5..02eb06a29 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -143,4 +143,35 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// ceil +#ifdef NPY_HAVE_SSE41 + #define npyv_ceil_f32 _mm_ceil_ps + #define npyv_ceil_f64 _mm_ceil_pd +#else + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { + const npyv_f32 szero = _mm_set1_ps(-0.0f); + const npyv_f32 one = _mm_set1_ps(1.0f); + npyv_s32 roundi = _mm_cvttps_epi32(a); + npyv_f32 round = _mm_cvtepi32_ps(roundi); + npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one)); + // respect signed zero, e.g. -0.5 -> -0.0 + npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero)); + // if overflow return a + return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + } + NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) + { + const npyv_f64 szero = _mm_set1_pd(-0.0); + const npyv_f64 one = _mm_set1_pd(1.0); + const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); + npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + // round by add magic number 2^52 + npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one)); + // respect signed zero, e.g. -0.5 -> -0.0 + return _mm_or_pd(ceil, _mm_and_pd(a, szero)); + } +#endif + #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index b2e393c7c..f387dac4d 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -69,4 +69,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_min_u64 vec_min #define npyv_min_s64 vec_min +// ceil +#define npyv_ceil_f32 vec_ceil +#define npyv_ceil_f64 vec_ceil + #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 0270ad901..379fef8af 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -330,6 +330,33 @@ class _SIMD_FP(_Test_Utility): square = self.square(vdata) assert square == data_square + @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil)]) + def test_rounding(self, intrin, func): + """ + Test intrinsics: + npyv_ceil_##SFX + """ + intrin = eval(intrin) + pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan() + # special cases + round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf)) + for case, desired in round_cases: + data_round = [desired]*self.nlanes + _round = intrin(self.setall(case)) + assert _round == pytest.approx(data_round, nan_ok=True) + for x in range(0, 2**20, 256**2): + for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15): + data = [x*w+a for a in range(self.nlanes)] + vdata = self.load(data) + data_round = [func(x) for x in data] + _round = intrin(vdata) + assert _round == data_round + # signed zero + for w in (-0.25, -0.30, -0.45): + _round = self._to_unsigned(intrin(self.setall(w))) + data_round = self._to_unsigned(self.setall(-0.0)) + assert _round == data_round + def test_max(self): """ Test intrinsics: |
