diff options
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 4 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/math.h | 4 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/math.h | 4 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 28 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/math.h | 26 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vsx/math.h | 4 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 7 |
7 files changed, 74 insertions, 3 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 84de9a059..003ef7ffd 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, ceil, trunc, floor# */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ @@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@) ***************************/ #if @fp_only@ /**begin repeat1 - * #intrin = sqrt, recip, abs, square, ceil, trunc# + * #intrin = sqrt, recip, abs, square, ceil, trunc, floor# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h index ec15e50e1..78608d51b 100644 --- a/numpy/core/src/common/simd/avx2/math.h +++ b/numpy/core/src/common/simd/avx2/math.h @@ -113,4 +113,8 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32 _mm256_floor_ps +#define npyv_floor_f64 _mm256_floor_pd + #endif // _NPY_SIMD_AVX2_MATH_H diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index f30e50ad0..0d6e17993 100644 --- a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -120,4 +120,8 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) #define npyv_trunc_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_ZERO) #define npyv_trunc_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_ZERO) +// floor +#define npyv_floor_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEG_INF) +#define npyv_floor_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEG_INF) + #endif // 
_NPY_SIMD_AVX512_MATH_H diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 19e5cd846..8c4788b3d 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -223,4 +223,32 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #define npyv_trunc_f64 vrndq_f64 #endif // NPY_SIMD_F64 +// floor +#ifdef NPY_HAVE_ASIMD + #define npyv_floor_f32 vrndmq_f32 +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); + const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + + npyv_s32 roundi = vcvtq_s32_f32(a); + npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( + vandq_u32(vcgtq_f32(round, a), one) + )); + + npyv_u32 nnan = npyv_notnan_f32(a); + npyv_u32 overflow = vorrq_u32( + vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + ); + + return vbslq_f32(vbicq_u32(nnan, overflow), floor, a); + } +#endif // NPY_HAVE_ASIMD +#if NPY_SIMD_F64 + #define npyv_floor_f64 vrndmq_f64 +#endif // NPY_SIMD_F64 + #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 5daf7711e..117d39fb5 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -202,4 +202,30 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) } #endif +// floor +#ifdef NPY_HAVE_SSE41 + #define npyv_floor_f32 _mm_floor_ps + #define npyv_floor_f64 _mm_floor_pd +#else + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { + const npyv_f32 szero = _mm_set1_ps(-0.0f); + const npyv_f32 one = _mm_set1_ps(1.0f); + npyv_s32 roundi = _mm_cvttps_epi32(a); + npyv_f32 round = _mm_cvtepi32_ps(roundi); + npyv_f32 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + return npyv_select_f32(_mm_cmpeq_epi32(roundi, 
_mm_castps_si128(szero)), a, floor); + } + NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) + { + const npyv_f64 szero = _mm_set1_pd(-0.0); + const npyv_f64 one = _mm_set1_pd(1.0); + const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); + npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + // round by adding the magic number 2^52 + npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + } +#endif // NPY_HAVE_SSE41 + #endif // _NPY_SIMD_SSE_MATH_H diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h index d138cae8a..94f1233f1 100644 --- a/numpy/core/src/common/simd/vsx/math.h +++ b/numpy/core/src/common/simd/vsx/math.h @@ -77,4 +77,8 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_trunc_f32 vec_trunc #define npyv_trunc_f64 vec_trunc +// floor +#define npyv_floor_f32 vec_floor +#define npyv_floor_f64 vec_floor + #endif // _NPY_SIMD_VSX_MATH_H diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 12a67c44d..548d89574 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -331,12 +331,13 @@ class _SIMD_FP(_Test_Utility): assert square == data_square @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil), - ("self.trunc", math.trunc)]) + ("self.trunc", math.trunc), ("self.floor", math.floor)]) def test_rounding(self, intrin, func): """ Test intrinsics: npyv_ceil_##SFX npyv_trunc_##SFX + npyv_floor_##SFX """ intrin_name = intrin intrin = eval(intrin) @@ -360,6 +361,10 @@ class _SIMD_FP(_Test_Utility): _round = self._to_unsigned(intrin(self.setall(w))) data_round = self._to_unsigned(self.setall(-0.0)) assert _round == data_round + if "floor" in intrin_name: + _round = self._to_unsigned(intrin(self.setall(-0.0))) + data_round = self._to_unsigned(self.setall(-0.0)) + assert _round == data_round def test_max(self): """ |