diff options
author | Sayed Adel <seiko@imavr.com> | 2021-12-13 04:07:21 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2021-12-13 04:17:57 +0200 |
commit | 43d57ff4d61fc9117a8977e71779e6c32662c60f (patch) | |
tree | a61753fca3348733aaa29d90ddd67bd6c06e7b8f | |
parent | d2f9cc7aeef868899cc281fe69908cfced3a7619 (diff) | |
download | numpy-43d57ff4d61fc9117a8977e71779e6c32662c60f.tar.gz |
SIMD: fix signed zero of SSE2/Floor
-rw-r--r-- | numpy/core/src/common/simd/sse/math.h | 49 |
1 files changed, 37 insertions, 12 deletions
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 117d39fb5..e4b77b671 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) #define npyv_max_f64 _mm_max_pd // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -95,7 +95,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 _mm_min_pd // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) { __m128 nn = _mm_cmpord_ps(b, b); @@ -143,6 +143,38 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return npyv_select_s64(npyv_cmplt_s64(a, b), a, b); } +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f32 szero = _mm_set1_ps(-0.0f); + __m128i roundi = _mm_cvtps_epi32(a); + __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + __m128 r = _mm_cvtepi32_ps(roundi); + // respect sign of zero + r = _mm_or_ps(r, _mm_and_ps(a, szero)); + return npyv_select_f32(overflow, a, r); +#endif +} + +// round to nearest integer even +NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) +{ +#ifdef NPY_HAVE_SSE41 + return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); +#else + const npyv_f64 szero = _mm_set1_pd(-0.0); + const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); + npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); + // round by add magic number 2^52 + npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + // respect signed zero, e.g. -0.5 -> -0.0 + return _mm_or_pd(round, _mm_and_pd(a, szero)); +#endif +} + // ceil #ifdef NPY_HAVE_SSE41 #define npyv_ceil_f32 _mm_ceil_ps @@ -209,21 +241,14 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #else NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 round = _mm_cvtepi32_ps(roundi); - npyv_f32 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, floor); + npyv_f32 round = npyv_rint_f32(a); + return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); } NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero)); - // round by add magic number 2^52 - npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52); + npyv_f64 round = npyv_rint_f64(a); return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); } #endif // NPY_HAVE_SSE41 |