SIMD: fix signed zero of SSE2/Floor

author: Sayed Adel <seiko@imavr.com> 2021-12-13 04:07:21 +0200
committer: Sayed Adel <seiko@imavr.com> 2021-12-13 04:17:57 +0200
commit: 43d57ff4d61fc9117a8977e71779e6c32662c60f (patch)
tree: a61753fca3348733aaa29d90ddd67bd6c06e7b8f
parent: d2f9cc7aeef868899cc281fe69908cfced3a7619 (diff)
download: numpy-43d57ff4d61fc9117a8977e71779e6c32662c60f.tar.gz
1 files changed, 37 insertions, 12 deletions
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 117d39fb5..e4b77b671 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 #define npyv_max_f64 _mm_max_pd
 // Maximum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set. 
+// - Only if both corresponded elements are NaN, NaN is set.
 NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
 {
     __m128 nn  = _mm_cmpord_ps(b, b);
@@ -95,7 +95,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
 #define npyv_min_f64 _mm_min_pd
 // Minimum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set. 
+// - Only if both corresponded elements are NaN, NaN is set.
 NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
 {
     __m128 nn  = _mm_cmpord_ps(b, b);
@@ -143,6 +143,38 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// round to nearest integer even
+NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
+{
+#ifdef NPY_HAVE_SSE41
+    return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+    const npyv_f32 szero = _mm_set1_ps(-0.0f);
+    __m128i roundi = _mm_cvtps_epi32(a);
+    __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+    __m128 r = _mm_cvtepi32_ps(roundi);
+    // respect sign of zero
+    r = _mm_or_ps(r, _mm_and_ps(a, szero));
+    return npyv_select_f32(overflow, a, r);
+#endif
+}
+
+// round to nearest integer even
+NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
+{
+#ifdef NPY_HAVE_SSE41
+    return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+    const npyv_f64 szero = _mm_set1_pd(-0.0);
+    const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
+    npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+    // round by add magic number 2^52
+    npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+    // respect signed zero, e.g. -0.5 -> -0.0
+    return _mm_or_pd(round, _mm_and_pd(a, szero));
+#endif
+}
+
 // ceil
 #ifdef NPY_HAVE_SSE41
     #define npyv_ceil_f32 _mm_ceil_ps
@@ -209,21 +241,14 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
 #else
     NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
     {
-        const npyv_f32 szero = _mm_set1_ps(-0.0f);
         const npyv_f32 one = _mm_set1_ps(1.0f);
-        npyv_s32 roundi = _mm_cvttps_epi32(a);
-        npyv_f32 round = _mm_cvtepi32_ps(roundi);
-        npyv_f32 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
-        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, floor);
+        npyv_f32 round = npyv_rint_f32(a);
+        return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
     }
     NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
     {
-        const npyv_f64 szero = _mm_set1_pd(-0.0);
         const npyv_f64 one = _mm_set1_pd(1.0);
-        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
-        npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
-        // round by add magic number 2^52
-        npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+        npyv_f64 round = npyv_rint_f64(a);
         return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
     }
 #endif // NPY_HAVE_SSE41
author	Sayed Adel <seiko@imavr.com>	2021-12-13 04:07:21 +0200
committer	Sayed Adel <seiko@imavr.com>	2021-12-13 04:17:57 +0200
commit	43d57ff4d61fc9117a8977e71779e6c32662c60f (patch)
tree	a61753fca3348733aaa29d90ddd67bd6c06e7b8f
parent	d2f9cc7aeef868899cc281fe69908cfced3a7619 (diff)
download	numpy-43d57ff4d61fc9117a8977e71779e6c32662c60f.tar.gz