summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2021-12-13 04:07:21 +0200
committerSayed Adel <seiko@imavr.com>2021-12-13 04:17:57 +0200
commit43d57ff4d61fc9117a8977e71779e6c32662c60f (patch)
treea61753fca3348733aaa29d90ddd67bd6c06e7b8f
parentd2f9cc7aeef868899cc281fe69908cfced3a7619 (diff)
downloadnumpy-43d57ff4d61fc9117a8977e71779e6c32662c60f.tar.gz
SIMD: fix signed zero of SSE2/Floor
-rw-r--r--numpy/core/src/common/simd/sse/math.h49
1 files changed, 37 insertions, 12 deletions
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 117d39fb5..e4b77b671 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_max_f64 _mm_max_pd
// Maximum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
{
__m128 nn = _mm_cmpord_ps(b, b);
@@ -95,7 +95,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
#define npyv_min_f64 _mm_min_pd
// Minimum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
{
__m128 nn = _mm_cmpord_ps(b, b);
@@ -143,6 +143,38 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
}
+// round to nearest integer even
+NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
+{
+#ifdef NPY_HAVE_SSE41
+ return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+ const npyv_f32 szero = _mm_set1_ps(-0.0f);
+ __m128i roundi = _mm_cvtps_epi32(a);
+ __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero));
+ __m128 r = _mm_cvtepi32_ps(roundi);
+ // respect sign of zero
+ r = _mm_or_ps(r, _mm_and_ps(a, szero));
+ return npyv_select_f32(overflow, a, r);
+#endif
+}
+
+// round to nearest integer even
+NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a)
+{
+#ifdef NPY_HAVE_SSE41
+ return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+ const npyv_f64 szero = _mm_set1_pd(-0.0);
+ const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
+ npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+ // round by add magic number 2^52
+ npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+ // respect signed zero, e.g. -0.5 -> -0.0
+ return _mm_or_pd(round, _mm_and_pd(a, szero));
+#endif
+}
+
// ceil
#ifdef NPY_HAVE_SSE41
#define npyv_ceil_f32 _mm_ceil_ps
@@ -209,21 +241,14 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
#else
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
- const npyv_f32 szero = _mm_set1_ps(-0.0f);
const npyv_f32 one = _mm_set1_ps(1.0f);
- npyv_s32 roundi = _mm_cvttps_epi32(a);
- npyv_f32 round = _mm_cvtepi32_ps(roundi);
- npyv_f32 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
- return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, floor);
+ npyv_f32 round = npyv_rint_f32(a);
+ return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one));
}
NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a)
{
- const npyv_f64 szero = _mm_set1_pd(-0.0);
const npyv_f64 one = _mm_set1_pd(1.0);
- const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
- npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
- // round by add magic number 2^52
- npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+ npyv_f64 round = npyv_rint_f64(a);
return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one));
}
#endif // NPY_HAVE_SSE41