summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2021-12-13 04:07:30 +0200
committerSayed Adel <seiko@imavr.com>2021-12-19 16:35:56 +0200
commit72dfac649562b8dc1a15dac5babfb541c9e3dd89 (patch)
tree07debd42b64158cbb8012d67ac9755b87659fc0d
parent43d57ff4d61fc9117a8977e71779e6c32662c60f (diff)
downloadnumpy-72dfac649562b8dc1a15dac5babfb541c9e3dd89.tar.gz
SIMD: add universal intrinsic for round to nearest
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src4
-rw-r--r--numpy/core/src/common/simd/avx2/math.h8
-rw-r--r--numpy/core/src/common/simd/avx512/math.h8
-rw-r--r--numpy/core/src/common/simd/neon/math.h35
-rw-r--r--numpy/core/src/common/simd/vsx/math.h8
-rw-r--r--numpy/core/tests/test_simd.py30
6 files changed, 69 insertions, 24 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 003ef7ffd..fabec069c 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
***************************/
#if @fp_only@
/**begin repeat1
- * #intrin = sqrt, recip, abs, square, ceil, trunc, floor#
+ * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor#
*/
SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
/**end repeat1**/
@@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@)
***************************/
#if @fp_only@
/**begin repeat1
- * #intrin = sqrt, recip, abs, square, ceil, trunc, floor#
+ * #intrin = sqrt, recip, abs, square, rint, ceil, trunc, floor#
*/
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index 78608d51b..deaf4ad11 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -42,7 +42,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_max_f64 _mm256_max_pd
// Maximum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
{
__m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q);
@@ -76,7 +76,7 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
#define npyv_min_f64 _mm256_min_pd
// Minimum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
{
__m256 nn = _mm256_cmp_ps(b, b, _CMP_ORD_Q);
@@ -105,6 +105,10 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b));
}
+// round to nearest intger even
+#define npyv_rint_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_NEAREST_INT)
+#define npyv_rint_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_NEAREST_INT)
+
// ceil
#define npyv_ceil_f32 _mm256_ceil_ps
#define npyv_ceil_f64 _mm256_ceil_pd
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 0d6e17993..5a6cb6dcd 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -51,7 +51,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_max_f64 _mm512_max_pd
// Maximum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
{
__mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q);
@@ -84,7 +84,7 @@ NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b)
#define npyv_min_f64 _mm512_min_pd
// Minimum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
{
__mmask16 nn = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q);
@@ -112,6 +112,10 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
#define npyv_min_u64 _mm512_min_epu64
#define npyv_min_s64 _mm512_min_epi64
+// round to nearest integer even
+#define npyv_rint_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_NEAREST_INT)
+#define npyv_rint_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_NEAREST_INT)
+
// ceil
#define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF)
#define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF)
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 8c4788b3d..4607d6f27 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -153,6 +153,33 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
}
+// round to nearest integer even
+NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
+{
+#ifdef NPY_HAVE_ASIMD
+ return vrndnq_f32(a);
+#else
+ // ARMv7 NEON only supports fp to int truncate conversion.
+ // a magic trick of adding 1.5 * 2**23 is used for rounding
+ // to nearest even and then substract this magic number to get
+ // the integer.
+ const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+ const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
+ npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
+ npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
+ round = vbslq_f32(overflow, round, a);
+ // signed zero
+ round = vreinterpretq_f32_s32(vorrq_s32(
+ vreinterpretq_s32_f32(round),
+ vandq_s32(vreinterpretq_s32_f32(a), szero)
+ ));
+ return round;
+#endif
+}
+#if NPY_SIMD_F64
+ #define npyv_rint_f64 vrndnq_f64
+#endif // NPY_SIMD_F64
+
// ceil
#ifdef NPY_HAVE_ASIMD
#define npyv_ceil_f32 vrndpq_f32
@@ -238,13 +265,17 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
vandq_u32(vcgtq_f32(round, a), one)
));
-
+ // respect signed zero
+ npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
+ vreinterpretq_s32_f32(floor),
+ vandq_s32(vreinterpretq_s32_f32(a), szero)
+ ));
npyv_u32 nnan = npyv_notnan_f32(a);
npyv_u32 overflow = vorrq_u32(
vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
);
- return vbslq_f32(vbicq_u32(nnan, overflow), floor, a);
+ return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
}
#endif // NPY_HAVE_ASIMD
#if NPY_SIMD_F64
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
index 94f1233f1..444bc9e54 100644
--- a/numpy/core/src/common/simd/vsx/math.h
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -38,7 +38,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_max_f64 vec_max
// Maximum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
#define npyv_maxp_f32 vec_max
#define npyv_maxp_f64 vec_max
// Maximum, integer operations
@@ -56,7 +56,7 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_min_f64 vec_min
// Minimum, supports IEEE floating-point arithmetic (IEC 60559),
// - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set.
+// - Only if both corresponded elements are NaN, NaN is set.
#define npyv_minp_f32 vec_min
#define npyv_minp_f64 vec_min
// Minimum, integer operations
@@ -69,6 +69,10 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
#define npyv_min_u64 vec_min
#define npyv_min_s64 vec_min
+// round to nearest int even
+#define npyv_rint_f32 vec_rint
+#define npyv_rint_f64 vec_rint
+
// ceil
#define npyv_ceil_f32 vec_ceil
#define npyv_ceil_f64 vec_ceil
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 548d89574..605baefe6 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -330,17 +330,18 @@ class _SIMD_FP(_Test_Utility):
square = self.square(vdata)
assert square == data_square
- @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil),
- ("self.trunc", math.trunc), ("self.floor", math.floor)])
+ @pytest.mark.parametrize("intrin, func", [("ceil", math.ceil),
+ ("trunc", math.trunc), ("floor", math.floor), ("rint", round)])
def test_rounding(self, intrin, func):
"""
Test intrinsics:
+ npyv_rint_##SFX
npyv_ceil_##SFX
npyv_trunc_##SFX
npyv_floor##SFX
"""
intrin_name = intrin
- intrin = eval(intrin)
+ intrin = getattr(self, intrin)
pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
# special cases
round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf))
@@ -348,24 +349,25 @@ class _SIMD_FP(_Test_Utility):
data_round = [desired]*self.nlanes
_round = intrin(self.setall(case))
assert _round == pytest.approx(data_round, nan_ok=True)
+
for x in range(0, 2**20, 256**2):
for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15):
- data = [x*w+a for a in range(self.nlanes)]
- vdata = self.load(data)
+ data = self.load([(x+a)*w for a in range(self.nlanes)])
data_round = [func(x) for x in data]
- _round = intrin(vdata)
+ _round = intrin(data)
assert _round == data_round
+
# signed zero
- if "ceil" in intrin_name or "trunc" in intrin_name:
- for w in (-0.25, -0.30, -0.45):
- _round = self._to_unsigned(intrin(self.setall(w)))
- data_round = self._to_unsigned(self.setall(-0.0))
- assert _round == data_round
- if "floor" in intrin_name:
- _round = self._to_unsigned(intrin(self.setall(-0.0)))
+ if intrin_name == "floor":
+ data_szero = (-0.0,)
+ else:
+ data_szero = (-0.0, -0.25, -0.30, -0.45, -0.5)
+
+ for w in data_szero:
+ _round = self._to_unsigned(intrin(self.setall(w)))
data_round = self._to_unsigned(self.setall(-0.0))
assert _round == data_round
-
+
def test_max(self):
"""
Test intrinsics: