author    | Sayed Adel <seiko@imavr.com> | 2022-12-14 09:03:34 +0200
committer | Sayed Adel <seiko@imavr.com> | 2022-12-14 20:23:26 +0200
commit    | 5bf0e4413db20069953fe8c941ff118bb685cb46 (patch)
tree      | 6c1939ee394a44db9ff347ea2ad91175d6be2913
parent    | 0ef7491803804452b26dd3459d4b58e998532e1c (diff)
download  | numpy-5bf0e4413db20069953fe8c941ff118bb685cb46.tar.gz
BUG, SIMD: Fix invalid value encountered in rint/trunc/ceil/floor on armhf/neon
-rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 149
-rw-r--r-- | numpy/core/tests/test_simd.py          | 10

2 files changed, 91 insertions, 68 deletions
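The patch below replaces the truncating float-to-int conversion path with the classic magic-number trick for round-to-nearest-even: add and subtract 1.5 * 2^23. A minimal scalar sketch of the idea (plain C, not the patch's NEON intrinsics; it assumes the default round-to-nearest-even FP mode):

```c
#include <stdio.h>

/* A minimal sketch of the magic-number rounding used in the patch below
 * (plain C, not the patch's NEON code). Adding 1.5 * 2^23 pushes all
 * fractional bits out of the float's mantissa, so the hardware's default
 * round-to-nearest-even mode rounds them off; subtracting the constant
 * back leaves the rounded value. Valid only for |x| < 2^23; larger values
 * and NaN must bypass it, which is what the patch's nnan_mask and
 * two_power_23 selects do. Unlike the patch, this sketch does not
 * preserve the sign of -0.0f. */
static float rint_magic(float x)
{
    const float magic = 12582912.0f;   /* 1.5 * 2^23 */
    float ax = x < 0.0f ? -x : x;      /* round |x|, restore sign below */
    float r  = (ax + magic) - magic;
    return x < 0.0f ? -r : r;
}

int main(void)
{
    /* ties round to nearest even: 0.5 -> 0.0, 1.5 -> 2.0, -2.5 -> -2.0 */
    printf("%.1f %.1f %.1f\n",
           rint_magic(0.5f), rint_magic(1.5f), rint_magic(-2.5f));
    return 0;
}
```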
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index c0a771b5d..01344a41f 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
     return vrndnq_f32(a);
 #else
     // ARMv7 NEON only supports fp to int truncate conversion.
-    // a magic trick of adding 1.5 * 2**23 is used for rounding
+    // a magic trick of adding 1.5 * 2^23 is used for rounding
     // to nearest even and then subtract this magic number to get
     // the integer.
-    const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
-    const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23
-    npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic);
-    npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000)));
-    round = vbslq_f32(overflow, round, a);
-    // signed zero
-    round = vreinterpretq_f32_s32(vorrq_s32(
-        vreinterpretq_s32_f32(round),
-        vandq_s32(vreinterpretq_s32_f32(a), szero)
-    ));
-    return round;
+    //
+    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+    npyv_u32 nnan_mask = vceqq_f32(a, a);
+    // eliminate nans to avoid invalid fp errors
+    npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
+    // round by add magic number 1.5 * 2^23
+    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+    // copysign
+    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask ));
+    // a if |a| >= 2^23 or a == NaN
+    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+    mask = vandq_u32(mask, nnan_mask);
+    return vbslq_f32(mask, round, a);
 #endif
 }
 #if NPY_SIMD_F64
@@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_ceil_f32 vrndpq_f32
 #else
-    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
-    {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+    {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
-        /**
-         * On armv7, vcvtq.f32 handles special cases as follows:
-         *  NaN return 0
-         * +inf or +outrange return 0x80000000(-0.0f)
-         * -inf or -outrange return 0x7fffffff(nan)
-         */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate nans to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by add magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcltq_f32(round, a), one))
-        );
-        // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(ceil),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+            vandq_u32(vcltq_f32(round, x), one))
         );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+        mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, ceil, a);
     }
 #endif
 #if NPY_SIMD_F64
@@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_trunc_f32 vrndq_f32
 #else
-    NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
-    {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+    {
         const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
+        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(
+            vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
+
+        npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
+                 nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
+                 nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
+        // elminate nans/inf to avoid invalid fp errors
+        npyv_f32 x = vreinterpretq_f32_u32(
+            veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
         /**
          * On armv7, vcvtq.f32 handles special cases as follows:
          *  NaN return 0
         * +inf or +outrange return 0x80000000(-0.0f)
         * -inf or -outrange return 0x7fffffff(nan)
         */
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_s32 trunci = vcvtq_s32_f32(x);
+        npyv_f32 trunc = vcvtq_f32_s32(trunci);
         // respect signed zero, e.g. -0.5 -> -0.0
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(round),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
-        ));
-        // if nan or overflow return a
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+        trunc = vreinterpretq_f32_u32(
+            vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
+        // if overflow return a
+        npyv_u32 overflow_mask = vorrq_u32(
+            vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int)
         );
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // a if a overflow or nonfinite
+        return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
     }
 #endif
 #if NPY_SIMD_F64
@@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
 #ifdef NPY_HAVE_ASIMD
     #define npyv_floor_f32 vrndmq_f32
 #else
-    NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
-    {
-        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+    NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
+    {
         const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
-        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
+        const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23
+        const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23
 
-        npyv_s32 roundi = vcvtq_s32_f32(a);
-        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_u32 nnan_mask = vceqq_f32(a, a);
+        npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
+        // eliminate nans to avoid invalid fp errors
+        npyv_f32 abs_x = vabsq_f32(x);
+        // round by add magic number 1.5 * 2^23
+        npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
+        // copysign
+        round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
         npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(
-            vandq_u32(vcgtq_f32(round, a), one)
-        ));
-        // respect signed zero
-        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
-            vreinterpretq_s32_f32(floor),
-            vandq_s32(vreinterpretq_s32_f32(a), szero)
+            vandq_u32(vcgtq_f32(round, x), one)
         ));
-        npyv_u32 nnan = npyv_notnan_f32(a);
-        npyv_u32 overflow = vorrq_u32(
-            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
-        );
-
-        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+        // respects signed zero
+        floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
+        // a if |a| >= 2^23 or a == NaN
+        npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
+        mask = vandq_u32(mask, nnan_mask);
+        return vbslq_f32(mask, floor, a);
     }
 #endif // NPY_HAVE_ASIMD
 #if NPY_SIMD_F64
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 0f48aec55..17a5ae0dd 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -568,6 +568,16 @@ class _SIMD_FP(_Test_Utility):
         nnan = self.notnan(self.setall(self._nan()))
         assert nnan == [0]*self.nlanes
 
+    @pytest.mark.parametrize("intrin_name", [
+        "rint", "trunc", "ceil", "floor"
+    ])
+    def test_unary_invalid_fpexception(self, intrin_name):
+        intrin = getattr(self, intrin_name)
+        for d in [float("nan"), float("inf"), -float("inf")]:
+            v = self.setall(d)
+            clear_floatstatus()
+            intrin(v)
+            assert check_floatstatus(invalid=True) == False
     @pytest.mark.parametrize("intrin_name", [
         "cmpltq", "cmpleq", "cmpgtq", "cmpgeq"
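In npyv_trunc_f32 the non-finite lanes are detected from the IEEE-754 bit pattern itself: shifting the word left by one drops the sign bit and leaves the eight exponent bits at the top, where an all-ones exponent means inf or NaN. A scalar sketch of the same test (plain C with a hypothetical helper name, not NumPy API):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar version of the nfinite_mask computation in npyv_trunc_f32:
 * shift the float's bit pattern left by 1 to discard the sign bit,
 * then compare the exponent field against all-ones (inf or NaN). */
static int is_nonfinite(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);  /* scalar vreinterpretq_u32_f32 */
    return ((bits << 1) & 0xff000000u) == 0xff000000u;
}

int main(void)
{
    /* prints 0 1 1 1 */
    printf("%d %d %d %d\n",
           is_nonfinite(1.0f), is_nonfinite(NAN),
           is_nonfinite(INFINITY), is_nonfinite(-INFINITY));
    return 0;
}
```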
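The new test asserts that feeding NaN and ±inf through rint/trunc/ceil/floor no longer raises the invalid FP exception. An analogous check in C goes through <fenv.h>; this sketch only mirrors the shape of the test's clear_floatstatus()/check_floatstatus(invalid=True) loop, with scalar rintf standing in for the SIMD kernels:

```c
#include <fenv.h>
#include <math.h>
#include <stdio.h>

/* Not universally honored (e.g. GCC warns and ignores it), but the
 * standard way to request reliable FP-status access. */
#pragma STDC FENV_ACCESS ON

int main(void)
{
    const float inputs[] = { NAN, INFINITY, -INFINITY };
    for (int i = 0; i < 3; i++) {
        feclearexcept(FE_INVALID);            /* ~ clear_floatstatus() */
        volatile float r = rintf(inputs[i]);  /* stand-in for the SIMD rint */
        (void)r;
        /* ~ assert check_floatstatus(invalid=True) == False */
        printf("input %d: FE_INVALID %s\n", i,
               fetestexcept(FE_INVALID) ? "raised" : "clean");
    }
    return 0;
}
```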