diff options
| author | Sayed Adel <seiko@imavr.com> | 2021-11-01 18:49:15 +0200 |
|---|---|---|
| committer | Sayed Adel <seiko@imavr.com> | 2021-11-01 18:49:15 +0200 |
| commit | 521b3afc15b7a8c181ebb7c043f3744555c1e9c9 (patch) | |
| tree | 16282e66ec1d2f46284886ff54100a51c1effc5c /numpy | |
| parent | 6ccad06d45c682f5fd199cf61a175575f431cbac (diff) | |
| download | numpy-521b3afc15b7a8c181ebb7c043f3744555c1e9c9.tar.gz | |
SIMD: Fix impl of intrinsic `npyv_ceil_f32` on armv7/neon
Diffstat (limited to 'numpy')
| -rw-r--r-- | numpy/core/src/common/simd/neon/math.h | 41 |
1 files changed, 31 insertions, 10 deletions
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 8d370e624..38c3899e4 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -88,16 +88,16 @@ NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) #define npyv_max_f64 vmaxq_f64 // Maximum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #ifdef NPY_HAVE_ASIMD #define npyv_maxp_f32 vmaxnmq_f32 #else NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) - { + { npyv_u32 nn_a = vceqq_f32(a, a); npyv_u32 nn_b = vceqq_f32(b, b); return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); - } + } #endif #if NPY_SIMD_F64 #define npyv_maxp_f64 vmaxnmq_f64 @@ -123,16 +123,16 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) #define npyv_min_f64 vminq_f64 // Minimum, supports IEEE floating-point arithmetic (IEC 60559), // - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. +// - Only if both corresponded elements are NaN, NaN is set. #ifdef NPY_HAVE_ASIMD #define npyv_minp_f32 vminnmq_f32 #else NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) - { + { npyv_u32 nn_a = vceqq_f32(a, a); npyv_u32 nn_b = vceqq_f32(b, b); return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a)); - } + } #endif #if NPY_SIMD_F64 #define npyv_minp_f64 vminnmq_f64 @@ -159,10 +159,31 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) #else NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) { - npyv_f32 conv_trunc = vcvtq_f32_s32(vcvtq_s32_f32(a)); - npyv_f32 conv_trunc_add_one = npyv_add_f32(conv_trunc, vdupq_n_f32(1.0f)); - npyv_u32 mask = vcltq_f32(conv_trunc, a); - return vbslq_f32(mask, conv_trunc, conv_trunc_add_one); + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); + const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + /** + * On armv7, vcvtq.f32 handles special cases as follows: + * NaN return 0 + * +inf or +outrange return 0x80000000(-0.0f) + * -inf or -outrange return 0x7fffffff(nan) + */ + npyv_s32 roundi = vcvtq_s32_f32(a); + npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( + vandq_u32(vcltq_f32(round, a), one)) + ); + // respect signed zero, e.g. -0.5 -> -0.0 + npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( + vreinterpretq_s32_f32(ceil), + vandq_s32(vreinterpretq_s32_f32(a), szero) + )); + // if nan or overflow return a + npyv_u32 nnan = npyv_notnan_f32(a); + npyv_u32 overflow = vorrq_u32( + vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + ); + return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); } #endif #if NPY_SIMD_F64 |
