diff options
author | Sayed Adel <seiko@imavr.com> | 2022-12-14 21:04:27 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2022-12-15 05:17:00 +0200 |
commit | d02fc70cb6ad43093e1c407172c7193957705b5d (patch) | |
tree | 4200eefc874bf59f65ef71dee45c37faf842c5b5 | |
parent | 5bf0e4413db20069953fe8c941ff118bb685cb46 (diff) | |
download | numpy-d02fc70cb6ad43093e1c407172c7193957705b5d.tar.gz |
ENH, SIMD: Discard non-signaling comparison intrinsics
Providing non-signaling comparison intrinsics that guarantee
no FP invalid exception in case of qNaN sounds great but it
costs unacceptable extra intrinsics on ppc64le (VSX) and x86 (SSE).
Therefore, an integer definition #NPY_SIMD_CMPSIGNAL has been
provided instead to differentiate between SIMD extensions
that only support signaling comparisons.
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 15 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/avx2.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h | 10 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/avx512.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h | 10 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/neon.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/operators.h | 29 | ||||
-rw-r--r-- | numpy/core/src/common/simd/simd.h | 3 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/operators.h | 32 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/sse.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vec/operators.h | 44 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vec/vec.h | 2 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 8 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 38 |
14 files changed, 22 insertions, 173 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 8d2ec6c30..48023af80 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -333,13 +333,6 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) /**end repeat1**/ -#if @fp_only@ -/**begin repeat1 - * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq# - */ -SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) -/**end repeat1**/ -#endif #if @bitw8b_sup@ SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@) @@ -618,14 +611,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ -#if @fp_only@ -/**begin repeat1 - * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq# - */ -SIMD_INTRIN_DEF(@intrin@_@sfx@) -/**end repeat1**/ -#endif - #if @bitw8b_sup@ SIMD_INTRIN_DEF(andc_@sfx@) SIMD_INTRIN_DEF(andc_@bsfx@) diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b..d64f3c6d6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 69e7e897b..86e0038d9 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -218,16 +218,6 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmpgt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GT_OQ)) #define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ)) #define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ)) -// ordered comparison guarantees non-signaling 
-// don't raise FP invalid exception if one of the sources containing qnan. -#define npyv_cmpgeq_f32 npyv_cmpge_f32 -#define npyv_cmpgeq_f64 npyv_cmpge_f64 -#define npyv_cmpgtq_f32 npyv_cmpgt_f32 -#define npyv_cmpgtq_f64 npyv_cmpgt_f64 -#define npyv_cmpleq_f32 npyv_cmple_f32 -#define npyv_cmpleq_f64 npyv_cmple_f64 -#define npyv_cmpltq_f32 npyv_cmplt_f32 -#define npyv_cmpltq_f64 npyv_cmplt_f64 // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443..aa6abe256 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 0ff57847b..c70932d5f 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -331,16 +331,6 @@ #define npyv_cmpgt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GT_OQ) #define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) #define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ) -// ordered non-signaling comparison -// don't raise FP invalid exception if one of the sources containing qnan. 
-#define npyv_cmpgeq_f32 npyv_cmpge_f32 -#define npyv_cmpgeq_f64 npyv_cmpge_f64 -#define npyv_cmpgtq_f32 npyv_cmpgt_f32 -#define npyv_cmpgtq_f64 npyv_cmpgt_f64 -#define npyv_cmpleq_f32 npyv_cmple_f32 -#define npyv_cmpleq_f64 npyv_cmple_f64 -#define npyv_cmpltq_f32 npyv_cmplt_f32 -#define npyv_cmpltq_f64 npyv_cmplt_f64 // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index b08071527..49c35c415 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -16,6 +16,7 @@ #define NPY_SIMD_FMA3 0 // HW emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index a6c479998..249621bd6 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -238,35 +238,6 @@ #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. 
-NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ - return vceqq_f32(vmaxq_f32(a, b), a); -} -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_f32 max = vmaxq_f32(a, b); - npyv_b32 nnan = vceqq_f32(max, max); - return vbicq_u32(nnan, vceqq_f32(max, b)); -} -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#if NPY_SIMD_F64 -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ - return vceqq_f64(vmaxq_f64(a, b), a); -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_f64 max = vmaxq_f64(a, b); - npyv_b64 nnan = vceqq_f64(max, max); - return vbicq_u64(nnan, vceqq_f64(max, b)); -} -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) -#endif - // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { return vceqq_f32(a, a); } diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 92a77ad80..8c9b14251 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_FMA3 0 /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. #define NPY_SIMD_BIGENDIAN 0 + /// 1 if the supported comparison intrinsics(lt, le, gt, ge) + /// raises FP invalid exception for quite NaNs. 
+ #define NPY_SIMD_CMPSIGNAL 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 28aa343bb..59182679e 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -277,38 +277,6 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b)) #define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b)) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ - __m128 nan_mask = _mm_cmpunord_ps(a, b); - __m128 cmp_mask = _mm_cmpgt_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b)); - return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ - __m128d nan_mask = _mm_cmpunord_pd(a, b); - __m128d cmp_mask = _mm_cmpgt_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b)); - return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ - __m128 nan_mask = _mm_cmpunord_ps(a, b); - __m128 cmp_mask = _mm_cmpge_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b)); - return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ - __m128d nan_mask = _mm_cmpunord_pd(a, b); - __m128d cmp_mask = _mm_cmpge_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b)); - return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask)); -} - -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) - // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { 
return _mm_castps_si128(_mm_cmpord_ps(a, a)); } diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda7..0c6b8cdba 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h index fe1a9b10b..50dac20f6 100644 --- a/numpy/core/src/common/simd/vec/operators.h +++ b/numpy/core/src/common/simd/vec/operators.h @@ -266,50 +266,6 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #endif #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. -#if NPY_SIMD_F32 -NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ -#ifdef NPY_HAVE_VSX - return vec_vcmpgefp(a, b); -#else - return vec_cmpge(a, b); -#endif -} -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ -#ifdef NPY_HAVE_VSX - return vec_vcmpgtfp(a, b); -#else - return vec_cmpgt(a, b); -#endif -} -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#endif // NPY_SIMD_F32 - -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ -#ifdef NPY_HAVE_VSX - return vec_cmpeq(vec_max(a, b), a); -#else - return vec_cmpge(a, b); -#endif -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ -#ifdef NPY_HAVE_VSX - npyv_f64 max = vec_max(a, b); - npyv_b64 nnan = vec_cmpeq(max, max); - return vec_andc(max, vec_cmpeq(max, b)); -#else - return vec_cmpgt(a, b); -#endif -} -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) - // check special cases #if NPY_SIMD_F32 NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff 
--git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h index abcd33ce1..1d4508669 100644 --- a/numpy/core/src/common/simd/vec/vec.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -39,8 +39,10 @@ #ifdef NPY_HAVE_VX #define NPY_SIMD_BIGENDIAN 1 + #define NPY_SIMD_CMPSIGNAL 0 #else #define NPY_SIMD_BIGENDIAN 0 + #define NPY_SIMD_CMPSIGNAL 1 #endif typedef __vector unsigned char npyv_u8; diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index e09c283de..9f9978e66 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -124,7 +124,12 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } else { x_in = npyv_loadn_tillz_f32(src, ssrc, len); } - npyv_b32 simd_mask = npyv_cmpleq_f32(npyv_abs_f32(x_in), max_cody); + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + #if NPY_SIMD_CMPSIGNAL + // Eliminate NaN to avoid FP invalid exception + x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask))); + #endif + npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); /* * For elements outside of this range, Cody-Waite's range reduction @@ -132,7 +137,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, * these numbers */ if (simd_maski != 0) { - npyv_b32 nnan_mask = npyv_notnan_f32(x_in); npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 17a5ae0dd..2c16243db 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -579,28 +579,11 @@ class _SIMD_FP(_Test_Utility): intrin(v) assert check_floatstatus(invalid=True) == False - @pytest.mark.parametrize("intrin_name", [ - "cmpltq", "cmpleq", 
"cmpgtq", "cmpgeq" - ]) - def test_binary_invalid_fpexception(self, intrin_name): - intrin = getattr(self, intrin_name) - for d in [float("nan"), float("inf"), -float("inf")]: - a = self.setall(d) - b = self.setall(1.0) - clear_floatstatus() - intrin(a, b) - intrin(b, a) - assert check_floatstatus(invalid=True) == False - @pytest.mark.parametrize('py_comp,np_comp', [ (operator.lt, "cmplt"), (operator.le, "cmple"), (operator.gt, "cmpgt"), (operator.ge, "cmpge"), - (operator.lt, "cmpltq"), - (operator.le, "cmpleq"), - (operator.gt, "cmpgtq"), - (operator.ge, "cmpgeq"), (operator.eq, "cmpeq"), (operator.ne, "cmpneq") ]) @@ -923,21 +906,14 @@ class _SIMD_ALL(_Test_Utility): rev64 = self.rev64(self.load(range(self.nlanes))) assert rev64 == data_rev64 - @pytest.mark.parametrize('func, intrin, sup_sfx', [ - (operator.lt, "cmplt", []), - (operator.le, "cmple", []), - (operator.gt, "cmpgt", []), - (operator.ge, "cmpge", []), - (operator.eq, "cmpeq", []), - (operator.ne, "cmpneq", ("f32", "f64")), - (operator.lt, "cmpltq", ("f32", "f64")), - (operator.le, "cmpleq", ("f32", "f64")), - (operator.gt, "cmpgtq", ("f32", "f64")), - (operator.ge, "cmpgeq", ("f32", "f64")) + @pytest.mark.parametrize('func, intrin', [ + (operator.lt, "cmplt"), + (operator.le, "cmple"), + (operator.gt, "cmpgt"), + (operator.ge, "cmpge"), + (operator.eq, "cmpeq") ]) - def test_operators_comparison(self, func, intrin, sup_sfx): - if sup_sfx and self.sfx not in sup_sfx: - return + def test_operators_comparison(self, func, intrin): if self._is_fp(): data_a = self._data() else: |