diff options
author | Sayed Adel <seiko@imavr.com> | 2022-12-14 21:04:27 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2022-12-15 05:17:00 +0200 |
commit | d02fc70cb6ad43093e1c407172c7193957705b5d (patch) | |
tree | 4200eefc874bf59f65ef71dee45c37faf842c5b5 | |
parent | 5bf0e4413db20069953fe8c941ff118bb685cb46 (diff) | |
download | numpy-d02fc70cb6ad43093e1c407172c7193957705b5d.tar.gz |
ENH, SIMD: Discard non-signaling comparison intrinsics
Providing non-signaling comparison intrinsics that guarantee
no FP invalid exception in case of qNaN sounds great but it
costs unacceptable extra intrinsics on ppc64le (VSX) and x86 (SSE).
Therefore, an integer definition #NPY_SIMD_CMPSIGNAL has been
provided instead to differentiate between SIMD extensions
that only support signaling comparisons.
-rw-r--r-- | numpy/core/src/_simd/_simd.dispatch.c.src | 15 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/avx2.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx2/operators.h | 10 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/avx512.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/avx512/operators.h | 10 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/neon.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/neon/operators.h | 29 | ||||
-rw-r--r-- | numpy/core/src/common/simd/simd.h | 3 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/operators.h | 32 | ||||
-rw-r--r-- | numpy/core/src/common/simd/sse/sse.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vec/operators.h | 44 | ||||
-rw-r--r-- | numpy/core/src/common/simd/vec/vec.h | 2 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 8 | ||||
-rw-r--r-- | numpy/core/tests/test_simd.py | 38 |
14 files changed, 22 insertions, 173 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 8d2ec6c30..48023af80 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -333,13 +333,6 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@) */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) /**end repeat1**/ -#if @fp_only@ -/**begin repeat1 - * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq# - */ -SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@) -/**end repeat1**/ -#endif #if @bitw8b_sup@ SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@) @@ -618,14 +611,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@) SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ -#if @fp_only@ -/**begin repeat1 - * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq# - */ -SIMD_INTRIN_DEF(@intrin@_@sfx@) -/**end repeat1**/ -#endif - #if @bitw8b_sup@ SIMD_INTRIN_DEF(andc_@sfx@) SIMD_INTRIN_DEF(andc_@bsfx@) diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b..d64f3c6d6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index 69e7e897b..86e0038d9 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -218,16 +218,6 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmpgt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GT_OQ)) #define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ)) #define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ)) -// ordered comparison guarantees non-signaling 
-// don't raise FP invalid exception if one of the sources containing qnan. -#define npyv_cmpgeq_f32 npyv_cmpge_f32 -#define npyv_cmpgeq_f64 npyv_cmpge_f64 -#define npyv_cmpgtq_f32 npyv_cmpgt_f32 -#define npyv_cmpgtq_f64 npyv_cmpgt_f64 -#define npyv_cmpleq_f32 npyv_cmple_f32 -#define npyv_cmpleq_f64 npyv_cmple_f64 -#define npyv_cmpltq_f32 npyv_cmplt_f32 -#define npyv_cmpltq_f64 npyv_cmplt_f64 // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443..aa6abe256 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h index 0ff57847b..c70932d5f 100644 --- a/numpy/core/src/common/simd/avx512/operators.h +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -331,16 +331,6 @@ #define npyv_cmpgt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GT_OQ) #define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) #define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ) -// ordered non-signaling comparison -// don't raise FP invalid exception if one of the sources containing qnan. 
-#define npyv_cmpgeq_f32 npyv_cmpge_f32 -#define npyv_cmpgeq_f64 npyv_cmpge_f64 -#define npyv_cmpgtq_f32 npyv_cmpgt_f32 -#define npyv_cmpgtq_f64 npyv_cmpgt_f64 -#define npyv_cmpleq_f32 npyv_cmple_f32 -#define npyv_cmpleq_f64 npyv_cmple_f64 -#define npyv_cmpltq_f32 npyv_cmplt_f32 -#define npyv_cmpltq_f64 npyv_cmplt_f64 // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h index b08071527..49c35c415 100644 --- a/numpy/core/src/common/simd/neon/neon.h +++ b/numpy/core/src/common/simd/neon/neon.h @@ -16,6 +16,7 @@ #define NPY_SIMD_FMA3 0 // HW emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef uint8x16_t npyv_u8; typedef int8x16_t npyv_s8; diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h index a6c479998..249621bd6 100644 --- a/numpy/core/src/common/simd/neon/operators.h +++ b/numpy/core/src/common/simd/neon/operators.h @@ -238,35 +238,6 @@ #define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. 
-NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ - return vceqq_f32(vmaxq_f32(a, b), a); -} -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ - npyv_f32 max = vmaxq_f32(a, b); - npyv_b32 nnan = vceqq_f32(max, max); - return vbicq_u32(nnan, vceqq_f32(max, b)); -} -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#if NPY_SIMD_F64 -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ - return vceqq_f64(vmaxq_f64(a, b), a); -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ - npyv_f64 max = vmaxq_f64(a, b); - npyv_b64 nnan = vceqq_f64(max, max); - return vbicq_u64(nnan, vceqq_f64(max, b)); -} -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) -#endif - // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { return vceqq_f32(a, a); } diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 92a77ad80..8c9b14251 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_FMA3 0 /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0. #define NPY_SIMD_BIGENDIAN 0 + /// 1 if the supported comparison intrinsics(lt, le, gt, ge) + /// raises FP invalid exception for quite NaNs. 
+ #define NPY_SIMD_CMPSIGNAL 0 #endif // enable emulated mask operations for all SIMD extension except for AVX512 diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h index 28aa343bb..59182679e 100644 --- a/numpy/core/src/common/simd/sse/operators.h +++ b/numpy/core/src/common/simd/sse/operators.h @@ -277,38 +277,6 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) #define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b)) #define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b)) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ - __m128 nan_mask = _mm_cmpunord_ps(a, b); - __m128 cmp_mask = _mm_cmpgt_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b)); - return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ - __m128d nan_mask = _mm_cmpunord_pd(a, b); - __m128d cmp_mask = _mm_cmpgt_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b)); - return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ - __m128 nan_mask = _mm_cmpunord_ps(a, b); - __m128 cmp_mask = _mm_cmpge_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b)); - return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask)); -} -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ - __m128d nan_mask = _mm_cmpunord_pd(a, b); - __m128d cmp_mask = _mm_cmpge_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b)); - return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask)); -} - -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) - // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { 
return _mm_castps_si128(_mm_cmpord_ps(a, a)); } diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda7..0c6b8cdba 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h index fe1a9b10b..50dac20f6 100644 --- a/numpy/core/src/common/simd/vec/operators.h +++ b/numpy/core/src/common/simd/vec/operators.h @@ -266,50 +266,6 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) #endif #define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) -// ordered comparison guarantees non-signaling -// don't raise FP invalid exception if one of the sources containing qnan. -#if NPY_SIMD_F32 -NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b) -{ -#ifdef NPY_HAVE_VSX - return vec_vcmpgefp(a, b); -#else - return vec_cmpge(a, b); -#endif -} -NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b) -{ -#ifdef NPY_HAVE_VSX - return vec_vcmpgtfp(a, b); -#else - return vec_cmpgt(a, b); -#endif -} -#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A) -#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A) -#endif // NPY_SIMD_F32 - -NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b) -{ -#ifdef NPY_HAVE_VSX - return vec_cmpeq(vec_max(a, b), a); -#else - return vec_cmpge(a, b); -#endif -} -NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b) -{ -#ifdef NPY_HAVE_VSX - npyv_f64 max = vec_max(a, b); - npyv_b64 nnan = vec_cmpeq(max, max); - return vec_andc(max, vec_cmpeq(max, b)); -#else - return vec_cmpgt(a, b); -#endif -} -#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A) -#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A) - // check special cases #if NPY_SIMD_F32 NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) diff 
--git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h index abcd33ce1..1d4508669 100644 --- a/numpy/core/src/common/simd/vec/vec.h +++ b/numpy/core/src/common/simd/vec/vec.h @@ -39,8 +39,10 @@ #ifdef NPY_HAVE_VX #define NPY_SIMD_BIGENDIAN 1 + #define NPY_SIMD_CMPSIGNAL 0 #else #define NPY_SIMD_BIGENDIAN 0 + #define NPY_SIMD_CMPSIGNAL 1 #endif typedef __vector unsigned char npyv_u8; diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index e09c283de..9f9978e66 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -124,7 +124,12 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } else { x_in = npyv_loadn_tillz_f32(src, ssrc, len); } - npyv_b32 simd_mask = npyv_cmpleq_f32(npyv_abs_f32(x_in), max_cody); + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + #if NPY_SIMD_CMPSIGNAL + // Eliminate NaN to avoid FP invalid exception + x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask))); + #endif + npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); /* * For elements outside of this range, Cody-Waite's range reduction @@ -132,7 +137,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, * these numbers */ if (simd_maski != 0) { - npyv_b32 nnan_mask = npyv_notnan_f32(x_in); npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index 17a5ae0dd..2c16243db 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -579,28 +579,11 @@ class _SIMD_FP(_Test_Utility): intrin(v) assert check_floatstatus(invalid=True) == False - @pytest.mark.parametrize("intrin_name", [ - "cmpltq", "cmpleq", 
"cmpgtq", "cmpgeq" - ]) - def test_binary_invalid_fpexception(self, intrin_name): - intrin = getattr(self, intrin_name) - for d in [float("nan"), float("inf"), -float("inf")]: - a = self.setall(d) - b = self.setall(1.0) - clear_floatstatus() - intrin(a, b) - intrin(b, a) - assert check_floatstatus(invalid=True) == False - @pytest.mark.parametrize('py_comp,np_comp', [ (operator.lt, "cmplt"), (operator.le, "cmple"), (operator.gt, "cmpgt"), (operator.ge, "cmpge"), - (operator.lt, "cmpltq"), - (operator.le, "cmpleq"), - (operator.gt, "cmpgtq"), - (operator.ge, "cmpgeq"), (operator.eq, "cmpeq"), (operator.ne, "cmpneq") ]) @@ -923,21 +906,14 @@ class _SIMD_ALL(_Test_Utility): rev64 = self.rev64(self.load(range(self.nlanes))) assert rev64 == data_rev64 - @pytest.mark.parametrize('func, intrin, sup_sfx', [ - (operator.lt, "cmplt", []), - (operator.le, "cmple", []), - (operator.gt, "cmpgt", []), - (operator.ge, "cmpge", []), - (operator.eq, "cmpeq", []), - (operator.ne, "cmpneq", ("f32", "f64")), - (operator.lt, "cmpltq", ("f32", "f64")), - (operator.le, "cmpleq", ("f32", "f64")), - (operator.gt, "cmpgtq", ("f32", "f64")), - (operator.ge, "cmpgeq", ("f32", "f64")) + @pytest.mark.parametrize('func, intrin', [ + (operator.lt, "cmplt"), + (operator.le, "cmple"), + (operator.gt, "cmpgt"), + (operator.ge, "cmpge"), + (operator.eq, "cmpeq") ]) - def test_operators_comparison(self, func, intrin, sup_sfx): - if sup_sfx and self.sfx not in sup_sfx: - return + def test_operators_comparison(self, func, intrin): if self._is_fp(): data_a = self._data() else: |