summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSayed Adel <seiko@imavr.com>2022-12-14 21:04:27 +0200
committerSayed Adel <seiko@imavr.com>2022-12-15 05:17:00 +0200
commitd02fc70cb6ad43093e1c407172c7193957705b5d (patch)
tree4200eefc874bf59f65ef71dee45c37faf842c5b5
parent5bf0e4413db20069953fe8c941ff118bb685cb46 (diff)
downloadnumpy-d02fc70cb6ad43093e1c407172c7193957705b5d.tar.gz
ENH, SIMD: Discard non-signaling comparison intrinsics
Providing non-signaling comparison intrinsics that guarantee no FP invalid exception in case of qNaN sounds great, but it costs unacceptable extra intrinsics on ppc64le(VSX) and x86(SSE). Therefore, an integer definition NPY_SIMD_CMPSIGNAL has been provided instead to differentiate between SIMD extensions that only support signaling comparison.
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src15
-rw-r--r--numpy/core/src/common/simd/avx2/avx2.h1
-rw-r--r--numpy/core/src/common/simd/avx2/operators.h10
-rw-r--r--numpy/core/src/common/simd/avx512/avx512.h1
-rw-r--r--numpy/core/src/common/simd/avx512/operators.h10
-rw-r--r--numpy/core/src/common/simd/neon/neon.h1
-rw-r--r--numpy/core/src/common/simd/neon/operators.h29
-rw-r--r--numpy/core/src/common/simd/simd.h3
-rw-r--r--numpy/core/src/common/simd/sse/operators.h32
-rw-r--r--numpy/core/src/common/simd/sse/sse.h1
-rw-r--r--numpy/core/src/common/simd/vec/operators.h44
-rw-r--r--numpy/core/src/common/simd/vec/vec.h2
-rw-r--r--numpy/core/src/umath/loops_trigonometric.dispatch.c.src8
-rw-r--r--numpy/core/tests/test_simd.py38
14 files changed, 22 insertions, 173 deletions
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 8d2ec6c30..48023af80 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -333,13 +333,6 @@ SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
*/
SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
/**end repeat1**/
-#if @fp_only@
-/**begin repeat1
- * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq#
- */
-SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
-/**end repeat1**/
-#endif
#if @bitw8b_sup@
SIMD_IMPL_INTRIN_2(andc_@sfx@, v@sfx@, v@sfx@, v@sfx@)
@@ -618,14 +611,6 @@ SIMD_INTRIN_DEF(@intrin@_@sfx@)
SIMD_INTRIN_DEF(@intrin@_@sfx@)
/**end repeat1**/
-#if @fp_only@
-/**begin repeat1
- * #intrin = cmpgtq, cmpgeq, cmpltq, cmpleq#
- */
-SIMD_INTRIN_DEF(@intrin@_@sfx@)
-/**end repeat1**/
-#endif
-
#if @bitw8b_sup@
SIMD_INTRIN_DEF(andc_@sfx@)
SIMD_INTRIN_DEF(andc_@bsfx@)
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index 8cb74df2b..d64f3c6d6 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -11,6 +11,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm256_i32gather_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
index 69e7e897b..86e0038d9 100644
--- a/numpy/core/src/common/simd/avx2/operators.h
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -218,16 +218,6 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
#define npyv_cmpgt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GT_OQ))
#define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ))
#define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ))
-// ordered comparison guarantees non-signaling
-// don't raise FP invalid exception if one of the sources containing qnan.
-#define npyv_cmpgeq_f32 npyv_cmpge_f32
-#define npyv_cmpgeq_f64 npyv_cmpge_f64
-#define npyv_cmpgtq_f32 npyv_cmpgt_f32
-#define npyv_cmpgtq_f64 npyv_cmpgt_f64
-#define npyv_cmpleq_f32 npyv_cmple_f32
-#define npyv_cmpleq_f64 npyv_cmple_f64
-#define npyv_cmpltq_f32 npyv_cmplt_f32
-#define npyv_cmpltq_f64 npyv_cmplt_f64
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 0946e6443..aa6abe256 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -7,6 +7,7 @@
#define NPY_SIMD_F64 1
#define NPY_SIMD_FMA3 1 // native support
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 0
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
index 0ff57847b..c70932d5f 100644
--- a/numpy/core/src/common/simd/avx512/operators.h
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -331,16 +331,6 @@
#define npyv_cmpgt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GT_OQ)
#define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ)
#define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ)
-// ordered non-signaling comparison
-// don't raise FP invalid exception if one of the sources containing qnan.
-#define npyv_cmpgeq_f32 npyv_cmpge_f32
-#define npyv_cmpgeq_f64 npyv_cmpge_f64
-#define npyv_cmpgtq_f32 npyv_cmpgt_f32
-#define npyv_cmpgtq_f64 npyv_cmpgt_f64
-#define npyv_cmpleq_f32 npyv_cmple_f32
-#define npyv_cmpleq_f64 npyv_cmple_f64
-#define npyv_cmpltq_f32 npyv_cmplt_f32
-#define npyv_cmpltq_f64 npyv_cmplt_f64
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
#define NPY_SIMD_FMA3 0 // HW emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef uint8x16_t npyv_u8;
typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index a6c479998..249621bd6 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -238,35 +238,6 @@
#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
-// ordered comparison guarantees non-signaling
-// don't raise FP invalid exception if one of the sources containing qnan.
-NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b)
-{
- return vceqq_f32(vmaxq_f32(a, b), a);
-}
-NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b)
-{
- npyv_f32 max = vmaxq_f32(a, b);
- npyv_b32 nnan = vceqq_f32(max, max);
- return vbicq_u32(nnan, vceqq_f32(max, b));
-}
-#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A)
-#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A)
-#if NPY_SIMD_F64
-NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b)
-{
- return vceqq_f64(vmaxq_f64(a, b), a);
-}
-NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b)
-{
- npyv_f64 max = vmaxq_f64(a, b);
- npyv_b64 nnan = vceqq_f64(max, max);
- return vbicq_u64(nnan, vceqq_f64(max, b));
-}
-#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A)
-#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A)
-#endif
-
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{ return vceqq_f32(a, a); }
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_FMA3 0
/// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
#define NPY_SIMD_BIGENDIAN 0
+ /// 1 if the supported comparison intrinsics(lt, le, gt, ge)
+ /// raises FP invalid exception for quite NaNs.
+ #define NPY_SIMD_CMPSIGNAL 0
#endif
// enable emulated mask operations for all SIMD extension except for AVX512
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
index 28aa343bb..59182679e 100644
--- a/numpy/core/src/common/simd/sse/operators.h
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -277,38 +277,6 @@ NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
#define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b))
#define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b))
-// ordered comparison guarantees non-signaling
-// don't raise FP invalid exception if one of the sources containing qnan.
-NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b)
-{
- __m128 nan_mask = _mm_cmpunord_ps(a, b);
- __m128 cmp_mask = _mm_cmpgt_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b));
- return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask));
-}
-NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b)
-{
- __m128d nan_mask = _mm_cmpunord_pd(a, b);
- __m128d cmp_mask = _mm_cmpgt_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b));
- return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask));
-}
-NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b)
-{
- __m128 nan_mask = _mm_cmpunord_ps(a, b);
- __m128 cmp_mask = _mm_cmpge_ps(_mm_xor_ps(nan_mask, a), _mm_xor_ps(nan_mask, b));
- return _mm_castps_si128(_mm_andnot_ps(nan_mask, cmp_mask));
-}
-NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b)
-{
- __m128d nan_mask = _mm_cmpunord_pd(a, b);
- __m128d cmp_mask = _mm_cmpge_pd(_mm_xor_pd(nan_mask, a), _mm_xor_pd(nan_mask, b));
- return _mm_castpd_si128(_mm_andnot_pd(nan_mask, cmp_mask));
-}
-
-#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A)
-#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A)
-#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A)
-#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A)
-
// check special cases
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
{ return _mm_castps_si128(_mm_cmpord_ps(a, a)); }
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index c21bbfda7..0c6b8cdba 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -12,6 +12,7 @@
#define NPY_SIMD_FMA3 0 // fast emulated
#endif
#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
typedef __m128i npyv_u8;
typedef __m128i npyv_s8;
diff --git a/numpy/core/src/common/simd/vec/operators.h b/numpy/core/src/common/simd/vec/operators.h
index fe1a9b10b..50dac20f6 100644
--- a/numpy/core/src/common/simd/vec/operators.h
+++ b/numpy/core/src/common/simd/vec/operators.h
@@ -266,50 +266,6 @@ NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
#endif
#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
-// ordered comparison guarantees non-signaling
-// don't raise FP invalid exception if one of the sources containing qnan.
-#if NPY_SIMD_F32
-NPY_FINLINE npyv_b32 npyv_cmpgeq_f32(npyv_f32 a, npyv_f32 b)
-{
-#ifdef NPY_HAVE_VSX
- return vec_vcmpgefp(a, b);
-#else
- return vec_cmpge(a, b);
-#endif
-}
-NPY_FINLINE npyv_b32 npyv_cmpgtq_f32(npyv_f32 a, npyv_f32 b)
-{
-#ifdef NPY_HAVE_VSX
- return vec_vcmpgtfp(a, b);
-#else
- return vec_cmpgt(a, b);
-#endif
-}
-#define npyv_cmpleq_f32(A, B) npyv_cmpgeq_f32(B, A)
-#define npyv_cmpltq_f32(A, B) npyv_cmpgtq_f32(B, A)
-#endif // NPY_SIMD_F32
-
-NPY_FINLINE npyv_b64 npyv_cmpgeq_f64(npyv_f64 a, npyv_f64 b)
-{
-#ifdef NPY_HAVE_VSX
- return vec_cmpeq(vec_max(a, b), a);
-#else
- return vec_cmpge(a, b);
-#endif
-}
-NPY_FINLINE npyv_b64 npyv_cmpgtq_f64(npyv_f64 a, npyv_f64 b)
-{
-#ifdef NPY_HAVE_VSX
- npyv_f64 max = vec_max(a, b);
- npyv_b64 nnan = vec_cmpeq(max, max);
- return vec_andc(max, vec_cmpeq(max, b));
-#else
- return vec_cmpgt(a, b);
-#endif
-}
-#define npyv_cmpleq_f64(A, B) npyv_cmpgeq_f64(B, A)
-#define npyv_cmpltq_f64(A, B) npyv_cmpgtq_f64(B, A)
-
// check special cases
#if NPY_SIMD_F32
NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
#ifdef NPY_HAVE_VX
#define NPY_SIMD_BIGENDIAN 1
+ #define NPY_SIMD_CMPSIGNAL 0
#else
#define NPY_SIMD_BIGENDIAN 0
+ #define NPY_SIMD_CMPSIGNAL 1
#endif
typedef __vector unsigned char npyv_u8;
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index e09c283de..9f9978e66 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -124,7 +124,12 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
} else {
x_in = npyv_loadn_tillz_f32(src, ssrc, len);
}
- npyv_b32 simd_mask = npyv_cmpleq_f32(npyv_abs_f32(x_in), max_cody);
+ npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+ #if NPY_SIMD_CMPSIGNAL
+ // Eliminate NaN to avoid FP invalid exception
+ x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
+ #endif
+ npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
/*
* For elements outside of this range, Cody-Waite's range reduction
@@ -132,7 +137,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
* these numbers
*/
if (simd_maski != 0) {
- npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index 17a5ae0dd..2c16243db 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -579,28 +579,11 @@ class _SIMD_FP(_Test_Utility):
intrin(v)
assert check_floatstatus(invalid=True) == False
- @pytest.mark.parametrize("intrin_name", [
- "cmpltq", "cmpleq", "cmpgtq", "cmpgeq"
- ])
- def test_binary_invalid_fpexception(self, intrin_name):
- intrin = getattr(self, intrin_name)
- for d in [float("nan"), float("inf"), -float("inf")]:
- a = self.setall(d)
- b = self.setall(1.0)
- clear_floatstatus()
- intrin(a, b)
- intrin(b, a)
- assert check_floatstatus(invalid=True) == False
-
@pytest.mark.parametrize('py_comp,np_comp', [
(operator.lt, "cmplt"),
(operator.le, "cmple"),
(operator.gt, "cmpgt"),
(operator.ge, "cmpge"),
- (operator.lt, "cmpltq"),
- (operator.le, "cmpleq"),
- (operator.gt, "cmpgtq"),
- (operator.ge, "cmpgeq"),
(operator.eq, "cmpeq"),
(operator.ne, "cmpneq")
])
@@ -923,21 +906,14 @@ class _SIMD_ALL(_Test_Utility):
rev64 = self.rev64(self.load(range(self.nlanes)))
assert rev64 == data_rev64
- @pytest.mark.parametrize('func, intrin, sup_sfx', [
- (operator.lt, "cmplt", []),
- (operator.le, "cmple", []),
- (operator.gt, "cmpgt", []),
- (operator.ge, "cmpge", []),
- (operator.eq, "cmpeq", []),
- (operator.ne, "cmpneq", ("f32", "f64")),
- (operator.lt, "cmpltq", ("f32", "f64")),
- (operator.le, "cmpleq", ("f32", "f64")),
- (operator.gt, "cmpgtq", ("f32", "f64")),
- (operator.ge, "cmpgeq", ("f32", "f64"))
+ @pytest.mark.parametrize('func, intrin', [
+ (operator.lt, "cmplt"),
+ (operator.le, "cmple"),
+ (operator.gt, "cmpgt"),
+ (operator.ge, "cmpge"),
+ (operator.eq, "cmpeq")
])
- def test_operators_comparison(self, func, intrin, sup_sfx):
- if sup_sfx and self.sfx not in sup_sfx:
- return
+ def test_operators_comparison(self, func, intrin):
if self._is_fp():
data_a = self._data()
else: