author    Sayed Adel <seiko@imavr.com>  2022-02-01 02:36:45 +0200
committer Sayed Adel <seiko@imavr.com>  2022-02-01 02:55:39 +0200
commit    d99bf0eff0bc6a9fee1dc7633d852cac176e4281 (patch)
tree      1ed7050d80e70f8665dfbe2362fa8f613ce39dce
parent    a52cdafadbb1b98d18374809b059e09b82591068 (diff)
download  numpy-d99bf0eff0bc6a9fee1dc7633d852cac176e4281.tar.gz
SIMD: handle |x| > HUGE_THRESHOLD special cases via universal intrinsics
instead of falling back to C
-rw-r--r--  numpy/core/src/umath/loops_hyperbolic.dispatch.c.src  67
1 file changed, 28 insertions, 39 deletions
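
What the patch does, in short: lanes with |x| > HUGE_THRESHOLD, INF or NaN used to be written to a scratch buffer and recomputed through scalar npy_tanh; now they stay on the SIMD path and the special results are blended in with npyv_select. As a rough orientation only, the scalar sketch below shows the per-lane behaviour the masked path implements; the threshold value and the tanh() call are illustrative stand-ins for the kernel's HUGE_THRESHOLD and its polynomial main path, not values taken from the source.

#include <math.h>

/* Illustrative bound only; tanh(20.0) already rounds to 1.0 in double
 * precision. The real kernel uses its own HUGE_THRESHOLD constant. */
#define SKETCH_HUGE_THRESHOLD 20.0

/* Per-lane behaviour of the vectorized select path in the patch below. */
static double tanh_lane_sketch(double x)
{
    if (isnan(x)) {
        return NAN;                        /* qnan if nan */
    }
    if (fabs(x) > SKETCH_HUGE_THRESHOLD) {
        return copysign(1.0, x);           /* +-1, also covers +-INF */
    }
    return tanh(x);                        /* stand-in for the polynomial main path */
}

The gain over the old callout is that no lane ever leaves the vector registers: a notnan test and one extra select replace the mask-to-bits check, the scratch store/load and the per-lane scalar fallback.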
diff --git a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
index 0fda06fff..367b102c6 100644
--- a/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_hyperbolic.dispatch.c.src
@@ -35,11 +35,15 @@
* x = SNaN, r = QNaN
*
*
- * ALGORITHM DETAILS
- * We handle special values in a callout function, aside from main path
- * computations. "Special" for this algorithm are:
- * INF, NAN, |x| > HUGE_THRESHOLD
+ * ALGORITHM DETAILS
*
+ * SVML handles |x| > HUGE_THRESHOLD, INF and NaNs by a scalar callout, as follows:
+ * 1. check special cases
+ * 2. return `+-1` for `|x| > HUGE_THRESHOLD` otherwise return `x`
+ *
+ * It was not clear to us why SVML uses a callout instead of handling these
+ * cases directly with AVX512 for single precision.
+ * However, we found it better to handle them with SIMD rather than follow SVML.
*
* Main path computations are organized as follows:
* Actually we split the interval [0, SATURATION_THRESHOLD)
@@ -165,6 +169,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
0xbc730b73f1eaff20ull, 0xbbba2cff8135d462ull, 0xbab5a71b5f7d9035ull, 0x0ull
};
const int nlanes = npyv_nlanes_f64;
+ const npyv_f64 qnan = npyv_setall_f64(NPY_NAN);
for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) {
npyv_f64 x;
if (ssrc == 1) {
@@ -172,10 +177,11 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
} else {
x = npyv_loadn_tillz_f64(src, ssrc, len);
}
- npyv_s64 denorm = npyv_and_s64(npyv_reinterpret_s64_f64(x), npyv_setall_s64(0x7ff8000000000000ll));
- // |x| > HUGE_THRESHOLD, INF and NaNs are filtered out to callout.
- npyv_b64 callout_m = npyv_cmple_s64(denorm, npyv_setall_s64(0x7fe0000000000000ll));
- npyv_s64 idxs = npyv_sub_s64(denorm, npyv_setall_s64(0x3fc0000000000000ll));
+ npyv_s64 ndnan = npyv_and_s64(npyv_reinterpret_s64_f64(x), npyv_setall_s64(0x7ff8000000000000ll));
+ // |x| > HUGE_THRESHOLD, INF and NaNs.
+ npyv_b64 special_m = npyv_cmple_s64(ndnan, npyv_setall_s64(0x7fe0000000000000ll));
+ npyv_b64 nnan_m = npyv_notnan_f64(x);
+ npyv_s64 idxs = npyv_sub_s64(ndnan, npyv_setall_s64(0x3fc0000000000000ll));
// no native 64-bit for max/min and it's fine to use 32-bit max/min
// since we're not crossing 32-bit edge
npyv_s32 idxl = npyv_max_s32(npyv_reinterpret_s32_s64(idxs), npyv_zero_s32());
@@ -221,20 +227,11 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_
r = npyv_muladd_f64(r, y, c2);
r = npyv_muladd_f64(r, y, c1);
r = npyv_muladd_f64(r, y, c0);
+ // 1.0 if |x| > HUGE_THRESHOLD || INF
+ r = npyv_select_f64(special_m, r, npyv_setall_f64(1.0));
r = npyv_or_f64(r, sign);
-
- npy_uint64 callout = npyv_tobits_b64(callout_m);
- if (NPY_UNLIKELY(callout != ((1 << nlanes) - 1))) {
- double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) back2c[npyv_nlanes_f64];
- npyv_storea_f64(back2c, x);
- for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
- if ((callout >> i) & 1) {
- continue;
- }
- back2c[i] = npy_tanh(back2c[i]);
- }
- r = npyv_select_f64(callout_m, r, npyv_loada_f64(back2c));
- }
+ // qnan if nan
+ r = npyv_select_f64(nnan_m, r, qnan);
if (sdst == 1) {
npyv_store_till_f64(dst, len, r);
} else {
@@ -290,6 +287,7 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
};
const int nlanes = npyv_nlanes_f32;
+ const npyv_f32 qnan = npyv_setall_f32(NPY_NANF);
for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) {
npyv_f32 x;
if (ssrc == 1) {
@@ -297,11 +295,11 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
} else {
x = npyv_loadn_tillz_f32(src, ssrc, len);
}
- npyv_s32 denorm = npyv_and_s32(npyv_reinterpret_s32_f32(x), npyv_setall_s32(0x7fe00000));
- // |x| > HUGE_THRESHOLD, INF and NaNs are filtered out to callout.
- npyv_b32 callout_m = npyv_cmple_s32(denorm, npyv_setall_s32(0x7f000000));
-
- npyv_s32 idxs = npyv_sub_s32(denorm, npyv_setall_s32(0x3d400000));
+ npyv_s32 ndnan = npyv_and_s32(npyv_reinterpret_s32_f32(x), npyv_setall_s32(0x7fe00000));
+ // check |x| > HUGE_THRESHOLD, INF and NaNs.
+ npyv_b32 special_m = npyv_cmple_s32(ndnan, npyv_setall_s32(0x7f000000));
+ npyv_b32 nnan_m = npyv_notnan_f32(x);
+ npyv_s32 idxs = npyv_sub_s32(ndnan, npyv_setall_s32(0x3d400000));
idxs = npyv_max_s32(idxs, npyv_zero_s32());
idxs = npyv_min_s32(idxs, npyv_setall_s32(0x3e00000));
npyv_u32 idx = npyv_shri_u32(npyv_reinterpret_u32_s32(idxs), 21);
@@ -325,20 +323,11 @@ simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_in
r = npyv_muladd_f32(r, y, c2);
r = npyv_muladd_f32(r, y, c1);
r = npyv_muladd_f32(r, y, c0);
+ // 1.0 if |x| > HUGE_THRESHOLD || INF
+ r = npyv_select_f32(special_m, r, npyv_setall_f32(1.0f));
r = npyv_or_f32(r, sign);
-
- npy_uint64 callout = npyv_tobits_b32(callout_m);
- if (NPY_UNLIKELY(callout != ((1 << nlanes) - 1))) {
- float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) back2c[npyv_nlanes_f32];
- npyv_storea_f32(back2c, x);
- for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
- if ((callout >> i) & 1) {
- continue;
- }
- back2c[i] = npy_tanhf(back2c[i]);
- }
- r = npyv_select_f32(callout_m, r, npyv_loada_f32(back2c));
- }
+ // qnan if nan
+ r = npyv_select_f32(nnan_m, r, qnan);
if (sdst == 1) {
npyv_store_till_f32(dst, len, r);
} else {
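
The same blend at the vector level, factored into a standalone helper for clarity. This is a sketch only, not part of the patch: it assumes NumPy's internal universal-intrinsics header and an NPY_SIMD_F64-capable build, and it takes `r_poly` (the polynomial result) and `sign` (the sign bits of x) as parameters because the real kernel computes them earlier in the loop. All intrinsics used here appear in the diff above.

#include "simd/simd.h"        /* internal npyv_* universal intrinsics */
#include "numpy/npy_math.h"   /* NPY_NAN */

static npyv_f64
tanh_blend_special_f64(npyv_f64 x, npyv_f64 r_poly, npyv_f64 sign)
{
    /* exponent field plus the top mantissa bit of |x| */
    npyv_s64 ndnan     = npyv_and_s64(npyv_reinterpret_s64_f64(x),
                                      npyv_setall_s64(0x7ff8000000000000ll));
    /* true on the main path; false for |x| > HUGE_THRESHOLD, INF and NaN */
    npyv_b64 special_m = npyv_cmple_s64(ndnan, npyv_setall_s64(0x7fe0000000000000ll));
    /* true wherever x is not NaN */
    npyv_b64 nnan_m    = npyv_notnan_f64(x);
    /* saturate the special lanes to 1.0, then restore the sign of x */
    npyv_f64 r = npyv_select_f64(special_m, r_poly, npyv_setall_f64(1.0));
    r = npyv_or_f64(r, sign);
    /* finally, propagate a quiet NaN for NaN inputs */
    return npyv_select_f64(nnan_m, r, npyv_setall_f64(NPY_NAN));
}

The f32 variant has the same shape, using the 32-bit masks (0x7fe00000 and 0x7f000000) and the _f32/_s32 intrinsics shown in the second half of the diff.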