summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
author Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> 2021-05-05 14:53:28 -0700
committer Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> 2021-05-05 14:53:28 -0700
commit b2191de9ce77441373b3b7265c700ae91283a677 (patch)
tree 6f7d17e58e45fb3a5afce7292828a71333d0a5d5 /numpy
parent bf531216d3b70f28c462e457dcda1795a3c28476 (diff)
download numpy-b2191de9ce77441373b3b7265c700ae91283a677.tar.gz
BUG: Detect and report underflow condition in AVX implementation of np.exp
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/loops_exponent_log.dispatch.c.src 12
1 files changed, 12 insertions, 0 deletions
diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
index 1dc24b226..291ce4518 100644
--- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
+++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src
@@ -465,6 +465,7 @@ simd_exp_FLOAT(npy_float * op,
@mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
@mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
+ @mask@ underflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
@mask@ load_mask = @isa@_get_full_load_mask_ps();
npy_intp num_remaining_elements = array_size;
@@ -491,6 +492,7 @@ simd_exp_FLOAT(npy_float * op,
inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ);
overflow_mask = @or_masks@(overflow_mask,
@xor_masks@(xmax_mask, inf_mask));
+ underflow_mask = @or_masks@(underflow_mask, xmin_mask);
x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
@or_masks@(nan_mask, xmin_mask), xmax_mask));
@@ -539,6 +541,10 @@ simd_exp_FLOAT(npy_float * op,
if (@mask_to_int@(overflow_mask)) {
npy_set_floatstatus_overflow();
}
+
+ if (@mask_to_int@(underflow_mask)) {
+ npy_set_floatstatus_underflow();
+ }
}
/*
@@ -740,6 +746,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
__m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));
__mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+ __mmask8 underflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
__mmask8 load_mask = avx512_get_full_load_mask_pd();
__mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask;
@@ -769,6 +776,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
overflow_mask = _mm512_kor(overflow_mask,
_mm512_kxor(xmax_mask, inf_mask));
+ underflow_mask = _mm512_kor(underflow_mask, xmin_mask);
x = avx512_set_masked_lanes_pd(x, zeros_d,
_mm512_kor(_mm512_kor(nan_mask, xmin_mask),
_mm512_kor(xmax_mask, nearzero_mask)));
@@ -828,6 +836,10 @@ AVX512F_exp_DOUBLE(npy_double * op,
if (overflow_mask) {
npy_set_floatstatus_overflow();
}
+
+ if (underflow_mask) {
+ npy_set_floatstatus_underflow();
+ }
}
/*
* Vectorized implementation of log double using AVX512