diff options
author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2022-01-31 09:32:37 -0800 |
---|---|---|
committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2022-02-03 09:53:20 -0800 |
commit | 413ae9063a0bbde40a8efa80fda50b172201a11c (patch) | |
tree | 965bc6869e27705985af84b5a04b990c3615f80c | |
parent | 8840d950e0eedf2eadb96d9867d8f8341e51aaac (diff) | |
download | numpy-413ae9063a0bbde40a8efa80fda50b172201a11c.tar.gz |
ENH: Use SVML for f64 exp and log
-rw-r--r-- | numpy/core/src/common/npy_svml.h | 4 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 470 | ||||
-rw-r--r-- | numpy/core/src/umath/npy_simd_data.h | 231 |
3 files changed, 58 insertions, 647 deletions
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h index 4292f7090..1111025d7 100644 --- a/numpy/core/src/common/npy_svml.h +++ b/numpy/core/src/common/npy_svml.h @@ -1,5 +1,7 @@ #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +extern __m512 __svml_expf16(__m512 x); extern __m512 __svml_exp2f16(__m512 x); +extern __m512 __svml_logf16(__m512 x); extern __m512 __svml_log2f16(__m512 x); extern __m512 __svml_log10f16(__m512 x); extern __m512 __svml_expm1f16(__m512 x); @@ -19,7 +21,9 @@ extern __m512 __svml_asinhf16(__m512 x); extern __m512 __svml_acoshf16(__m512 x); extern __m512 __svml_atanhf16(__m512 x); +extern __m512d __svml_exp8(__m512d x); extern __m512d __svml_exp28(__m512d x); +extern __m512d __svml_log8(__m512d x); extern __m512d __svml_log28(__m512d x); extern __m512d __svml_log108(__m512d x); extern __m512d __svml_expm18(__m512d x); diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index 2dd43fb85..be3f184fa 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -11,6 +11,7 @@ #include "numpy/npy_math.h" #include "simd/simd.h" +#include "npy_svml.h" #include "loops_utils.h" #include "loops.h" #include "lowlevel_strided_loops.h" @@ -340,34 +341,6 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant) return _mm512_scalef_ps(poly, quadrant); } -NPY_FINLINE __m512d -avx512_permute_x4var_pd(__m512d t0, - __m512d t1, - __m512d t2, - __m512d t3, - __m512i index) -{ - __mmask8 lut_mask = _mm512_cmp_epi64_mask( - _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index), - _mm512_set1_epi64(0), _MM_CMPINT_GT); - __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); - __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); - return _mm512_mask_blend_pd(lut_mask, res1, res2); -} - -NPY_FINLINE __m512d -avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, - __m512d t4, __m512d t5, __m512d t6, __m512d t7, - __m512i index) -{ - __mmask8 lut_mask = _mm512_cmp_epi64_mask( - _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index), - _mm512_set1_epi64(0), _MM_CMPINT_GT); - __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index); - __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index); - return _mm512_mask_blend_pd(lut_mask, res1, res2); -} - #endif // SIMD_AVX512F /******************************************************************************** @@ -688,399 +661,53 @@ simd_log_FLOAT(npy_float * op, #endif // @CHK@ /**end repeat**/ -#ifdef SIMD_AVX512F_NOCLANG_BUG -/* - * Vectorized implementation of exp double using AVX512 - * Reference: Tang, P.T.P., "Table-driven implementation of the - * exponential function in IEEE floating-point - * arithmetic," ACM Transactions on Mathematical - * Software, vol. 15, pp. 144-157, 1989. - * 1) if x > mTH_max or x is INF; return INF (overflow) - * 2) if x < mTH_min; return 0.0f (underflow) - * 3) if abs(x) < mTH_nearzero; return 1.0f + x - * 4) if x is Nan; return Nan - * 5) Range reduction: - * x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64] - * 6) exp(r) - 1 is approximated by a polynomial function p(r) - * exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r)); - */ -static void -AVX512F_exp_DOUBLE(npy_double * op, - npy_double * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = array_size; - const npy_intp stride = steps / (npy_intp)sizeof(npy_double); - const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - - __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32); - __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC); - __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1); - __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2); - __m512i mMod = _mm512_set1_epi64(0x1f); - __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1); - __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2); - __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3); - __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4); - __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5); - __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54); - __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9); - __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9); - __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY); - __m512d zeros_d = _mm512_set1_pd(0.0f); - __m512d ones_d = _mm512_set1_pd(1.0f); - __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); - - __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0])); - __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1])); - __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2])); - __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3])); - __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0])); - __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1])); - __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2])); - __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3])); - - __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); - __mmask8 underflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes); - __mmask8 load_mask = avx512_get_full_load_mask_pd(); - __mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask; - - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - - __m512d x; - if (1 == stride) { - x = avx512_masked_load_pd(load_mask, ip); - } - else { - x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); - } - - nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); - x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask); - xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ); - xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ); - inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ); - __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x), - _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF)); - nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs), - mTH_nearzero, _CMP_LT_OQ); - nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask); - overflow_mask = _mm512_kor(overflow_mask, - _mm512_kxor(xmax_mask, inf_mask)); - underflow_mask = _mm512_kor(underflow_mask, xmin_mask); - x = avx512_set_masked_lanes_pd(x, zeros_d, - _mm512_kor(_mm512_kor(nan_mask, xmin_mask), - _mm512_kor(xmax_mask, nearzero_mask))); - - /* z = x * 32/ln2 */ - __m512d z = _mm512_mul_pd(x, InvLn2N); - - /* round to nearest */ - __m512d kd = _mm512_add_pd(z, mShift); - __m512i ki = _mm512_castpd_si512(kd); - kd = _mm512_sub_pd(kd, mShift); - - /* r = (x + kd*mNegL1) + kd*mNegL2 */ - __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x); - __m512d r2 = _mm512_mul_pd(kd, mNegL2); - __m512d r = _mm512_add_pd(r1,r2); - - /* Polynomial approximation for exp(r) - 1 */ - __m512d q = _mm512_fmadd_pd(mA5, r, mA4); - q = _mm512_fmadd_pd(q, r, mA3); - q = _mm512_fmadd_pd(q, r, mA2); - q = _mm512_fmadd_pd(q, r, mA1); - q = _mm512_mul_pd(q, r); - __m512d p = _mm512_fmadd_pd(r, q, r2); - p = _mm512_add_pd(r1, p); - - /* Get 2^(j/32) from lookup table */ - __m512i j = _mm512_and_epi64(ki, mMod); - __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1, - mTable_top_2, mTable_top_3, j); - __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1, - mTable_tail_2, mTable_tail_3, j); - - /* - * s = top + tail; - * exp(x) = 2^m * (top + (tail + s * p)); - */ - __m512d s = _mm512_add_pd(top, tail); - __m512d res = _mm512_fmadd_pd(s, p, tail); - res = _mm512_add_pd(res, top); - res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32))); - - /* return special cases */ - res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d), - nearzero_mask); - res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN), - nan_mask); - res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask); - res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask); - - _mm512_mask_storeu_pd(op, load_mask, res); - - ip += num_lanes * stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - /* - * Don't count on the compiler for cast between mask and int registers. - * On gcc7 with flags -march>=nocona -O3 can cause FP stack overflow - * which may lead to putting NaN into certain HW/FP calculations. - * - * For more details, please check the comments in: - * - https://github.com/numpy/numpy/issues/20356 - */ - if (npyv_tobits_b64(overflow_mask)) { - npy_set_floatstatus_overflow(); - } - - if (npyv_tobits_b64(underflow_mask)) { - npy_set_floatstatus_underflow(); - } -} -/* - * Vectorized implementation of log double using AVX512 - * Reference: - * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions - * and their error analysis. No. CONF-9106103-1. Argonne National Lab., - * IL (USA), 1991. - * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm - * function in IEEE floating-point arithmetic." ACM Transactions on - * Mathematical Software (TOMS) 16.4 (1990): 378-400. - * [3] Muller, Jean-Michel. "Elementary functions: algorithms and - * implementation." (2016). - * 1) if x = 0; return -INF - * 2) if x < 0; return NAN - * 3) if x is INF; return INF - * 4) if x is NAN; return NAN - * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log() - * 6) Range reduction: - * log(x) = log(2^m * z) - * = mln2 + log(z) - * 7) log(z) = log(z / c_k) + log(c_k); - * where c_k = 1 + k/64, k = 0,1,...,64 - * s.t. |x - c_k| <= 1/128 when x on[1,2]. - * 8) r = 2(x - c_k)/(x + c_k) - * log(x/c_k) = log((1 + r/2) / (1 - r/2)) - * = p(r) - * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +/**begin repeat + * #sfx = f32, f64# + * #func_suffix = f16, 8# + * #USE = 0, 1# */ - -/* LLVM has a bug where AVX-512F intrinsic `_mm512_mask_mul_pd` emits an - * unmasked operation with a masked store. This can cause FP exceptions to - * occur for the lanes that are suppose to have been masked. - * - * See https://bugs.llvm.org/show_bug.cgi?id=51988 - * - * Note, this affects LLVM based compilers like Apple Clang, Clang, and Intel's - * ICX. +/**begin repeat1 + * #func = exp, log# + * #default_val = 0, 1# */ -#if defined(__clang__) - #if defined(__apple_build_version__) - // Apple Clang - #if __apple_build_version__ > 11000000 - // Apple Clang after v11 - #define WORKAROUND_LLVM__mm512_mask_mul_pd - #endif - #else - // Clang, not Apple Clang - #if __clang_major__ > 9 - // Clang v9+ - #define WORKAROUND_LLVM__mm512_mask_mul_pd - #endif - #endif -#endif +#if @USE@ static void -AVX512F_log_DOUBLE(npy_double * op, - npy_double * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = array_size; - const npy_intp stride = steps / (npy_intp)sizeof(npy_double); - const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); - npy_int32 indexarr[8]; - for (npy_int32 ii = 0; ii < 8; ii++) { - indexarr[ii] = ii*stride; - } - - __m512d zeros_d = _mm512_set1_pd(0.0f); - __m512d ones_d = _mm512_set1_pd(1.0f); - __m512d mInf = _mm512_set1_pd(NPY_INFINITY); - __m512d mInv64 = _mm512_castsi512_pd(_mm512_set1_epi64(0x3f90000000000000)); - __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN); - __m512d mNan = _mm512_set1_pd(NPY_NAN); - __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY); - __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1); - __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2); - __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3); - __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4); - __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI); - __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO); - - __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4); - __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4); - __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); - - /* Load lookup table data */ - /**begin repeat - * #i = 0, 1, 2, 3, 4, 5, 6, 7# - */ - - __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@])); - __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@])); - - /**end repeat**/ - - __mmask8 load_mask = avx512_get_full_load_mask_pd(); - __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes); - __mmask8 divide_by_zero_mask = invalid_mask; - - __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask, - glibc_mask; - - __m512d x_in; - while (num_remaining_elements > 0) { - if (num_remaining_elements < num_lanes) { - load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, - num_lanes); - } - - if (1 == stride) { - x_in = avx512_masked_load_pd(load_mask, ip); - } - else { - x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); - } - - /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */ - __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ); - __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ); - glibc_mask = m1 & m2; - - if (glibc_mask != 0xFF) { - zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ); - inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ); - negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ); - nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ); - - divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask); - invalid_mask = invalid_mask | negx_mask; - - __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask); - __m512i ix = _mm512_castpd_si512(x); - - /* Normalize x when it is denormal */ - __m512i top12 = _mm512_and_epi64(ix, - _mm512_set1_epi64(0xfff0000000000000)); - denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), - _CMP_EQ_OQ); - denormal_mask = (~zero_mask) & denormal_mask; - __m512d masked_x = x; - #ifdef WORKAROUND_LLVM__mm512_mask_mul_pd - masked_x = avx512_set_masked_lanes_pd(masked_x, zeros_d, (~denormal_mask)); - #endif - ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask, - masked_x, _mm512_set1_pd(0x1p52))); - ix = _mm512_mask_sub_epi64(ix, denormal_mask, - ix, _mm512_set1_epi64(52ULL << 52)); - - /* - * x = 2^k * z; where z in range [1,2] - */ - __m512i tmp = _mm512_sub_epi64(ix, - _mm512_set1_epi64(0x3ff0000000000000)); - __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6), - _mm512_set1_epi64(0x3fULL)); - __m512i ik = _mm512_srai_epi64(tmp, 52); - __m512d z = _mm512_castsi512_pd(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp, - _mm512_set1_epi64(0xfff0000000000000)))); - /* c = i/64 + 1 */ - __m256i i_32 = _mm512_cvtepi64_epi32(i); - __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d); - - /* u = 2 * (z - c) / (z + c) */ - __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c)); - u = _mm512_mul_pd(_mm512_set1_pd(2.0), u); - - /* v = u * u */ - __m512d v = _mm512_mul_pd(u,u); - - /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */ - __m512d res = _mm512_fmadd_pd(v, mA4, mA3); - res = _mm512_fmadd_pd(v, res, mA2); - res = _mm512_fmadd_pd(v, res, mA1); - res = _mm512_mul_pd(v, res); - res = _mm512_fmadd_pd(u, res, u); - - /* Load lookup table data */ - __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1, - mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5, - mLUT_TOP_6, mLUT_TOP_7, i); - __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1, - mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5, - mLUT_TAIL_6, mLUT_TAIL_7, i); - - /* - * log(x) = k * ln2_hi + c_hi + - * k * ln2_lo + c_lo + - * log(z/c) - */ - __m256i ik_32 = _mm512_cvtepi64_epi32(ik); - __m512d k = _mm512_cvtepi32_pd(ik_32); - __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi); - __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo); - tt = _mm512_add_pd(tt, tt2); - res = _mm512_add_pd(tt, res); - - /* return special cases */ - res = avx512_set_masked_lanes_pd(res, mNan, nan_mask); - res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask); - res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask); - res = avx512_set_masked_lanes_pd(res, mInf, inf_mask); - - _mm512_mask_storeu_pd(op, load_mask, res); - } - - /* call glibc's log func when x around 1.0f */ - if (glibc_mask != 0) { - double NPY_DECL_ALIGNED(64) ip_fback[8]; - _mm512_store_pd(ip_fback, x_in); - - for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) { - if (glibc_mask & 0x01) { - op[ii] = npy_log(ip_fback[ii]); - } +simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, + npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ x; + #if @default_val@ + if (ssrc == 1) { + x = npyv_load_till_@sfx@(src, len, @default_val@); + } else { + x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@); + } + #else + if (ssrc == 1) { + x = npyv_load_tillz_@sfx@(src, len); + } else { + x = npyv_loadn_tillz_@sfx@(src, ssrc, len); } + #endif + npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + if (sdst == 1) { + npyv_store_till_@sfx@(dst, len, out); + } else { + npyv_storen_till_@sfx@(dst, sdst, len, out); } - ip += num_lanes * stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } - - if (npyv_tobits_b64(invalid_mask)) { - npy_set_floatstatus_invalid(); - } - if (npyv_tobits_b64(divide_by_zero_mask)) { - npy_set_floatstatus_divbyzero(); } + npyv_cleanup(); } -#undef WORKAROUND_LLVM__mm512_mask_mul_pd +#endif +/**end repeat1**/ +/**end repeat**/ +#endif -#endif // AVX512F_NOCLANG_BUG #ifdef SIMD_AVX512_SKX /**begin repeat @@ -1289,19 +916,30 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) /**begin repeat * #func = exp, log# * #scalar = npy_exp, npy_log# + * #type = npy_double, npy_double# + * #sfx = f64, f64# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { -#ifdef SIMD_AVX512F_NOCLANG_BUG - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { - AVX512F_@func@_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const @type@ *src = (@type@*)args[0]; + @type@ *dst = (@type@*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_@sfx@(ssrc) && + npyv_storable_stride_@sfx@(sdst)) { + simd_@func@_@sfx@(src, ssrc, dst, sdst, len); return; } #endif UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = @scalar@(in1); + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = npy_@func@(in1); } } /**end repeat**/ diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h index 62438d7a3..75a7d7678 100644 --- a/numpy/core/src/umath/npy_simd_data.h +++ b/numpy/core/src/umath/npy_simd_data.h @@ -6,85 +6,6 @@ * Constants used in vector implementation of float64 exp(x) */ #define NPY_RINT_CVT_MAGIC 0x1.8p52 -#define NPY_INV_LN2_MUL_32 0x1.71547652b82fep+5 -#define NPY_TANG_NEG_L1 -0x1.62e42fefp-6 -#define NPY_TANG_NEG_L2 -0x1.473de6af278edp-39 -#define NPY_TANG_A1 0x1p-1 -#define NPY_TANG_A2 0x1.5555555548f7cp-3 -#define NPY_TANG_A3 0x1.5555555545d4ep-5 -#define NPY_TANG_A4 0x1.11115b7aa905ep-7 -#define NPY_TANG_A5 0x1.6c1728d739765p-10 - -/* Lookup table for 2^(j/32) */ -static npy_uint64 EXP_Table_top[32] = { - 0x3FF0000000000000, - 0x3FF059B0D3158540, - 0x3FF0B5586CF98900, - 0x3FF11301D0125B40, - 0x3FF172B83C7D5140, - 0x3FF1D4873168B980, - 0x3FF2387A6E756200, - 0x3FF29E9DF51FDEC0, - 0x3FF306FE0A31B700, - 0x3FF371A7373AA9C0, - 0x3FF3DEA64C123400, - 0x3FF44E0860618900, - 0x3FF4BFDAD5362A00, - 0x3FF5342B569D4F80, - 0x3FF5AB07DD485400, - 0x3FF6247EB03A5580, - 0x3FF6A09E667F3BC0, - 0x3FF71F75E8EC5F40, - 0x3FF7A11473EB0180, - 0x3FF82589994CCE00, - 0x3FF8ACE5422AA0C0, - 0x3FF93737B0CDC5C0, - 0x3FF9C49182A3F080, - 0x3FFA5503B23E2540, - 0x3FFAE89F995AD380, - 0x3FFB7F76F2FB5E40, - 0x3FFC199BDD855280, - 0x3FFCB720DCEF9040, - 0x3FFD5818DCFBA480, - 0x3FFDFC97337B9B40, - 0x3FFEA4AFA2A490C0, - 0x3FFF50765B6E4540, -}; - -static npy_uint64 EXP_Table_tail[32] = { - 0x0000000000000000, - 0x3D0A1D73E2A475B4, - 0x3CEEC5317256E308, - 0x3CF0A4EBBF1AED93, - 0x3D0D6E6FBE462876, - 0x3D053C02DC0144C8, - 0x3D0C3360FD6D8E0B, - 0x3D009612E8AFAD12, - 0x3CF52DE8D5A46306, - 0x3CE54E28AA05E8A9, - 0x3D011ADA0911F09F, - 0x3D068189B7A04EF8, - 0x3D038EA1CBD7F621, - 0x3CBDF0A83C49D86A, - 0x3D04AC64980A8C8F, - 0x3CD2C7C3E81BF4B7, - 0x3CE921165F626CDD, - 0x3D09EE91B8797785, - 0x3CDB5F54408FDB37, - 0x3CF28ACF88AFAB35, - 0x3CFB5BA7C55A192D, - 0x3D027A280E1F92A0, - 0x3CF01C7C46B071F3, - 0x3CFC8B424491CAF8, - 0x3D06AF439A68BB99, - 0x3CDBAA9EC206AD4F, - 0x3CFC2220CB12A092, - 0x3D048A81E5E8F4A5, - 0x3CDC976816BAD9B8, - 0x3CFEB968CAC39ED3, - 0x3CF9858F73A18F5E, - 0x3C99D3E12DD8A18B, -}; #endif #endif @@ -120,156 +41,4 @@ static npy_uint64 EXP_Table_tail[32] = { #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f -/* - * Lookup table of log(c_k) - * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the - * logarithm function in IEEE floating-point arithmetic." ACM Transactions - * on Mathematical Software (TOMS) 16.4 (1990): 378-400. - */ -#if defined NPY_HAVE_AVX512F -#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) -static npy_uint64 LOG_TABLE_TOP[64] = { - 0x0000000000000000, - 0x3F8FC0A8B1000000, - 0x3F9F829B0E780000, - 0x3FA77458F6340000, - 0x3FAF0A30C0100000, - 0x3FB341D7961C0000, - 0x3FB6F0D28AE60000, - 0x3FBA926D3A4A0000, - 0x3FBE27076E2A0000, - 0x3FC0D77E7CD10000, - 0x3FC29552F8200000, - 0x3FC44D2B6CCB0000, - 0x3FC5FF3070A80000, - 0x3FC7AB8902110000, - 0x3FC9525A9CF40000, - 0x3FCAF3C94E810000, - 0x3FCC8FF7C79B0000, - 0x3FCE27076E2B0000, - 0x3FCFB9186D5E0000, - 0x3FD0A324E2738000, - 0x3FD1675CABAB8000, - 0x3FD22941FBCF8000, - 0x3FD2E8E2BAE10000, - 0x3FD3A64C55698000, - 0x3FD4618BC21C8000, - 0x3FD51AAD872E0000, - 0x3FD5D1BDBF580000, - 0x3FD686C81E9B0000, - 0x3FD739D7F6BC0000, - 0x3FD7EAF83B828000, - 0x3FD89A3386C18000, - 0x3FD947941C210000, - 0x3FD9F323ECBF8000, - 0x3FDA9CEC9A9A0000, - 0x3FDB44F77BCC8000, - 0x3FDBEB4D9DA70000, - 0x3FDC8FF7C79A8000, - 0x3FDD32FE7E010000, - 0x3FDDD46A04C20000, - 0x3FDE744261D68000, - 0x3FDF128F5FAF0000, - 0x3FDFAF588F790000, - 0x3FE02552A5A5C000, - 0x3FE0723E5C1CC000, - 0x3FE0BE72E4254000, - 0x3FE109F39E2D4000, - 0x3FE154C3D2F4C000, - 0x3FE19EE6B467C000, - 0x3FE1E85F5E704000, - 0x3FE23130D7BEC000, - 0x3FE2795E1289C000, - 0x3FE2C0E9ED448000, - 0x3FE307D7334F0000, - 0x3FE34E289D9D0000, - 0x3FE393E0D3564000, - 0x3FE3D9026A714000, - 0x3FE41D8FE8468000, - 0x3FE4618BC21C4000, - 0x3FE4A4F85DB04000, - 0x3FE4E7D811B74000, - 0x3FE52A2D265BC000, - 0x3FE56BF9D5B40000, - 0x3FE5AD404C358000, - 0x3FE5EE02A9240000, -}; - -static npy_uint64 LOG_TABLE_TAIL[64] = { - 0x0000000000000000, - 0xBD5FE0E183092C59, - 0x3D2980267C7E09E4, - 0xBD62303B9CB0D5E1, - 0x3D662A6617CC9717, - 0xBD4717B6B33E44F8, - 0xBD62968C836CC8C2, - 0x3D6AAC6CA17A4554, - 0x3D6E5CBD3D50FFFC, - 0xBD6C69A65A23A170, - 0xBD35B967F4471DFC, - 0x3D6F4799F4F6543E, - 0xBD6B0B0DE3077D7E, - 0xBD537B720E4A694B, - 0x3D65AD1D904C1D4E, - 0xBD600349CC67F9B2, - 0xBD697794F689F843, - 0xBD3A342C2AF0003C, - 0x3D5F1546AAA3361C, - 0x3D50E35F73F7A018, - 0x3D630701CE63EAB9, - 0xBD3A6976F5EB0963, - 0x3D5D309C2CC91A85, - 0xBD6D0B1C68651946, - 0xBD609EC17A426426, - 0xBD3F4BD8DB0A7CC1, - 0x3D4394A11B1C1EE4, - 0x3D54AEC442BE1015, - 0xBD67FCB18ED9D603, - 0x3D67E1B259D2F3DA, - 0xBD6ED2A52C73BF78, - 0x3D56FABA4CDD147D, - 0x3D584BF2B68D766F, - 0x3D40931A909FEA5E, - 0x3D4EC5197DDB55D3, - 0x3D5B7BF7861D37AC, - 0x3D5A21AC25DB1EF3, - 0xBD542A9E21373414, - 0xBD6DAFA08CECADB1, - 0x3D3E1F8DF68DBCF3, - 0x3D3BB2CD720EC44C, - 0xBD49C24CA098362B, - 0x3D60FEC69C695D7F, - 0x3D6F404E57963891, - 0xBD657D49676844CC, - 0x3D592DFBC7D93617, - 0x3D65E9A98F33A396, - 0x3D52DD98B97BAEF0, - 0x3D1A07BD8B34BE7C, - 0xBD17AFA4392F1BA7, - 0xBD5DCA290F818480, - 0x3D5D1772F5386374, - 0x3D60BE1FB590A1F5, - 0xBD6E2CE9146D271A, - 0xBD65E6563BBD9FC9, - 0x3D66FAA404263D0B, - 0xBD5AA33736867A17, - 0x3D6EC27D0B7B37B3, - 0xBD244FDD840B8591, - 0x3D6BB09CB0985646, - 0x3D46ABB9DF22BC57, - 0xBD58CD7DC73BD194, - 0x3D6F2CFB29AAA5F0, - 0x3D66757006095FD2, -}; - -#define NPY_TANG_LOG_A1 0x1.55555555554e6p-4 -#define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7 -#define NPY_TANG_LOG_A3 0x1.2492307f1519fp-9 -#define NPY_TANG_LOG_A4 0x1.c8034c85dfffp-12 - -#define NPY_TANG_LOG_LN2HI 0x1.62e42fefa4p-1 -#define NPY_TANG_LOG_LN2LO -0x1.8432a1b0e2634p-43 -#endif -#endif - #endif |