diff options
-rw-r--r-- | numpy/core/src/common/npy_svml.h | 4 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_exponent_log.dispatch.c.src | 61 | ||||
-rw-r--r-- | numpy/core/src/umath/npy_simd_data.h | 4 |
3 files changed, 67 insertions, 2 deletions
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h index 4292f7090..1111025d7 100644 --- a/numpy/core/src/common/npy_svml.h +++ b/numpy/core/src/common/npy_svml.h @@ -1,5 +1,7 @@ #if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +extern __m512 __svml_expf16(__m512 x); extern __m512 __svml_exp2f16(__m512 x); +extern __m512 __svml_logf16(__m512 x); extern __m512 __svml_log2f16(__m512 x); extern __m512 __svml_log10f16(__m512 x); extern __m512 __svml_expm1f16(__m512 x); @@ -19,7 +21,9 @@ extern __m512 __svml_asinhf16(__m512 x); extern __m512 __svml_acoshf16(__m512 x); extern __m512 __svml_atanhf16(__m512 x); +extern __m512d __svml_exp8(__m512d x); extern __m512d __svml_exp28(__m512d x); +extern __m512d __svml_log8(__m512d x); extern __m512d __svml_log28(__m512d x); extern __m512d __svml_log108(__m512d x); extern __m512d __svml_expm18(__m512d x); diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index e0ee7f7eb..53db01594 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -11,6 +11,7 @@ #include "numpy/npy_math.h" #include "simd/simd.h" +#include "npy_svml.h" #include "loops_utils.h" #include "loops.h" #include "lowlevel_strided_loops.h" @@ -691,6 +692,43 @@ simd_log_FLOAT(npy_float * op, #endif // @CHK@ /**end repeat**/ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +/**begin repeat + * #func = exp, log# + * #default_val = 0, 1# + */ +static void +simd_@func@_f64(const npyv_lanetype_f64 *src, npy_intp ssrc, + npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_f64; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_f64 x; +#if @default_val@ + if (ssrc == 1) { + x = npyv_load_till_f64(src, len, @default_val@); + } else { + x = npyv_loadn_till_f64(src, ssrc, len, 
@default_val@); + } +#else + if (ssrc == 1) { + x = npyv_load_tillz_f64(src, len); + } else { + x = npyv_loadn_tillz_f64(src, ssrc, len); + } +#endif + npyv_f64 out = __svml_@func@8(x); + if (sdst == 1) { + npyv_store_till_f64(dst, len, out); + } else { + npyv_storen_till_f64(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat**/ + +#else #ifdef SIMD_AVX512F_NOCLANG_BUG /* * Vectorized implementation of exp double using AVX512 @@ -1086,7 +1124,8 @@ AVX512F_log_DOUBLE(npy_double * op, #undef WORKAROUND_LLVM__mm512_mask_mul_pd -#endif // AVX512F_NOCLANG_BUG +#endif // SIMD_AVX512F_NOCLANG_BUG +#endif // NPY_CAN_LINK_SVML #ifdef SIMD_AVX512_SKX /**begin repeat @@ -1299,17 +1338,35 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const npy_double *src = (npy_double*)args[0]; + npy_double *dst = (npy_double*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_f64(ssrc) && + npyv_storable_stride_f64(sdst)) { + simd_@func@_f64(src, ssrc, dst, sdst, len); + return; + } +#else #ifdef SIMD_AVX512F_NOCLANG_BUG if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) { AVX512F_@func@_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); return; } -#endif +#endif // SIMD_AVX512F_NOCLANG_BUG +#endif // NPY_CAN_LINK_SVML + /* Scalar fallback: must stay outside the #if/#else above so it runs whenever a SIMD branch did not return. */ UNARY_LOOP { const npy_double in1 = *(npy_double *)ip1; *(npy_double *)op1 = @scalar@(in1); } } + /**end repeat**/ /**begin repeat diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h 
index 62438d7a3..43640a2d6 100644 --- a/numpy/core/src/umath/npy_simd_data.h +++ b/numpy/core/src/umath/npy_simd_data.h @@ -15,6 +15,7 @@ #define NPY_TANG_A4 0x1.11115b7aa905ep-7 #define NPY_TANG_A5 0x1.6c1728d739765p-10 +#if !defined NPY_HAVE_AVX512_SKX || !defined NPY_CAN_LINK_SVML /* Lookup table for 2^(j/32) */ static npy_uint64 EXP_Table_top[32] = { 0x3FF0000000000000, @@ -85,6 +86,7 @@ static npy_uint64 EXP_Table_tail[32] = { 0x3CF9858F73A18F5E, 0x3C99D3E12DD8A18B, }; +#endif //#if !defined NPY_HAVE_AVX512_SKX || !defined NPY_CAN_LINK_SVML #endif #endif @@ -128,6 +130,7 @@ static npy_uint64 EXP_Table_tail[32] = { */ #if defined NPY_HAVE_AVX512F #if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) +#if !defined NPY_HAVE_AVX512_SKX || !defined NPY_CAN_LINK_SVML static npy_uint64 LOG_TABLE_TOP[64] = { 0x0000000000000000, 0x3F8FC0A8B1000000, @@ -261,6 +264,7 @@ static npy_uint64 LOG_TABLE_TAIL[64] = { 0x3D6F2CFB29AAA5F0, 0x3D66757006095FD2, }; +#endif //#if !defined NPY_HAVE_AVX512_SKX || !defined NPY_CAN_LINK_SVML #define NPY_TANG_LOG_A1 0x1.55555555554e6p-4 #define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7 |