author     Matti Picus <matti.picus@gmail.com>       2019-04-20 20:20:46 +0300
committer  GitHub <noreply@github.com>               2019-04-20 20:20:46 +0300
commit     cd2e52ce9dc54b0199beba25bd5e02ce54d60963 (patch)
tree       69654e1b8491320bfad736202ea161273cfb3972
parent     31e71d7ce8d447cb74b9fb83875361cf7dba4579 (diff)
parent     d3125fa94893af10597f3e0b07613c719940a543 (diff)
download   numpy-cd2e52ce9dc54b0199beba25bd5e02ce54d60963.tar.gz
Merge pull request #13134 from r-devulap/logexp-simd
ENH: Use AVX for float32 implementation of np.exp & np.log
-rw-r--r--  doc/release/1.17.0-notes.rst                    6
-rw-r--r--  numpy/core/code_generators/generate_umath.py    2
-rw-r--r--  numpy/core/include/numpy/npy_common.h          17
-rw-r--r--  numpy/core/include/numpy/npy_math.h            32
-rw-r--r--  numpy/core/setup.py                             5
-rw-r--r--  numpy/core/setup_common.py                     20
-rw-r--r--  numpy/core/src/umath/cpuid.c                   23
-rw-r--r--  numpy/core/src/umath/loops.c.src               49
-rw-r--r--  numpy/core/src/umath/loops.h.src               16
-rw-r--r--  numpy/core/src/umath/simd.inc.src             396
-rw-r--r--  numpy/distutils/command/autodist.py            20
-rw-r--r--  numpy/distutils/command/config.py               6

12 files changed, 591 insertions(+), 1 deletion(-)
diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst
index fa6a132dd..87033f020 100644
--- a/doc/release/1.17.0-notes.rst
+++ b/doc/release/1.17.0-notes.rst
@@ -191,6 +191,12 @@ but with this change, you can do::
 
     thereby saving a level of indentation
 
+`numpy.exp` and `numpy.log` speed up for float32 implementation
+---------------------------------------------------------------
+The float32 implementation of numpy.exp and numpy.log now benefits from the
+AVX2/AVX512 instruction sets, which are detected at runtime. numpy.exp has a
+max ulp error of 2.52 and numpy.log has a max ulp error of 3.83.
+
 Improve performance of ``np.pad``
 ---------------------------------
 The performance of the function has been improved for most cases by filling in
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f251ed6e9..27a2ba1e1 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -697,6 +697,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.exp'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
           ),
@@ -718,6 +719,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 64aaaacff..108c0a202 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -46,10 +46,20 @@
 #endif
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
+#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
 #else
 #define NPY_GCC_TARGET_AVX2
 #endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F && defined HAVE_LINK_AVX512F
+#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
+#elif defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#define NPY_GCC_TARGET_AVX512F __attribute__((target("avx512f")))
+#else
+#define NPY_GCC_TARGET_AVX512F
+#endif
+
 /*
  * mark an argument (starting from 1) that must not be NULL and is not checked
  * DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check
@@ -68,6 +78,13 @@
 #define NPY_HAVE_SSE2_INTRINSICS
 #endif
 
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX2
+#define NPY_HAVE_AVX2_INTRINSICS
+#endif
+
+#if defined HAVE_IMMINTRIN_H && defined HAVE_LINK_AVX512F
+#define NPY_HAVE_AVX512F_INTRINSICS
+#endif
 /*
  * give a hint to the compiler which branch is more likely or unlikely
  * to occur, e.g. rare error cases:
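Editor's note: the macros above let a single translation unit stay at the
baseline ISA while opting individual functions into wider code generation.
A minimal sketch of that pattern, assuming a GCC/clang toolchain with target
attribute support; the function name here is hypothetical, not from the patch:

#include <immintrin.h>

__attribute__((target("avx512f")))
static void
add_one_avx512(float *x, int n)
{
    /* the compiler may emit zmm instructions only inside this function;
       the tail (n % 16 elements) is left to a scalar path, omitted here */
    __m512 ones = _mm512_set1_ps(1.0f);
    for (int i = 0; i + 16 <= n; i += 16) {
        _mm512_storeu_ps(x + i, _mm512_add_ps(_mm512_loadu_ps(x + i), ones));
    }
}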
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 582390cdc..dfb8ff526 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -113,6 +113,38 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */
 #define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */
 
+/*
+ * Constants used in vector implementation of exp(x)
+ */
+#define NPY_RINT_CVT_MAGICf 0x1.800000p+23f
+#define NPY_CODY_WAITE_LOGE_2_HIGHf -6.93145752e-1f
+#define NPY_CODY_WAITE_LOGE_2_LOWf -1.42860677e-6f
+#define NPY_COEFF_P0_EXPf 9.999999999980870924916e-01f
+#define NPY_COEFF_P1_EXPf 7.257664613233124478488e-01f
+#define NPY_COEFF_P2_EXPf 2.473615434895520810817e-01f
+#define NPY_COEFF_P3_EXPf 5.114512081637298353406e-02f
+#define NPY_COEFF_P4_EXPf 6.757896990527504603057e-03f
+#define NPY_COEFF_P5_EXPf 5.082762527590693718096e-04f
+#define NPY_COEFF_Q0_EXPf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_EXPf -2.742335390411667452936e-01f
+#define NPY_COEFF_Q2_EXPf 2.159509375685829852307e-02f
+
+/*
+ * Constants used in vector implementation of log(x)
+ */
+#define NPY_COEFF_P0_LOGf 0.000000000000000000000e+00f
+#define NPY_COEFF_P1_LOGf 9.999999999999998702752e-01f
+#define NPY_COEFF_P2_LOGf 2.112677543073053063722e+00f
+#define NPY_COEFF_P3_LOGf 1.480000633576506585156e+00f
+#define NPY_COEFF_P4_LOGf 3.808837741388407920751e-01f
+#define NPY_COEFF_P5_LOGf 2.589979117907922693523e-02f
+#define NPY_COEFF_Q0_LOGf 1.000000000000000000000e+00f
+#define NPY_COEFF_Q1_LOGf 2.612677543073109236779e+00f
+#define NPY_COEFF_Q2_LOGf 2.453006071784736363091e+00f
+#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
+#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
+#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
+
 /*
  * C99 double math funcs
  */
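Editor's note: a scalar C sketch (not part of the patch) of the algorithm
these exp constants encode, to make the vector code below easier to follow.
The vector version additionally handles overflow/underflow lanes; this sketch
leans on ldexpf instead of the exponent-bit trick:

#include <math.h>

static float
expf_rational_sketch(float x)
{
    /* k = rint(x / ln(2)), computed as x * log2(e) */
    float k = rintf(x * 1.442695040888963f);

    /* Cody-Waite range reduction: y = x - k*ln(2), done in two steps with
       a split -ln(2) so the subtraction stays accurate */
    float y = fmaf(k, -6.93145752e-1f, x);   /* NPY_CODY_WAITE_LOGE_2_HIGHf */
    y = fmaf(k, -1.42860677e-6f, y);         /* NPY_CODY_WAITE_LOGE_2_LOWf  */

    /* exp(y) ~= P(y)/Q(y), minimax rational approximation */
    float p = 5.082762527590693718096e-04f;              /* P5 */
    p = fmaf(p, y, 6.757896990527504603057e-03f);        /* P4 */
    p = fmaf(p, y, 5.114512081637298353406e-02f);        /* P3 */
    p = fmaf(p, y, 2.473615434895520810817e-01f);        /* P2 */
    p = fmaf(p, y, 7.257664613233124478488e-01f);        /* P1 */
    p = fmaf(p, y, 9.999999999980870924916e-01f);        /* P0 */
    float q = 2.159509375685829852307e-02f;              /* Q2 */
    q = fmaf(q, y, -2.742335390411667452936e-01f);       /* Q1 */
    q = fmaf(q, y, 1.0f);                                /* Q0 */

    /* exp(x) = exp(y) * 2^k */
    return ldexpf(p / q, (int)k);
}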
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2bcd17f27..9f1ecf358 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -171,6 +171,11 @@ def check_math_capabilities(config, moredefs, mathlibs):
         if config.check_gcc_function_attribute(dec, fn):
             moredefs.append((fname2def(fn), 1))
 
+    for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
+        if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
+                                                               header):
+            moredefs.append((fname2def(fn), 1))
+
     for fn in OPTIONAL_VARIABLE_ATTRIBUTES:
         if config.check_gcc_variable_attribute(fn):
             m = fn.replace("(", "_").replace(")", "_")
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index f837df112..885aec443 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -118,6 +118,7 @@ OPTIONAL_HEADERS = [
 # sse headers only enabled automatically on amd64/x32 builds
                 "xmmintrin.h",  # SSE
                 "emmintrin.h",  # SSE2
+                "immintrin.h",  # AVX
                 "features.h",  # for glibc version linux
                 "xlocale.h",  # see GH#8367
                 "dlfcn.h",  # dladdr
@@ -149,6 +150,8 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                                      "stdio.h", "LINK_AVX"),
                        ("__asm__ volatile", '"vpand %ymm1, %ymm2, %ymm3"',
                                      "stdio.h", "LINK_AVX2"),
+                       ("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
+                                     "stdio.h", "LINK_AVX512F"),
                        ("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
                        ]
 
@@ -165,6 +168,23 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
                                 'attribute_target_avx'),
                                 ('__attribute__((target ("avx2")))',
                                 'attribute_target_avx2'),
+                                ('__attribute__((target ("avx512f")))',
+                                'attribute_target_avx512f'),
                                 ]
 
+# function attributes with intrinsics
+# These ensure the compiler can compile AVX intrinsics inside a function
+# carrying the target attribute; e.g. gcc 4.8.4 supports the attributes but
+# not the intrinsics.
+# Tested via "#include<%s> int %s %s(void) { %s; return 0; }" % (header, attribute, name, code)
+# The function name is converted to a HAVE_<upper-case-name> preprocessor macro.
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+                                'attribute_target_avx2_with_intrinsics',
+                                '__m256 temp = _mm256_set1_ps(1.0)',
+                                'immintrin.h'),
+                                ('__attribute__((target("avx512f")))',
+                                'attribute_target_avx512f_with_intrinsics',
+                                '__m512 temp = _mm512_set1_ps(1.0)',
+                                'immintrin.h'),
+                                ]
 
 # variable attributes tested via "int %s a" % attribute
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 6744ceb05..51c540457 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -11,6 +11,7 @@
 #define XCR_XFEATURE_ENABLED_MASK 0x0
 #define XSTATE_SSE 0x2
 #define XSTATE_YMM 0x4
+#define XSTATE_ZMM 0x70
 
 /*
  * verify the OS supports avx instructions
@@ -33,6 +34,19 @@ int os_avx_support(void)
 #endif
 }
 
+static NPY_INLINE
+int os_avx512_support(void)
+{
+#if HAVE_XGETBV
+    unsigned int eax, edx;
+    unsigned int ecx = XCR_XFEATURE_ENABLED_MASK;
+    unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE;
+    __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx));
+    return (eax & xcr0) == xcr0;
+#else
+    return 0;
+#endif
+}
 
 /*
  * Primitive cpu feature detect function
@@ -42,7 +56,14 @@ NPY_NO_EXPORT int
 npy_cpu_supports(const char * feature)
 {
 #ifdef HAVE___BUILTIN_CPU_SUPPORTS
-    if (strcmp(feature, "avx2") == 0) {
+    if (strcmp(feature, "avx512f") == 0) {
+#if defined(__GNUC__) && (__GNUC__ < 5)
+        return 0;
+#else
+        return __builtin_cpu_supports("avx512f") && os_avx512_support();
+#endif
+    }
+    else if (strcmp(feature, "avx2") == 0) {
        return __builtin_cpu_supports("avx2") && os_avx_support();
    }
    else if (strcmp(feature, "avx") == 0) {
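Editor's note: the runtime check above has two halves: __builtin_cpu_supports
asks whether the core implements AVX512F, and xgetbv asks whether the OS
saves and restores the ZMM/YMM/SSE register state on context switch; both
must agree before the wide loop may be selected. A hypothetical sketch of how
a caller composes this (NumPy's actual selection happens in generated ufunc
setup code; the names exp_scalar/exp_avx2/exp_avx512f and select_exp_loop are
illustrative only):

extern int npy_cpu_supports(const char *feature);  /* from cpuid.c above */

/* simplified loop signature for the sketch */
typedef void (*unary_loopf)(float *out, const float *in, long n);

extern void exp_scalar(float *, const float *, long);
extern void exp_avx2(float *, const float *, long);
extern void exp_avx512f(float *, const float *, long);

static unary_loopf
select_exp_loop(void)
{
    /* prefer the widest ISA that both CPU and OS support */
    if (npy_cpu_supports("avx512f")) {
        return exp_avx512f;
    }
    if (npy_cpu_supports("avx2")) {
        return exp_avx2;
    }
    return exp_scalar;
}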
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 290a87a33..ff3b36428 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1569,6 +1569,55 @@ NPY_NO_EXPORT void
 
 /**end repeat**/
 
+/**begin repeat
+ * #func = exp, log#
+ * #scalarf = npy_expf, npy_logf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = @scalarf@(in1);
+    }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #isa = avx512f, avx2#
+ * #ISA = AVX512F, AVX2#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
+ */
+
+/**begin repeat1
+ * #func = exp, log#
+ * #scalarf = npy_expf, npy_logf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
+#else
+    /*
+     * This is the path taken if the ISA was detected at runtime but not
+     * compiled for. It fixes the error on clang 6.0, which fails to compile
+     * the AVX512F version. Not sure if I like this idea: if at runtime it
+     * detects AVX512F, it will end up running the scalar version instead
+     * of AVX2.
+     */
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = @scalarf@(in1);
+    }
+#endif
+}
+
+/**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
  * Float types
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 9dc1b7016..8dd3170e3 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,6 +178,22 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
+ * #func = exp, log#
+ */
+NPY_NO_EXPORT void
+FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**begin repeat1
+ * #isa = avx512f, avx2#
+ */
+
+NPY_NO_EXPORT void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * Float types
  * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, f, , l#
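Editor's note: NumPy's .src templates are expanded by a code generator before
compilation. A sketch (not in the patch) of what one instance of the repeat
block above becomes, with UNARY_LOOP written out as the strided loop it
stands for:

NPY_NO_EXPORT NPY_GCC_OPT_3 void
FLOAT_exp_avx2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    AVX2_exp_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0]);
#else
    /* scalar fallback: walk input/output with their byte strides */
    char *ip1 = args[0], *op1 = args[1];
    npy_intp is1 = steps[0], os1 = steps[1], n = dimensions[0];
    npy_intp i;
    for (i = 0; i < n; i++, ip1 += is1, op1 += os1) {
        const npy_float in1 = *(npy_float *)ip1;
        *(npy_float *)op1 = npy_expf(in1);
    }
#endif
}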
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 4bb8569be..9a96ec3f4 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -122,6 +122,27 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
+ * #ISA = AVX2, AVX512F#
+ */
+
+/* prototypes */
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+
+/**begin repeat1
+ * #func = exp, log#
+ */
+
+static void
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_int n);
+
+/**end repeat1**/
+#endif
+
+/**end repeat**/
+
+
+/**begin repeat
  * Float types
  * #type = npy_float, npy_double, npy_longdouble#
  * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
@@ -1075,6 +1096,381 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 /**end repeat**/
 
+/* bunch of helper functions used in ISA_exp/log_FLOAT */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_fmadd(__m256 a, __m256 b, __m256 c)
+{
+    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_full_load_mask(void)
+{
+    return _mm256_set1_ps(-1.0);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+{
+    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
+                          1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    float* addr = maskint + total_elem - num_elem;
+    return _mm256_loadu_ps(addr);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_masked_load(__m256 mask, npy_float* addr)
+{
+    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+{
+    return _mm256_blendv_ps(x, val, mask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_blend(__m256 x, __m256 y, __m256 ymask)
+{
+    return _mm256_blendv_ps(x, y, ymask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_exponent(__m256 x)
+{
+    /*
+     * Special handling of denormals:
+     * 1) Multiply denormal elements with 2**100 (0x71800000)
+     * 2) Get the 8 bits of unbiased exponent
+     * 3) Subtract 100 from exponent of denormals
+     */
+
+    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+    __m256 temp = _mm256_mul_ps(x, two_power_100);
+    x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+    __m256 exp = _mm256_cvtepi32_ps(
+                    _mm256_sub_epi32(
+                        _mm256_srli_epi32(
+                            _mm256_castps_si256(x), 23), _mm256_set1_epi32(0x7E)));
+
+    __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
+    return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_get_mantissa(__m256 x)
+{
+    /*
+     * Special handling of denormals:
+     * 1) Multiply denormal elements with 2**100 (0x71800000)
+     * 2) Get the 23 bits of mantissa
+     * 3) Mantissa for denormals is not affected by the multiplication
+     */
+
+    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+    __m256 temp = _mm256_mul_ps(x, two_power_100);
+    x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+    __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
+    __m256i exp_126_bits = _mm256_set1_epi32(126 << 23);
+    return _mm256_castsi256_ps(
+                _mm256_or_si256(
+                    _mm256_and_si256(
+                        _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
+}
+#endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_get_full_load_mask(void)
+{
+    return 0xFFFF;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_get_partial_load_mask(const npy_int num_elem, const npy_int total_elem)
+{
+    return (0x0001 << num_elem) - 0x0001;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_masked_load(__mmask16 mask, npy_float* addr)
+{
+    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_set_masked_lanes(__m512 x, __m512 val, __mmask16 mask)
+{
+    return _mm512_mask_blend_ps(mask, x, val);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
+{
+    return _mm512_mask_mov_ps(x, ymask, y);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_get_exponent(__m512 x)
+{
+    return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
+avx512_get_mantissa(__m512 x)
+{
+    return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+}
+#endif
+
+/**begin repeat
+ * #ISA = AVX2, AVX512F#
+ * #isa = avx2, avx512#
+ * #vtype = __m256, __m512#
+ * #vsize = 256, 512#
+ * #or = or_ps, kor#
+ * #vsub = , _mask#
+ * #mask = __m256, __mmask16#
+ * #fmadd = avx2_fmadd, _mm512_fmadd_ps#
+ **/
+
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
+{
+    @vtype@ reduced_x = @fmadd@(y, c1, x);
+    reduced_x = @fmadd@(y, c2, reduced_x);
+    reduced_x = @fmadd@(y, c3, reduced_x);
+    return reduced_x;
+}
+#endif
+/**end repeat**/
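Editor's note: the load-mask helpers above exist so the main loops never read
past the end of the array when fewer than a full vector of elements remain.
A sketch of the AVX2 idea (not in the patch): slide a window over a table of
-1.0/1.0 floats so the first n lanes get a value whose sign bit is set, which
is what _mm256_maskload_ps checks; the function name is hypothetical:

#include <immintrin.h>

static __m256
load_first_n(const float *p, int n)   /* assumes 0 < n <= 8 */
{
    static const float window[16] = {-1.0f, -1.0f, -1.0f, -1.0f,
                                     -1.0f, -1.0f, -1.0f, -1.0f,
                                      1.0f,  1.0f,  1.0f,  1.0f,
                                      1.0f,  1.0f,  1.0f,  1.0f};
    /* first n mask entries are -1.0: converted to -1 they enable the lane */
    __m256 mask = _mm256_loadu_ps(window + 8 - n);
    return _mm256_maskload_ps(p, _mm256_cvtps_epi32(mask));
}

AVX512F needs none of this indirection: a __mmask16 of (1 << n) - 1 drives
_mm512_maskz_loadu_ps directly, as avx512_get_partial_load_mask shows.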
/**begin repeat
 * #ISA = AVX2, AVX512F#
 * #isa = avx2, avx512#
 * #vtype = __m256, __m512#
 * #vsize = 256, 512#
 * #BYTES = 32, 64#
 * #mask = __m256, __mmask16#
 * #vsub = , _mask#
 * #and_masks =_mm256_and_ps, _mm512_kand#
 * #fmadd = avx2_fmadd, _mm512_fmadd_ps#
 * #mask_to_int = _mm256_movemask_ps, #
 * #full_mask= 0xFF, 0xFFFF#
 * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
 */

/*
 * Vectorized implementation of exp using AVX2 and AVX512:
 * 1) if x >= xmax; return INF (overflow)
 * 2) if x <= xmin; return 0.0f (underflow)
 * 3) Range reduction (using Cody-Waite):
 *      a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)]
 * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
 *         algorithm (mini-max polynomial approximation)
 * 5) Compute exp(x) = exp(y) * 2^k
 * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
 * 7) Max relative error measured across all 32-bit FP's = 2.1264E-07 (for the
 *    same x = 0xc2781e37)
 */

#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
{
    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
    npy_float xmax = 88.72283935546875f;
    npy_float xmin = -87.3365478515625f;

    /* Load up frequently used constants */
    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
    @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf);
    @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf);
    @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf);
    @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf);
    @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf);
    @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf);
    @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf);
    @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf);
    @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf);
    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
    @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef);
    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ poly, num_poly, denom_poly, quadrant;
    @vtype@i exponent;

    @mask@ xmax_mask, xmin_mask;
    @mask@ load_mask = @isa@_get_full_load_mask();
    npy_int num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes)
            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
                                                    num_lanes);
        @vtype@ x = @isa@_masked_load(load_mask, ip);
        xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
        xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);

        x = @isa@_set_masked_lanes(x, zeros_f,
                                   @and_masks@(xmax_mask,xmin_mask));

        quadrant = _mm@vsize@_mul_ps(x, log2e);

        /* round to nearest */
        quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
        quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);

        /* Cody-Waite's range reduction algorithm */
        x = @isa@_range_reduction(x, quadrant,
                                  codyw_c1, codyw_c2, zeros_f);

        num_poly = @fmadd@(exp_p5, x, exp_p4);
        num_poly = @fmadd@(num_poly, x, exp_p3);
        num_poly = @fmadd@(num_poly, x, exp_p2);
        num_poly = @fmadd@(num_poly, x, exp_p1);
        num_poly = @fmadd@(num_poly, x, exp_p0);
        denom_poly = @fmadd@(exp_q2, x, exp_q1);
        denom_poly = @fmadd@(denom_poly, x, exp_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);

        /*
         * compute val = poly * 2^quadrant; which is the same as adding the
         * exponent of quadrant to the exponent of poly. quadrant is an int,
         * so extracting the exponent is simply extracting 8 bits.
         */
        exponent = _mm@vsize@_slli_epi32(_mm@vsize@_cvtps_epi32(quadrant), 23);
        poly = _mm@vsize@_castsi@vsize@_ps(
                    _mm@vsize@_add_epi32(
                        _mm@vsize@_castps_si@vsize@(poly), exponent));

        /* elem > xmax; return inf, elem < xmin; return 0.0f */
        poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
        poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
}
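Editor's note: a scalar illustration (not in the patch) of the "multiply by
2^k by adding to the exponent bits" trick the loop above ends with. It is
valid as long as the scaled result stays a normal float:

#include <stdint.h>
#include <string.h>

static float
scale_by_pow2(float p, int32_t k)
{
    int32_t bits;
    memcpy(&bits, &p, sizeof(bits));   /* reinterpret float as int32     */
    bits += k << 23;                   /* add k to the biased exponent   */
    memcpy(&p, &bits, sizeof(bits));
    return p;                          /* p * 2^k */
}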
/*
 * Vectorized implementation of log using AVX2 and AVX512:
 * 1) if x < 0.0f; return -NAN (invalid input)
 * 2) Range reduction: y = x/2^k;
 *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
 * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 5th order polynomials obtained from Remez's
 *         algorithm (mini-max polynomial approximation)
 * 4) Compute log(x) = log(y) + k*ln(2)
 * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
 * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
 *    x = 0x3f486945)
 */

static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
{
    const npy_int num_lanes = @BYTES@/sizeof(npy_float);

    /* Load up frequently used constants */
    @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
    @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf);
    @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf);
    @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf);
    @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf);
    @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf);
    @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf);
    @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf);
    @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf);
    @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf);
    @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf);
    @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf);
    @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f);
    @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF);
    @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF);
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
    @vtype@ poly, num_poly, denom_poly, exponent;

    @mask@ inf_nan_mask, sqrt2_mask, zero_mask, negx_mask;
    @mask@ load_mask = @isa@_get_full_load_mask();
    npy_int num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes)
            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
                                                    num_lanes);
        @vtype@ x_in = @isa@_masked_load(load_mask, ip);

        negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
        zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
        inf_nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, _mm@vsize@_set1_ps(FLT_MAX), _CMP_GT_OQ);

        @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);

        /* set x = normalized mantissa */
        exponent = @isa@_get_exponent(x);
        x = @isa@_get_mantissa(x);

        /* if x <= 1/sqrt(2) {exp = exp-1; x = 2*x} */
        sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
        x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask);
        exponent = @isa@_blend(exponent,
                               _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask);

        /* x = x - 1 */
        x = _mm@vsize@_sub_ps(x, ones_f);

        /* Polynomial approximation for log(1+x) */
        num_poly = @fmadd@(log_p5, x, log_p4);
        num_poly = @fmadd@(num_poly, x, log_p3);
        num_poly = @fmadd@(num_poly, x, log_p2);
        num_poly = @fmadd@(num_poly, x, log_p1);
        num_poly = @fmadd@(num_poly, x, log_p0);
        denom_poly = @fmadd@(log_q5, x, log_q4);
        denom_poly = @fmadd@(denom_poly, x, log_q3);
        denom_poly = @fmadd@(denom_poly, x, log_q2);
        denom_poly = @fmadd@(denom_poly, x, log_q1);
        denom_poly = @fmadd@(denom_poly, x, log_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);
        poly = @fmadd@(exponent, loge2, poly);

        /*
         * x < 0.0f; return -NAN
         * x = 0.0f; return -INF
         * x > FLT_MAX; return x
         */
        poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask);
        poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask);
        poly = @isa@_set_masked_lanes(poly, x_in, inf_nan_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
}
#endif
/**end repeat**/

 /*
 *****************************************************************************
 **                             BOOL LOOPS                                  **
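Editor's note: a scalar sketch (not in the patch) of the reduction the vector
log above performs, with frexpf standing in for the getexp/getmant intrinsics
and the NPY_COEFF_*_LOGf values inlined; assumes x > 0 and finite:

#include <math.h>

static float
logf_rational_sketch(float x)
{
    int k;
    float m = frexpf(x, &k);      /* x = m * 2^k, with 0.5 <= m < 1 */

    /* keep m near 1 so log(1+t) converges: if m <= 1/sqrt(2), double it */
    if (m <= 0.70710678f) {
        m *= 2.0f;
        k -= 1;
    }
    float t = m - 1.0f;

    /* minimax rational log(1+t) ~= P(t)/Q(t); P0 = 0 so the last step
       is a plain multiply */
    float p = 2.589979117907922693523e-02f;              /* P5 */
    p = fmaf(p, t, 3.808837741388407920751e-01f);        /* P4 */
    p = fmaf(p, t, 1.480000633576506585156e+00f);        /* P3 */
    p = fmaf(p, t, 2.112677543073053063722e+00f);        /* P2 */
    p = fmaf(p, t, 9.999999999999998702752e-01f);        /* P1 */
    p = p * t;                                           /* P0 = 0 */
    float q = 5.875095403124574342950e-03f;              /* Q5 */
    q = fmaf(q, t, 1.546476374983906719538e-01f);        /* Q4 */
    q = fmaf(q, t, 9.864942958519418960339e-01f);        /* Q3 */
    q = fmaf(q, t, 2.453006071784736363091e+00f);        /* Q2 */
    q = fmaf(q, t, 2.612677543073109236779e+00f);        /* Q1 */
    q = fmaf(q, t, 1.0f);                                /* Q0 */

    /* log(x) = log(m) + k*ln(2) */
    return p / q + (float)k * 0.69314718f;
}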
diff --git a/numpy/distutils/command/autodist.py b/numpy/distutils/command/autodist.py
index d5e78963c..03fe0c64c 100644
--- a/numpy/distutils/command/autodist.py
+++ b/numpy/distutils/command/autodist.py
@@ -78,6 +78,26 @@ main()
 """ % (attribute, name)
     return cmd.try_compile(body, None, None) != 0
 
+
+def check_gcc_function_attribute_with_intrinsics(cmd, attribute, name, code,
+                                                 include):
+    """Return True if the given function attribute is supported with
+    intrinsics."""
+    cmd._check_compiler()
+    body = """
+#include<%s>
+int %s %s(void)
+{
+    %s;
+    return 0;
+}
+
+int
+main()
+{
+    return 0;
+}
+""" % (include, attribute, name, code)
+    return cmd.try_compile(body, None, None) != 0
+
+
 def check_gcc_variable_attribute(cmd, attribute):
     """Return True if the given variable attribute is supported."""
     cmd._check_compiler()
diff --git a/numpy/distutils/command/config.py b/numpy/distutils/command/config.py
index d9b1e8488..74d6900dc 100644
--- a/numpy/distutils/command/config.py
+++ b/numpy/distutils/command/config.py
@@ -18,6 +18,7 @@ import distutils
 from numpy.distutils.exec_command import filepath_from_subprocess_output
 from numpy.distutils.mingw32ccompiler import generate_manifest
 from numpy.distutils.command.autodist import (check_gcc_function_attribute,
+                                              check_gcc_function_attribute_with_intrinsics,
                                               check_gcc_variable_attribute,
                                               check_inline,
                                               check_restrict,
@@ -424,6 +425,11 @@ int main (void)
     def check_gcc_function_attribute(self, attribute, name):
         return check_gcc_function_attribute(self, attribute, name)
 
+    def check_gcc_function_attribute_with_intrinsics(self, attribute, name,
+                                                     code, include):
+        return check_gcc_function_attribute_with_intrinsics(self, attribute,
+                                                            name, code, include)
+
     def check_gcc_variable_attribute(self, attribute):
         return check_gcc_variable_attribute(self, attribute)
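Editor's note: for the avx512f entry added to setup_common.py, the check above
compiles roughly the following probe program; if the compile succeeds,
HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS is defined for the C sources in
this patch:

#include<immintrin.h>
int __attribute__((target("avx512f"))) attribute_target_avx512f_with_intrinsics(void)
{
    __m512 temp = _mm512_set1_ps(1.0);
    return 0;
}

int
main()
{
    return 0;
}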