Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py |   8
-rw-r--r-- | numpy/core/include/numpy/npy_common.h        |   8
-rw-r--r-- | numpy/core/include/numpy/npy_math.h          |  26
-rw-r--r-- | numpy/core/setup_common.py                   |   5
-rw-r--r-- | numpy/core/src/umath/cpuid.c                 |  22
-rw-r--r-- | numpy/core/src/umath/loops.c.src             |  33
-rw-r--r-- | numpy/core/src/umath/loops.h.src             |   4
-rw-r--r-- | numpy/core/src/umath/simd.inc.src            | 315
-rw-r--r-- | numpy/core/tests/test_umath.py               |  41
9 files changed, 410 insertions, 52 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1747272..ae871ea6f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,8 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('e', f='cos', astype={'e':'f'}),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
           ),
@@ -669,6 +671,8 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('e', f='sin', astype={'e':'f'}),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
          TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),
           ),
@@ -705,7 +709,7 @@ defdict = {
           docstrings.get('numpy.core.umath.exp'),
           None,
           TD('e', f='exp', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
           ),
@@ -728,7 +732,7 @@ defdict = {
           docstrings.get('numpy.core.umath.log'),
           None,
           TD('e', f='log', astype={'e':'f'}),
-          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
+          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
           ),
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 108c0a202..27b83f7b5 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -44,10 +44,14 @@
 #else
 #define NPY_GCC_TARGET_AVX
 #endif
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
+#define HAVE_ATTRIBUTE_TARGET_FMA
+#define NPY_GCC_TARGET_FMA __attribute__((target("avx2,fma")))
+#endif
+
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2 && defined HAVE_LINK_AVX2
 #define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
-#elif defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-#define NPY_GCC_TARGET_AVX2 __attribute__((target("avx2")))
 #else
 #define NPY_GCC_TARGET_AVX2
 #endif
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2..126b861bf 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -144,7 +144,22 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
 #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
 #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
 /*
  * Integer functions.
  */
@@ -163,6 +178,15 @@ NPY_INPLACE npy_longlong npy_gcdll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lcmll(npy_longlong a, npy_longlong b);
 
 /*
+ * avx function has a common API for both sin & cos. This enum is used to
+ * distinguish between the two
+ */
+typedef enum {
+    npy_compute_sin,
+    npy_compute_cos
+} NPY_TRIG_OP;
+
+/*
 * C99 double math funcs
  */
 NPY_INPLACE double npy_sin(double x);
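A note on the three NPY_CODY_WAITE_PI_O_2_* constants added above: they are pi/2 split into high, medium, and low parts, each exactly representable in float32, so that x - y*pi/2 can later be computed in three steps without a single large rounding error. A minimal standalone check (the driver program is ours, only the hex-float constants come from the patch):

    #include <stdio.h>

    /* Hex-float constants copied from the npy_math.h hunk above */
    #define C1 -0x1.921fb0p+00
    #define C2 -0x1.5110b4p-22
    #define C3 -0x1.846988p-48

    int main(void)
    {
        /* The three parts reassemble -pi/2 to well beyond float32
         * precision: prints approximately -1.5707963267948966 */
        printf("%.16f\n", C1 + C2 + C3);
        return 0;
    }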
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 6e3109ab5..a3f7acd6d 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -178,9 +178,10 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
 # gcc 4.8.4 support attributes but not with intrisics
 # tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
 # function name will be converted to HAVE_<upper-case-name> preprocessor macro
-OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2")))',
+OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
                                                 'attribute_target_avx2_with_intrinsics',
-                                                '__m256 temp = _mm256_set1_ps(1.0)',
+                                                '__m256 temp = _mm256_set1_ps(1.0); temp = \
+                                                _mm256_fmadd_ps(temp, temp, temp)',
                                                 'immintrin.h'),
                                                 ('__attribute__((target("avx512f")))',
                                                 'attribute_target_avx512f_with_intrinsics',
diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c
index 8673f1736..72c6493e8 100644
--- a/numpy/core/src/umath/cpuid.c
+++ b/numpy/core/src/umath/cpuid.c
@@ -48,6 +48,25 @@ int os_avx512_support(void)
 #endif
 }
 
+static NPY_INLINE
+int cpu_supports_fma(void)
+{
+#ifdef __x86_64__
+    unsigned int feature = 0x01;
+    unsigned int a, b, c, d;
+    __asm__ volatile (
+        "cpuid" "\n\t"
+        : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+        : "a" (feature));
+    /*
+     * FMA is the 12th bit of ECX
+     */
+    return (c >> 12) & 1;
+#else
+    return 0;
+#endif
+}
+
 /*
  * Primitive cpu feature detect function
  * Currently only supports checking for avx on gcc compatible compilers.
@@ -63,6 +82,9 @@ npy_cpu_supports(const char * feature)
         return 0;
 #endif
     }
+    else if (strcmp(feature, "fma") == 0) {
+        return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support();
+    }
     else if (strcmp(feature, "avx2") == 0) {
         return __builtin_cpu_supports("avx2") && os_avx_support();
     }
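The inline-asm query above reads CPUID leaf 1 and tests ECX bit 12, the FMA3 feature flag. On GCC and Clang the same check can be written with the <cpuid.h> helper; a sketch of the equivalent (the function name is ours, this is not code from the patch):

    #include <cpuid.h>  /* GCC/Clang header: __get_cpuid() and the bit_FMA constant */

    /* Same test as cpu_supports_fma() above: CPUID.1:ECX.FMA[bit 12] */
    static int cpu_supports_fma_alt(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
            return 0;  /* CPUID leaf 1 not available */
        }
        return (ecx & bit_FMA) != 0;  /* bit_FMA == (1 << 12) */
    }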
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1a4885133..2028a5712 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1594,8 +1594,8 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- * #func = exp, log#
- * #scalarf = npy_expf, npy_logf#
+ * #func = sin, cos, exp, log#
+ * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1610,8 +1610,8 @@ FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSE
 /**end repeat**/
 
 /**begin repeat
- * #isa = avx512f, avx2#
- * #ISA = AVX512F, AVX2#
+ * #isa = avx512f, fma#
+ * #ISA = AVX512F, FMA#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS#
  */
@@ -1642,6 +1642,31 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 /**end repeat1**/
 
+
+/**begin repeat1
+ * #func = cos, sin#
+ * #enum = npy_compute_cos, npy_compute_sin#
+ * #scalarf = npy_cosf, npy_sinf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
+        UNARY_LOOP {
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
+#else
+            const npy_float in1 = *(npy_float *)ip1;
+            *(npy_float *)op1 = @scalarf@(in1);
+#endif
+        }
+    }
+}
+
+/**end repeat1**/
+
+
 /**end repeat**/
 
 /**begin repeat
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 7f05a693a..5070ab38b 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,13 +178,13 @@ NPY_NO_EXPORT void
 /**end repeat**/
 
 /**begin repeat
- * #func = exp, log#
+ * #func = sin, cos, exp, log#
  */
 NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
- * #isa = avx512f, avx2#
+ * #isa = avx512f, fma#
  */
 NPY_NO_EXPORT void
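For readers unfamiliar with NumPy's .src templating: the code generator expands each @token@ from the repeat lists, so the FMA/sin instance of the new loop above roughly becomes the following (a hand-expanded sketch for illustration, not literal generated output; it relies on internal macros such as UNARY_LOOP from the surrounding file):

    NPY_NO_EXPORT NPY_GCC_OPT_3 void
    FLOAT_sin_fma(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
    {
        /* Try the vector routine first; it declines (returns 0) when the
         * strides make blocking unsafe. */
        if (!run_unary_fma_sincos_FLOAT(args, dimensions, steps, npy_compute_sin)) {
            UNARY_LOOP {
    #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
                /* fall back to the vector kernel one element at a time */
                FMA_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], npy_compute_sin);
    #else
                const npy_float in1 = *(npy_float *)ip1;
                *(npy_float *)op1 = npy_sinf(in1);
    #endif
            }
        }
    }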
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9816a1da4..7aec1ff49 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -130,8 +130,9 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512f#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512f#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  * #REGISTER_SIZE = 32, 64#
  */
 
@@ -141,7 +142,7 @@ abs_ptrdiff(char *a, char *b)
  * #func = exp, log#
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE void
 @ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
 #endif
 
@@ -149,7 +150,7 @@ static NPY_INLINE void
 static NPY_INLINE int
 run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 {
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
     if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
         @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
         return 1;
@@ -162,6 +163,25 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)
 
 /**end repeat1**/
 
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
+#endif
+
+static NPY_INLINE int
+run_unary_@isa@_sincos_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps, NPY_TRIG_OP my_trig_op)
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
+        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
 /**end repeat**/
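AVX2 has no mask registers, which is why the FMA helpers further down fake one with float sign bits: a sliding window over an array of -1.0f/1.0f values yields a load mask for any tail length, and _mm256_maskload_ps honors only the high bit of each lane. A self-contained illustration of the trick (our own demo, compile with gcc -mavx2; only the maskint table mirrors the patch):

    #include <immintrin.h>
    #include <stdio.h>

    static const float maskint[16] = {-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,
                                       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};

    int main(void)
    {
        float data[3] = {10.0f, 20.0f, 30.0f};  /* only 3 valid elements remain */
        int num_elem = 3, num_lanes = 8;
        /* window starts at 8-3=5: three -1.0f lanes, then five 1.0f lanes */
        __m256 mask = _mm256_loadu_ps(maskint + num_lanes - num_elem);
        /* cvtps turns -1.0f/1.0f into -1/1; maskload reads only where the MSB is set,
         * so the out-of-bounds lanes past data[2] are never touched */
        __m256 x = _mm256_maskload_ps(data, _mm256_cvtps_epi32(mask));
        float out[8];
        _mm256_storeu_ps(out, x);
        for (int ii = 0; ii < 8; ii++) {
            printf("%g ", out[ii]);  /* prints: 10 20 30 0 0 0 0 0 */
        }
        printf("\n");
        return 0;
    }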
@@ -1123,20 +1143,14 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
 
 /* bunch of helper functions used in ISA_exp/log_FLOAT*/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_fmadd(__m256 a, __m256 b, __m256 c)
-{
-    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_full_load_mask(void)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_full_load_mask(void)
 {
     return _mm256_set1_ps(-1.0);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                          1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
@@ -1144,8 +1158,8 @@ avx2_get_partial_load_mask(const npy_int num_lanes, const npy_int total_elem)
     return _mm256_loadu_ps(addr);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_gather(__m256 src,
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_gather(__m256 src,
                    npy_float* addr,
                    __m256i vindex,
                    __m256 mask)
@@ -1153,26 +1167,39 @@ avx2_masked_gather(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_masked_load(__m256 mask, npy_float* addr)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_masked_load(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_set_masked_lanes(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_blend(__m256 x, __m256 y, __m256 ymask)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_exponent(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+{
+    return _mm256_cvtepi32_ps(
+        _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
+{
+    return fma_should_calculate_sine(k, andop, cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_exponent(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1198,8 +1225,8 @@ avx2_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_get_mantissa(__m256 x)
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
+fma_get_mantissa(__m256 x)
 {
     /*
      * Special handling of denormals:
@@ -1225,7 +1252,7 @@ avx2_get_mantissa(__m256 x)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
-avx2_scalef_ps(__m256 poly, __m256 quadrant)
+fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
      * Handle denormals (which occur when quadrant <= -125):
@@ -1305,6 +1332,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
     return _mm512_mask_mov_ps(x, ymask, y);
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
+{
+    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
+{
+    return avx512_should_calculate_sine(k, andop, cmp);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_get_exponent(__m512 x)
 {
@@ -1325,17 +1364,28 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
 #endif
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #or = or_ps, kor#
 * #vsub = , _mask#
  * #mask = __m256, __mmask16#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 **/
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+#if defined @CHK@
+
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
@@ -1344,12 +1394,56 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     reduced_x = @fmadd@(y, c3, reduced_x);
     return reduced_x;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
+{
+    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
+        x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
+    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
+        x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
+    return _mm@vsize@_@or@(m1,m2);
+}
+
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
+             @vtype@ invf2, @vtype@ invf0)
+{
+    @vtype@ cos = @fmadd@(invf8, x2, invf6);
+    cos = @fmadd@(cos, x2, invf4);
+    cos = @fmadd@(cos, x2, invf2);
+    cos = @fmadd@(cos, x2, invf0);
+    return cos;
+}
+
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
+           @vtype@ invf5, @vtype@ invf3,
+           @vtype@ zero)
+{
+    @vtype@ sin = @fmadd@(invf9, x2, invf7);
+    sin = @fmadd@(sin, x2, invf5);
+    sin = @fmadd@(sin, x2, invf3);
+    sin = @fmadd@(sin, x2, zero);
+    sin = @fmadd@(sin, x, x);
+    return sin;
+}
 #endif
 /**end repeat**/
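Putting the kernels together, the reduction and polynomial steps can be followed in scalar form. The sketch below is our own illustrative rewrite (not code from the patch, compile with -lm) using the coefficients defined in npy_math.h; like the vector version it is only accurate below the large-number cutoffs, where the patch falls back to glibc:

    #include <math.h>
    #include <stdio.h>

    /* Scalar walk-through: Cody-Waite reduction, the two minimax
     * polynomials, then quadrant-based selection and negation. */
    static float sinf_sketch(float x)
    {
        /* y = rint(x * 2/pi); the quadrant is k = (int)y mod 4 */
        float y = rintf(x * 0x1.45f306p-1f);
        int k = (int)y;

        /* x* = x - y*pi/2 in three FMA steps (NPY_CODY_WAITE_PI_O_2_*) */
        float r = fmaf(y, -0x1.921fb0p+00f, x);
        r = fmaf(y, -0x1.5110b4p-22f, r);
        r = fmaf(y, -0x1.846988p-48f, r);
        float r2 = r * r;

        /* Horner evaluation on [-pi/4, pi/4] (NPY_COEFF_INVF*_COSINEf/SINEf) */
        float cos = fmaf(0x1.98e616p-16f, r2, -0x1.6c06dcp-10f);
        cos = fmaf(cos, r2, 0x1.55553cp-05f);
        cos = fmaf(cos, r2, -0x1.000000p-01f);
        cos = fmaf(cos, r2, 0x1.000000p+00f);

        float sin = fmaf(0x1.7d3bbcp-19f, r2, -0x1.a06bbap-13f);
        sin = fmaf(sin, r2, 0x1.11119ap-07f);
        sin = fmaf(sin, r2, -0x1.555556p-03f);
        sin = fmaf(sin * r2, r, r);

        /* sin(x) = sin(r), cos(r), -sin(r), -cos(r) for k mod 4 = 0,1,2,3 */
        float result = (k & 1) ? cos : sin;
        return (k & 2) ? -result : result;
    }

    int main(void)
    {
        printf("%f vs %f\n", sinf_sketch(1.0f), sinf(1.0f));  /* both ~0.841471 */
        return 0;
    }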
 
 /**begin repeat
- * #ISA = AVX2, AVX512F#
- * #isa = avx2, avx512#
+ * #ISA = FMA, AVX512F#
+ * #isa = fma, avx512#
  * #vtype = __m256, __m512#
  * #vsize = 256, 512#
  * #BYTES = 32, 64#
@@ -1358,13 +1452,165 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #or_masks =_mm256_or_ps, _mm512_kor#
  * #and_masks =_mm256_and_ps, _mm512_kand#
  * #xor_masks =_mm256_xor_ps, _mm512_kxor#
- * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
+ * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
 * #mask_to_int = _mm256_movemask_ps, #
  * #full_mask= 0xFF, 0xFFFF#
  * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
  * #cvtps_epi32 = _mm256_cvtps_epi32, #
+ * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
+
+
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in ZMM/YMM registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x faster than scalar implementations when magnitude of all elements in
+ * the array < 71476.0625f (117435.992f for sine). Worst case performance is
+ * when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
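One step of the loop below deserves a gloss: the add/subtract pair with cvt_magic implements rint(). Adding 1.5 * 2^23 pushes the fraction bits of any |x| < 2^22 out of the float32 mantissa, so subtracting the constant back leaves x rounded to the nearest integer (ties to even). The value of NPY_RINT_CVT_MAGICf is not shown in this diff; 0x1.8p+23f below is our assumption based on the existing exp/log code:

    #include <stdio.h>

    int main(void)
    {
        /* assumed value of NPY_RINT_CVT_MAGICf: 1.5 * 2^23 = 12582912.0f */
        volatile float magic = 0x1.8p+23f;
        float x = 2.5f;
        /* add-then-subtract rounds to nearest; requires round-to-nearest
         * mode and no extended precision (true for SSE/AVX float math) */
        float rounded = (x + magic) - magic;
        printf("%g -> %g\n", x, rounded);  /* 2.5 -> 2 (ties to even) */
        return 0;
    }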
+#if defined @CHK@
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_sincos_FLOAT(npy_float * op,
+                   npy_float * ip,
+                   const npy_intp array_size,
+                   const npy_intp steps,
+                   NPY_TRIG_OP my_trig_op)
+{
+    const npy_intp stride = steps/sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_float large_number = 71476.0625f;
+    if (my_trig_op == npy_compute_sin) {
+        large_number = 117435.992f;
+    }
+
+    /* Load up frequently used constants */
+    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
+    @vtype@i ones = _mm@vsize@_set1_epi32(1);
+    @vtype@i twos = _mm@vsize@_set1_epi32(2);
+    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
+    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
+    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
+    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
+    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
+    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
+    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
+    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
+    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
+    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
+    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
+    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
+    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
+    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
+    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
+    @vtype@i iquadrant;
+    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
+    @mask@ load_mask = @isa@_get_full_load_mask();
+    npy_intp num_remaining_elements = array_size;
+    npy_int indexarr[16];
+    for (npy_int ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes) {
+            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+                                                    num_lanes);
+        }
+
+        @vtype@ x;
+        if (stride == 1) {
+            x = @isa@_masked_load(load_mask, ip);
+        }
+        else {
+            x = @isa@_masked_gather(zero_f, ip, vindex, load_mask);
+        }
+
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call glibc to compute cosine for
+         * these numbers
+         */
+
+        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+        glibc_mask = @and_masks@(load_mask, glibc_mask);
+        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
+        x = @isa@_set_masked_lanes(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
+
+        if (iglibc_mask != @full_mask@) {
+            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
+
+            /* round to nearest */
+            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+            /* Cody-Waite's range reduction algorithm */
+            reduced_x = @isa@_range_reduction(x, quadrant,
+                                              codyw_c1, codyw_c2, codyw_c3);
+            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
+
+            /* compute cosine and sine */
+            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
+                               cos_invf2, cos_invf0);
+            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
+                             sin_invf5, sin_invf3, zero_f);
+
+            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
+            if (my_trig_op == npy_compute_cos) {
+                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+            }
+
+            /* blend sin and cos based on the quadrant */
+            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
+            cos = @isa@_blend(cos, sin, sine_mask);
+
+            /* multiply by -1 for appropriate elements */
+            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
+            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+            cos = @isa@_set_masked_lanes(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
+
+            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
+        }
+
+        /* process elements using glibc for large elements */
+        if (my_trig_op == npy_compute_cos) {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_cosf(ip[ii]);
+                }
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        else {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01) {
+                    op[ii] = npy_sinf(ip[ii]);
+                }
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
 
 /*
  * Vectorized implementation of exp using AVX2 and AVX512:
@@ -1381,7 +1627,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * same x = 0xc2781e37)
  */
 
-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_exp_FLOAT(npy_float * op,
                 npy_float * ip,
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 91d403d98..ef48fed05 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -678,20 +678,49 @@ class TestSpecialFloats(object):
         assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
         assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
-class TestExpLogFloat32(object):
+    def test_sincos_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan,  np.nan,  np.nan]
+            y = [np.nan, -np.nan,  np.inf, -np.inf]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.sin(yf), xf)
+                assert_equal(np.cos(yf), xf)
+
+        with np.errstate(invalid='raise'):
+            assert_raises(FloatingPointError, np.sin, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.sin, np.float32(np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.cos, np.float32(np.inf))
+
+
+class TestSIMDFloat32(object):
     def test_exp_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=88.1,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=2.6)
+        assert_array_max_ulp(np.exp(x_f32), np.float32(np.exp(x_f64)), maxulp=3)
 
     def test_log_float32(self):
         np.random.seed(42)
         x_f32 = np.float32(np.random.uniform(low=0.0,high=1000,size=1000000))
         x_f64 = np.float64(x_f32)
-        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=3.9)
+        assert_array_max_ulp(np.log(x_f32), np.float32(np.log(x_f64)), maxulp=4)
+
+    def test_sincos_float32(self):
+        np.random.seed(42)
+        N = 1000000
+        M = np.int(N/20)
+        index = np.random.randint(low=0, high=N, size=M)
+        x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=N))
+        # test coverage for elements > 117435.992f for which glibc is used
+        x_f32[index] = np.float32(10E+10*np.random.rand(M))
+        x_f64 = np.float64(x_f32)
+        assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+        assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
 
-    def test_strided_exp_log_float32(self):
+    def test_strided_float32(self):
         np.random.seed(42)
         strides = np.random.randint(low=-100, high=100, size=100)
         sizes = np.random.randint(low=1, high=2000, size=100)
@@ -699,9 +728,13 @@ class TestExpLogFloat32(object):
             x_f32 = np.float32(np.random.uniform(low=0.01,high=88.1,size=ii))
             exp_true = np.exp(x_f32)
             log_true = np.log(x_f32)
+            sin_true = np.sin(x_f32)
+            cos_true = np.cos(x_f32)
             for jj in strides:
                 assert_array_almost_equal_nulp(np.exp(x_f32[::jj]), exp_true[::jj], nulp=2)
                 assert_array_almost_equal_nulp(np.log(x_f32[::jj]), log_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.sin(x_f32[::jj]), sin_true[::jj], nulp=2)
+                assert_array_almost_equal_nulp(np.cos(x_f32[::jj]), cos_true[::jj], nulp=2)
 
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):
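For context on the assertions above: assert_array_max_ulp bounds the error in units in the last place, i.e. how many representable float32 values lie between the result and the correctly rounded reference. The sketch below shows the usual way to count ULPs between two finite, same-sign floats (our illustration of the metric, not NumPy's implementation; compile with -lm):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* ULP distance between two finite floats of the same sign: IEEE-754
     * ordering means reinterpreting the bits as integers and subtracting. */
    static int32_t ulp_dist(float a, float b)
    {
        int32_t ia, ib;
        memcpy(&ia, &a, sizeof(ia));
        memcpy(&ib, &b, sizeof(ib));
        return ia > ib ? ia - ib : ib - ia;
    }

    int main(void)
    {
        float x = 0.84147098f;  /* ~sin(1.0f) */
        printf("%d\n", ulp_dist(x, nextafterf(x, 1.0f)));  /* prints 1 */
        return 0;
    }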