diff options
author | Sayed Adel <seiko@imavr.com> | 2020-10-18 12:03:17 +0000 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2020-12-26 16:32:04 +0000 |
commit | b16288698742e7593db61cf9a618e2d03de6b36e (patch) | |
tree | 301ed276b9eca8d18c7ffc9eae2fb0db9c9699e2 | |
parent | 098a3b417dbb3c620e423e29f194585b68882d5e (diff) | |
download | numpy-b16288698742e7593db61cf9a618e2d03de6b36e.tar.gz |
SIMD: Replace raw SIMD of sin/cos with NPYV
The new code improves the performance of non-contiguous memory access
for the output array without any reduction in performance.
For PPC64LE the performance increased by 2-3.0, and 1.5-2.0 on aarch64.
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 4 | ||||
-rw-r--r-- | numpy/core/include/numpy/npy_math.h | 9 | ||||
-rw-r--r-- | numpy/core/setup.py | 1 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 26 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.h.src | 13 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_trigonometric.dispatch.c.src | 230 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_utils.h.src | 11 | ||||
-rw-r--r-- | numpy/core/src/umath/npy_simd_data.h | 16 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 238 |
10 files changed, 259 insertions, 290 deletions
diff --git a/.gitignore b/.gitignore index b0fa037a6..5a5e464cc 100644 --- a/.gitignore +++ b/.gitignore @@ -218,3 +218,4 @@ numpy/core/src/_simd/_simd_inc.h # umath module numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c +numpy/core/src/umath/loops_trigonometric.dispatch.c diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 4e9a2cfec..6ee8031cb 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -676,7 +676,7 @@ defdict = { docstrings.get('numpy.core.umath.cos'), None, TD('e', f='cos', astype={'e':'f'}), - TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]), + TD('f', dispatch=[('loops_trigonometric', 'f')]), TD('fdg' + cmplx, f='cos'), TD(P, f='cos'), ), @@ -685,7 +685,7 @@ defdict = { docstrings.get('numpy.core.umath.sin'), None, TD('e', f='sin', astype={'e':'f'}), - TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]), + TD('f', dispatch=[('loops_trigonometric', 'f')]), TD('fdg' + cmplx, f='sin'), TD(P, f='sin'), ), diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h index 7d71c36cc..f32e298f0 100644 --- a/numpy/core/include/numpy/npy_math.h +++ b/numpy/core/include/numpy/npy_math.h @@ -151,15 +151,6 @@ NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b); NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b); /* - * avx function has a common API for both sin & cos. This enum is used to - * distinguish between the two - */ -typedef enum { - npy_compute_sin, - npy_compute_cos -} NPY_TRIG_OP; - -/* * C99 double math funcs */ NPY_INPLACE double npy_sin(double x); diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 2e020a595..1042a1c45 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -929,6 +929,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'scalarmath.c.src'), join('src', 'umath', 'ufunc_type_resolution.c'), join('src', 'umath', 'override.c'), + join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), ] umath_deps = [ diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 839d2b3ae..ba538d2ab 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1658,8 +1658,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void /**end repeat**/ /**begin repeat - * #func = sin, cos, exp, log# - * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf# + * #func = exp, log# + * #scalarf = npy_expf, npy_logf# */ NPY_NO_EXPORT NPY_GCC_OPT_3 void @@ -1749,28 +1749,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *step /**end repeat1**/ -/**begin repeat1 - * #func = cos, sin# - * #enum = npy_compute_cos, npy_compute_sin# - * #scalarf = npy_cosf, npy_sinf# - */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) { - UNARY_LOOP { -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@); -#else - const npy_float in1 = *(npy_float *)ip1; - *(npy_float *)op1 = @scalarf@(in1); -#endif - } - } -} - -/**end repeat1**/ /**end repeat**/ NPY_NO_EXPORT NPY_GCC_OPT_3 void diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index c15ff8e3b..d73c9fa7f 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -225,8 +225,19 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void NPY_NO_EXPORT void DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + /**begin repeat - * #func = sin, cos, exp, log# + * #func = exp, log# */ NPY_NO_EXPORT void FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src new file mode 100644 index 000000000..8c2c83e7c --- /dev/null +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -0,0 +1,230 @@ +/*@targets + ** $maxopt baseline + ** (avx2 fma3) avx512f + ** vsx2 + ** neon_vfpv4 + **/ +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +/* + * TODO: + * - use vectorized version of Payne-Hanek style reduction for large elements or + * when there's no native FUSED support instead of fallback to libc + */ +#if NPY_SIMD_FMA3 // native support +/* + * Vectorized Cody-Waite range reduction technique + * Performs the reduction step x* = x - y*C in three steps: + * 1) x* = x - y*c1 + * 2) x* = x - y*c2 + * 3) x* = x - y*c3 + * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision + */ +NPY_FINLINE npyv_f32 +simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3) +{ + npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x); + reduced_x = npyv_muladd_f32(y, c2, reduced_x); + reduced_x = npyv_muladd_f32(y, c3, reduced_x); + return reduced_x; +} +/* + * Approximate cosine algorithm for x \in [-PI/4, PI/4] + * Maximum ULP across all 32-bit floats = 0.875 + */ +NPY_FINLINE npyv_f32 +simd_cosine_poly_f32(npyv_f32 x2) +{ + const npyv_f32 invf8 = npyv_setall_f32(0x1.98e616p-16f); + const npyv_f32 invf6 = npyv_setall_f32(-0x1.6c06dcp-10f); + const npyv_f32 invf4 = npyv_setall_f32(0x1.55553cp-05f); + const npyv_f32 invf2 = npyv_setall_f32(-0x1.000000p-01f); + const npyv_f32 invf0 = npyv_setall_f32(0x1.000000p+00f); + + npyv_f32 r = npyv_muladd_f32(invf8, x2, invf6); + r = npyv_muladd_f32(r, x2, invf4); + r = npyv_muladd_f32(r, x2, invf2); + r = npyv_muladd_f32(r, x2, invf0); + return r; +} +/* + * Approximate sine algorithm for x \in [-PI/4, PI/4] + * Maximum ULP across all 32-bit floats = 0.647 + * Polynomial approximation based on unpublished work by T. Myklebust + */ +NPY_FINLINE npyv_f32 +simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2) +{ + const npyv_f32 invf9 = npyv_setall_f32(0x1.7d3bbcp-19f); + const npyv_f32 invf7 = npyv_setall_f32(-0x1.a06bbap-13f); + const npyv_f32 invf5 = npyv_setall_f32(0x1.11119ap-07f); + const npyv_f32 invf3 = npyv_setall_f32(-0x1.555556p-03f); + + npyv_f32 r = npyv_muladd_f32(invf9, x2, invf7); + r = npyv_muladd_f32(r, x2, invf5); + r = npyv_muladd_f32(r, x2, invf3); + r = npyv_muladd_f32(r, x2, npyv_zero_f32()); + r = npyv_muladd_f32(r, x, x); + return r; +} +/* + * Vectorized approximate sine/cosine algorithms: The following code is a + * vectorized version of the algorithm presented here: + * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 + * (1) Load data in registers and generate mask for elements that are + * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f, + * 117435.992f] for sine. + * (2) For elements within range, perform range reduction using Cody-Waite's + * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4]. + * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k = + * int(y). + * (4) For elements outside that range, Cody-Waite reduction performs poorly + * leading to catastrophic cancellation. We compute cosine by calling glibc in + * a scalar fashion. + * (5) Vectorized implementation has a max ULP of 1.49 and performs at least + * 5-7x(x86) - 2.5-3x(Power) - 1-2x(Arm) faster than scalar implementations + * when magnitude of all elements in the array < 71476.0625f (117435.992f for sine). + * Worst case performance is when all the elements are large leading to about 1-2% reduction in + * performance. + */ +typedef enum +{ + SIMD_COMPUTE_SIN, + SIMD_COMPUTE_COS +} SIMD_TRIG_OP; + +static void SIMD_MSVC_NOINLINE +simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, + npy_intp len, SIMD_TRIG_OP trig_op) +{ + // Load up frequently used constants + const npyv_f32 zerosf = npyv_zero_f32(); + const npyv_s32 ones = npyv_setall_s32(1); + const npyv_s32 twos = npyv_setall_s32(2); + const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f); + const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f); + const npyv_f32 codyw_pio2_medf = npyv_setall_f32(-0x1.5110b4p-22f); + const npyv_f32 codyw_pio2_lowf = npyv_setall_f32(-0x1.846988p-48f); + const npyv_f32 rint_cvt_magic = npyv_setall_f32(0x1.800000p+23f); + // Cody-Waite's range + float max_codi = 117435.992f; + if (trig_op == SIMD_COMPUTE_COS) { + max_codi = 71476.0625f; + } + const npyv_f32 max_cody = npyv_setall_f32(max_codi); + const int vstep = npyv_nlanes_f32; + + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_f32 x_in; + if (ssrc == 1) { + x_in = npyv_load_tillz_f32(src, len); + } else { + x_in = npyv_loadn_tillz_f32(src, ssrc, len); + } + npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); + npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); + /* + * For elements outside of this range, Cody-Waite's range reduction + * becomes inaccurate and we will call libc to compute cosine for + * these numbers + */ + if (simd_maski != 0) { + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); + + npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); + // round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22 + quadrant = npyv_add_f32(quadrant, rint_cvt_magic); + quadrant = npyv_sub_f32(quadrant, rint_cvt_magic); + + // Cody-Waite's range reduction algorithm + npyv_f32 reduced_x = simd_range_reduction_f32( + x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf + ); + npyv_f32 reduced_x2 = npyv_square_f32(reduced_x); + + // compute cosine and sine + npyv_f32 cos = simd_cosine_poly_f32(reduced_x2); + npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2); + + npyv_s32 iquadrant = npyv_round_s32_f32(quadrant); + if (trig_op == SIMD_COMPUTE_COS) { + iquadrant = npyv_add_s32(iquadrant, ones); + } + // blend sin and cos based on the quadrant + npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32()); + cos = npyv_select_f32(sine_mask, sin, cos); + + // multiply by -1 for appropriate elements + npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos); + cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos); + cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF)); + + if (sdst == 1) { + npyv_store_till_f32(dst, len, cos); + } else { + npyv_storen_till_f32(dst, sdst, len, cos); + } + } + if (simd_maski != ((1 << vstep) - 1)) { + float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32]; + npyv_storea_f32(ip_fback, x_in); + + // process elements using libc for large elements + if (trig_op == SIMD_COMPUTE_COS) { + for (unsigned i = 0; i < npyv_nlanes_f32; ++i) { + if ((simd_maski >> i) & 1) { + continue; + } + dst[sdst*i] = npy_cosf(ip_fback[i]); + } + } + else { + for (unsigned i = 0; i < npyv_nlanes_f32; ++i) { + if ((simd_maski >> i) & 1) { + continue; + } + dst[sdst*i] = npy_sinf(ip_fback[i]); + } + } + } + } + npyv_cleanup(); +} +#endif // NPY_SIMD_FMA3 + +/**begin repeat + * #func = cos, sin# + * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + const float *src = (float*)args[0]; + float *dst = (float*)args[1]; + + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); +#if NPY_SIMD_FMA3 + if (is_mem_overlap(src, steps[0], dst, steps[1], len) || + !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst) + ) { + for (; len > 0; --len, src += ssrc, dst += sdst) { + simd_sincos_f32(src, 1, dst, 1, 1, @enum@); + } + } else { + simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@); + } +#else + for (; len > 0; --len, src += ssrc, dst += sdst) { + const float src0 = *src; + *dst = npy_@func@f(src0); + } +#endif +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src index dfa790ed9..1a2a5a32b 100644 --- a/numpy/core/src/umath/loops_utils.h.src +++ b/numpy/core/src/umath/loops_utils.h.src @@ -3,6 +3,17 @@ #include "numpy/npy_common.h" // NPY_FINLINE #include "numpy/halffloat.h" // npy_half_to_float + +/** + * Old versions of MSVC causes ambiguous link errors when we deal with large SIMD kernels + * which lead to break the build, probably releated to the following bug: + * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html + */ +#if defined(_MSC_VER) && _MSC_VER < 1916 + #define SIMD_MSVC_NOINLINE __declspec(noinline) +#else + #define SIMD_MSVC_NOINLINE +#endif /* * nomemoverlap - returns false if two strided arrays have an overlapping * region in memory. ip_size/op_size = size of the arrays which can be negative diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h index 45487d0a8..be9288aff 100644 --- a/numpy/core/src/umath/npy_simd_data.h +++ b/numpy/core/src/umath/npy_simd_data.h @@ -119,22 +119,6 @@ static npy_uint64 EXP_Table_tail[32] = { #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f -/* - * Constants used in vector implementation of sinf/cosf(x) - */ -#define NPY_TWO_O_PIf 0x1.45f306p-1f -#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f -#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f -#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f -#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f -#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f -#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f -#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f -#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f -#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f -#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f -#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f -#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f /* * Lookup table of log(c_k) diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 3d4e6de87..e66763986 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -271,25 +271,6 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp c /**end repeat1**/ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_INLINE void -@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP); -#endif - -static NPY_INLINE int -run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps, NPY_TRIG_OP my_trig_op) -{ -#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) { - @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op); - return 1; - } - else - return 0; -#endif - return 0; -} - /**end repeat**/ #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS @@ -976,19 +957,6 @@ fma_invert_mask_pd(__m256i ymask) } static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp) -{ - return _mm256_cvtepi32_ps( - _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp)); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_should_negate(__m256i k, __m256i andop, __m256i cmp) -{ - return fma_should_calculate_sine(k, andop, cmp); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 fma_get_exponent(__m256 x) { /* @@ -1215,18 +1183,6 @@ avx512_invert_mask_pd(__mmask8 ymask) return _mm512_knot(ymask); } -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp) -{ - return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp); -} - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_should_negate(__m512i k, __m512i andop, __m512i cmp) -{ - return avx512_should_calculate_sine(k, andop, cmp); -} - static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 avx512_get_exponent(__m512 x) { @@ -1458,40 +1414,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@ return _mm@vsize@_@or@(m1,m2); } -/* - * Approximate cosine algorithm for x \in [-PI/4, PI/4] - * Maximum ULP across all 32-bit floats = 0.875 - */ - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4, - @vtype@ invf2, @vtype@ invf0) -{ - @vtype@ cos = @fmadd@(invf8, x2, invf6); - cos = @fmadd@(cos, x2, invf4); - cos = @fmadd@(cos, x2, invf2); - cos = @fmadd@(cos, x2, invf0); - return cos; -} - -/* - * Approximate sine algorithm for x \in [-PI/4, PI/4] - * Maximum ULP across all 32-bit floats = 0.647 - */ - -static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7, - @vtype@ invf5, @vtype@ invf3, - @vtype@ zero) -{ - @vtype@ sin = @fmadd@(invf9, x2, invf7); - sin = @fmadd@(sin, x2, invf5); - sin = @fmadd@(sin, x2, invf3); - sin = @fmadd@(sin, x2, zero); - sin = @fmadd@(sin, x, x); - return sin; -} - static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ @isa@_sqrt_ps(@vtype@ x) { @@ -2004,167 +1926,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void * #cvtps_epi32 = _mm256_cvtps_epi32, # * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# */ - -/* - * Vectorized approximate sine/cosine algorithms: The following code is a - * vectorized version of the algorithm presented here: - * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 - * (1) Load data in ZMM/YMM registers and generate mask for elements that are - * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f, - * 117435.992f] for sine. - * (2) For elements within range, perform range reduction using Cody-Waite's - * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4]. - * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k = - * int(y). - * (4) For elements outside that range, Cody-Waite reduction performs poorly - * leading to catastrophic cancellation. We compute cosine by calling glibc in - * a scalar fashion. - * (5) Vectorized implementation has a max ULP of 1.49 and performs at least - * 5-7x faster than scalar implementations when magnitude of all elements in - * the array < 71476.0625f (117435.992f for sine). Worst case performance is - * when all the elements are large leading to about 1-2% reduction in - * performance. - */ - #if defined @CHK@ -static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void -@ISA@_sincos_FLOAT(npy_float * op, - npy_float * ip, - const npy_intp array_size, - const npy_intp steps, - NPY_TRIG_OP my_trig_op) -{ - const npy_intp stride = steps/(npy_intp)sizeof(npy_float); - const npy_int num_lanes = @NUM_LANES@; - npy_float large_number = 71476.0625f; - if (my_trig_op == npy_compute_sin) { - large_number = 117435.992f; - } - - /* Load up frequently used constants */ - @vtype@i zeros = _mm@vsize@_set1_epi32(0); - @vtype@i ones = _mm@vsize@_set1_epi32(1); - @vtype@i twos = _mm@vsize@_set1_epi32(2); - @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf); - @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf); - @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf); - @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf); - @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf); - @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf); - @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf); - @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf); - @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf); - @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf); - @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf); - @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf); - @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf); - @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf); - @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f); - @vtype@ quadrant, reduced_x, reduced_x2, cos, sin; - @vtype@i iquadrant; - @mask@ nan_mask, glibc_mask, sine_mask, negate_mask; - @mask@ load_mask = @isa@_get_full_load_mask_ps(); - npy_intp num_remaining_elements = array_size; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - npy_int32 indexarr[16]; - for (npy_int32 ii = 0; ii < 16; ii++) { - indexarr[ii] = ii*stride; - } - @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]); - - while (num_remaining_elements > 0) { - - if (num_remaining_elements < num_lanes) { - load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements, - num_lanes); - } - - @vtype@ x_in; - if (stride == 1) { - x_in = @isa@_masked_load_ps(load_mask, ip); - } - else { - x_in = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask); - } - - /* - * For elements outside of this range, Cody-Waite's range reduction - * becomes inaccurate and we will call glibc to compute cosine for - * these numbers - */ - - glibc_mask = @isa@_in_range_mask(x_in, large_number,-large_number); - glibc_mask = @and_masks@(load_mask, glibc_mask); - nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ); - @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zero_f, @or_masks@(nan_mask, glibc_mask)); - npy_int iglibc_mask = @mask_to_int@(glibc_mask); - - if (iglibc_mask != @full_mask@) { - quadrant = _mm@vsize@_mul_ps(x, two_over_pi); - - /* round to nearest */ - quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic); - quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic); - - /* Cody-Waite's range reduction algorithm */ - reduced_x = @isa@_range_reduction(x, quadrant, - codyw_c1, codyw_c2, codyw_c3); - reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x); - - /* compute cosine and sine */ - cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4, - cos_invf2, cos_invf0); - sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7, - sin_invf5, sin_invf3, zero_f); - - iquadrant = _mm@vsize@_cvtps_epi32(quadrant); - if (my_trig_op == npy_compute_cos) { - iquadrant = _mm@vsize@_add_epi32(iquadrant, ones); - } - - /* blend sin and cos based on the quadrant */ - sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros); - cos = @isa@_blend(cos, sin, sine_mask); - - /* multiply by -1 for appropriate elements */ - negate_mask = @isa@_should_negate(iquadrant, twos, twos); - cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask); - cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask); - - @masked_store@(op, @cvtps_epi32@(load_mask), cos); - } - - /* process elements using glibc for large elements */ - if (iglibc_mask != 0) { - float NPY_DECL_ALIGNED(@BYTES@) ip_fback[@NUM_LANES@]; - _mm@vsize@_store_ps(ip_fback, x_in); - - if (my_trig_op == npy_compute_cos) { - for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) { - if (iglibc_mask & 0x01) { - op[ii] = npy_cosf(ip_fback[ii]); - } - } - } - else { - for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) { - if (iglibc_mask & 0x01) { - op[ii] = npy_sinf(ip_fback[ii]); - } - } - } - } - ip += num_lanes*stride; - op += num_lanes; - num_remaining_elements -= num_lanes; - } -} - /* * Vectorized implementation of exp using AVX2 and AVX512: * 1) if x >= xmax; return INF (overflow) |