author     Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>  2019-03-20 15:12:50 -0700
committer  Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>  2019-08-03 10:48:43 -0700
commit     bd2c82bf141852b4737da297c081e5f621604317 (patch)
tree       db9b0bf70cabe5fb3199f5fa3b9693de841c0b8c /numpy
parent     71c8a1030d5a32342edc2e2311cb71dc38a7374e (diff)
download   numpy-bd2c82bf141852b4737da297c081e5f621604317.tar.gz
ENH: Use AVX for float32 implementation of np.sin & np.cos
This commit implements vectorized single precision sine and cosine using
AVX2 and AVX512. Both sine and cosine are computed with a polynomial
approximation that is accurate for values in [-PI/4, PI/4]. The original
input is reduced to this range using a 3-step Cody-Waite range reduction.
The reduction is only accurate for values in [-71476.0625f, 71476.0625f]
for cosine and [-117435.992f, 117435.992f] for sine. The algorithm
identifies elements outside this range and calls glibc in a scalar loop
to compute their output. The algorithm is a vectorized version of the
method presented here:
https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
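A scalar Python sketch of the scheme may help follow the patch below. It is
illustrative only, not the committed AVX code: the hex constants are exactly
the ones this patch adds to npy_math.h (stored there negated for use with
FMA), while the helper names and the use of np.rint in place of the
magic-number rounding trick are assumptions made for the sketch.

    import numpy as np

    TWO_O_PI   = np.float32(float.fromhex('0x1.45f306p-1'))
    # pi/2 split into three pieces for the 3-step Cody-Waite reduction
    PI_O_2_HI  = np.float32(float.fromhex('0x1.921fb0p+00'))
    PI_O_2_MED = np.float32(float.fromhex('0x1.5110b4p-22'))
    PI_O_2_LOW = np.float32(float.fromhex('0x1.846988p-48'))
    # polynomial coefficients from npy_math.h, highest order first
    COS_COEFFS = [float.fromhex(c) for c in
                  ('0x1.98e616p-16', '-0x1.6c06dcp-10', '0x1.55553cp-05',
                   '-0x1.000000p-01', '0x1.000000p+00')]
    SIN_COEFFS = [float.fromhex(c) for c in
                  ('0x1.7d3bbcp-19', '-0x1.a06bbap-13', '0x1.11119ap-07',
                   '-0x1.555556p-03')]

    def _horner(coeffs, r2):
        acc = np.float32(coeffs[0])
        for c in coeffs[1:]:
            acc = np.float32(acc * r2 + np.float32(c))
        return acc

    def sincos_float32(x, compute_cos=True):
        x = np.float32(x)
        limit = 71476.0625 if compute_cos else 117435.992
        if abs(x) > limit:                      # outside the safe reduction range
            return np.cos(x) if compute_cos else np.sin(x)  # scalar fallback
        k = int(np.rint(np.float32(x * TWO_O_PI)))          # quadrant y = rint(x*2/PI)
        r = x
        for part in (PI_O_2_HI, PI_O_2_MED, PI_O_2_LOW):    # 3-step reduction
            r = np.float32(r - np.float32(k) * part)
        r2 = np.float32(r * r)
        if compute_cos:
            k += 1                              # cos(x) = sin(x + PI/2)
        if k & 1:                               # odd quadrant -> cosine polynomial
            val = _horner(COS_COEFFS, r2)
        else:                                   # even quadrant -> sine polynomial
            val = np.float32(r + r * r2 * _horner(SIN_COEFFS, r2))
        return -val if (k & 2) else val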
Accuracy: maximum ULP error = 1.49
Performance: The speed-up this implementation provides depends on the values
in the input array. It performs best when all input values lie within the
range given above; details of the performance boost are provided below. The
worst case is when all array elements fall outside that range, which causes
roughly a 1-2% reduction in performance.
Three benchmarks are provided, each measured with Python's timeit package.
Each function is executed 1000 times per measurement, and the measurement is
repeated 100 times. The standard deviation of the runs was less than 2% of
the mean and is therefore not included in the data.
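A rough reproduction of that setup (a hypothetical script, not part of the
commit; the input range and dtype are assumptions, and the commit message
itself used the %timeit magic) could look like:

    import timeit
    import numpy as np

    myarr = np.random.uniform(-1000, 1000, size=10000).astype(np.float32)  # assumed input
    for func in (np.cos, np.sin):
        # 1000 executions per measurement, repeated 100 times
        runs = np.array(timeit.repeat(lambda: func(myarr), number=1000, repeat=100))
        print(func.__name__, runs.mean(), runs.std())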
(1) Micro-benchmarking:
Array size = 10000, Command = "%timeit np.cos([myarr])"
|---------------+------------+--------+---------+----------+----------|
| Function Name | NumPy 1.16 | AVX2 | AVX512 | AVX2 | AVX512 |
| | | | | speed up | speed up |
|---------------+------------+--------+---------+----------+----------|
| np.cos | 1.5174 | 0.1553 | 0.06634 | 9.77 | 22.87 |
| np.sin | 1.4738 | 0.1517 | 0.06406 | 9.71 | 23.00 |
|---------------+------------+--------+---------+----------+----------|
(2) Package ai.cs provides an API to transform spherical coordinates to the
Cartesian system (a plain NumPy equivalent is sketched after the table below):
Array size = 10000, Command = "%timeit ai.cs.sp2cart(r,theta,phi)"
|---------------+------------+--------+--------+----------+----------|
| Function Name | NumPy 1.16 | AVX2 | AVX512 | AVX2 | AVX512 |
| | | | | speed up | speed up |
|---------------+------------+--------+--------+----------+----------|
| ai.cs.sp2cart | 0.6371 | 0.1066 | 0.0605 | 5.97 | 10.53 |
|---------------+------------+--------+--------+----------+----------|
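For context, sp2cart is dominated by sine/cosine evaluations. A minimal NumPy
equivalent of a spherical-to-Cartesian transform (illustrative only; ai.cs may
use a different angle convention) is:

    import numpy as np

    def sp2cart_sketch(r, theta, phi):
        # theta = polar angle, phi = azimuth; every element needs a sin and a cos
        x = r * np.sin(theta) * np.cos(phi)
        y = r * np.sin(theta) * np.sin(phi)
        z = r * np.cos(theta)
        return x, y, z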
(3) Package photutils provides an API to find the best fit of first and
second harmonic functions to a set of (angle, intensity) pairs (a NumPy
sketch of such a fit follows the table below):
Array size = 1000, Command = "%timeit fit_first_and_second_harmonics(E, data)"
|--------------------------------+------------+--------+--------+----------+----------|
| Function Name | NumPy 1.16 | AVX2 | AVX512 | AVX2 | AVX512 |
| | | | | speed up | speed up |
|--------------------------------+------------+--------+--------+----------+----------|
| fit_first_and_second_harmonics | 1.598 | 0.8709 | 0.7761 | 1.83 | 2.05 |
|--------------------------------+------------+--------+--------+----------+----------|
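Such a fit boils down to a least-squares solve against sin(E), cos(E),
sin(2E), cos(2E) columns, which is why it spends much of its time in
np.sin/np.cos. A plain NumPy sketch of the fit (illustrative only; photutils'
own routine may differ in details):

    import numpy as np

    def fit_harmonics_sketch(E, data):
        # design matrix: y ~ y0 + a1*sin(E) + b1*cos(E) + a2*sin(2E) + b2*cos(2E)
        A = np.column_stack([np.ones_like(E),
                             np.sin(E), np.cos(E),
                             np.sin(2.0 * E), np.cos(2.0 * E)])
        coeffs, *_ = np.linalg.lstsq(A, data, rcond=None)
        return coeffs   # y0, a1, b1, a2, b2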
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py  |   2
-rw-r--r--  numpy/core/include/numpy/npy_math.h           |  17
-rw-r--r--  numpy/core/src/umath/loops.c.src              |  34
-rw-r--r--  numpy/core/src/umath/loops.h.src              |   2
-rw-r--r--  numpy/core/src/umath/simd.inc.src             | 216
5 files changed, 266 insertions, 5 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index bf1747272..896e03278 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -662,6 +662,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='cos', astype={'e':'f'}),
           TD(P, f='cos'),
           ),
@@ -669,6 +670,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
+          TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='sin', astype={'e':'f'}),
           TD(P, f='sin'),
           ),
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 6a78ff3c2..7831dd3d7 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -144,7 +144,22 @@ NPY_INLINE static float __npy_nzerof(void)
 #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
 #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
 #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-
+/*
+ * Constants used in vector implementation of sinf/cosf(x)
+ */
+#define NPY_TWO_O_PIf 0x1.45f306p-1f
+#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
+#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
+#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
+#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
+#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
+#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
+#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
+#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
+#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
+#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
+#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
+#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
 /*
  * Integer functions.
  */
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 1a4885133..bc7e075cb 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1594,8 +1594,8 @@ NPY_NO_EXPORT void
 /**end repeat**/

 /**begin repeat
- * #func = exp, log#
- * #scalarf = npy_expf, npy_logf#
+ * #func = sin, cos, exp, log#
+ * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
  */

 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1642,6 +1642,36 @@ FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY
 }
 /**end repeat1**/

+
+/**begin repeat1
+ * #func = cos, sin#
+ * #scalarf = npy_cosf, npy_sinf#
+ */
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+FLOAT_@func@_@isa@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data))
+{
+#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
+    char str[] = "@func@";
+    @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], str);
+#else
+    /*
+     * This is the path it would take if ISA was runtime detected, but not
+     * compiled for. It fixes the error on clang6.0 which fails to compile
+     * AVX512F version. Not sure if I like this idea, if during runtime it
+     * detects AVX512F, it will end up running the scalar version instead
+     * of AVX2.
+     */
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = @scalarf@(in1);
+    }
+#endif
+}
+
+/**end repeat1**/
+
+
 /**end repeat**/

 /**begin repeat
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 7f05a693a..accd0eab6 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -178,7 +178,7 @@ NPY_NO_EXPORT void
 /**end repeat**/

 /**begin repeat
- * #func = exp, log#
+ * #func = sin, cos, exp, log#
  */
 NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9816a1da4..58ec48447 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -162,6 +162,11 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp *dimensions, npy_intp *steps)

 /**end repeat1**/

+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_int n, char*);
+#endif
+
 /**end repeat**/


@@ -1172,6 +1177,19 @@ avx2_blend(__m256 x, __m256 y, __m256 ymask)
 }

 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
+{
+    return _mm256_cvtepi32_ps(
+            _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
+avx2_should_negate(__m256i k, __m256i andop, __m256i cmp)
+{
+    return avx2_should_calculate_sine(k, andop, cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
 avx2_get_exponent(__m256 x)
 {
     /*
@@ -1305,6 +1323,18 @@ avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
     return _mm512_mask_mov_ps(x, ymask, y);
 }

+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
+{
+    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
+{
+    return avx512_should_calculate_sine(k, andop, cmp);
+}
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_get_exponent(__m512 x)
 {
@@ -1336,6 +1366,16 @@ avx512_scalef_ps(__m512 poly, __m512 quadrant)
  **/

 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
@@ -1344,6 +1384,50 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
     reduced_x = @fmadd@(y, c3, reduced_x);
     return reduced_x;
 }
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
+@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
+{
+    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
+            x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
+    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
+            x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
+    return _mm@vsize@_@or@(m1,m2);
+}
+
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
+             @vtype@ invf2, @vtype@ invf0)
+{
+    @vtype@ cos = @fmadd@(invf8, x2, invf6);
+    cos = @fmadd@(cos, x2, invf4);
+    cos = @fmadd@(cos, x2, invf2);
+    cos = @fmadd@(cos, x2, invf0);
+    return cos;
+}
+
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ */
+
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
+@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
+           @vtype@ invf5, @vtype@ invf3,
+           @vtype@ zero)
+{
+    @vtype@ sin = @fmadd@(invf9, x2, invf7);
+    sin = @fmadd@(sin, x2, invf5);
+    sin = @fmadd@(sin, x2, invf3);
+    sin = @fmadd@(sin, x2, zero);
+    sin = @fmadd@(sin, x, x);
+    return sin;
+}
 #endif

 /**end repeat**/
@@ -1367,6 +1451,137 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@


 /*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in ZMM/YMM registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x faster than scalar implementations when magnitude of all elements in
+ * the array < 71476.0625f (117435.992f for sine). Worst case performance is
+ * when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
+@ISA@_sincos_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size,
+                   char* operation)
+{
+    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    npy_int compute_cos = 1;
+    npy_float large_number = 71476.0625f;
+    if (*operation == 's') {
+        compute_cos = 0;
+        large_number = 117435.992f;
+    }
+
+    /* Load up frequently used constants */
+    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
+    @vtype@i ones = _mm@vsize@_set1_epi32(1);
+    @vtype@i twos = _mm@vsize@_set1_epi32(2);
+    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
+    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
+    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
+    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
+    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
+    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
+    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
+    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
+    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
+    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
+    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
+    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
+    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
+    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
+    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
+    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
+    @vtype@i iquadrant;
+    @mask@ glibc_mask, sine_mask, negate_mask;
+    @mask@ load_mask = @isa@_get_full_load_mask();
+    npy_int num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes)
+            load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
+                                                    num_lanes);
+        @vtype@ x = @isa@_masked_load(load_mask, ip);
+
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call glibc to compute cosine for
+         * these numbers
+         */
+
+        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+        glibc_mask = @and_masks@(load_mask, glibc_mask);
+        x = @isa@_set_masked_lanes(x, zero_f, glibc_mask);
+        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
+
+        if (iglibc_mask != @full_mask@) {
+            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
+
+            /* round to nearest */
+            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
+            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
+
+            /* Cody-Waite's range reduction algorithm */
+            reduced_x = @isa@_range_reduction(x, quadrant,
+                                              codyw_c1, codyw_c2, codyw_c3);
+            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
+
+            /* compute cosine and sine */
+            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
+                               cos_invf2, cos_invf0);
+            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
+                             sin_invf5, sin_invf3, zero_f);
+
+            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
+            if (compute_cos)
+                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
+
+            /* blend sin and cos based on the quadrant */
+            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
+            cos = @isa@_blend(cos, sin, sine_mask);
+
+            /* multiply by -1 for appropriate elements */
+            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
+            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
+
+            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
+        }
+
+        /* process elements using glibc for large elements */
+        if (compute_cos) {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01)
+                    op[ii] = npy_cosf(ip[ii]);
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        else {
+            for (int ii = 0; iglibc_mask != 0; ii++) {
+                if (iglibc_mask & 0x01)
+                    op[ii] = npy_sinf(ip[ii]);
+                iglibc_mask = iglibc_mask >> 1;
+            }
+        }
+        ip += num_lanes;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+}
+
+/*
  * Vectorized implementation of exp using AVX2 and AVX512:
  * 1) if x >= xmax; return INF (overflow)
  * 2) if x <= xmin; return 0.0f (underflow)
@@ -1381,7 +1596,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * same x = 0xc2781e37)
  */

-#if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 @ISA@_exp_FLOAT(npy_float * op, npy_float * ip,