author    Sayed Adel <seiko@imavr.com>    2020-10-18 12:03:17 +0000
committer Sayed Adel <seiko@imavr.com>    2020-12-26 16:32:04 +0000
commit    b16288698742e7593db61cf9a618e2d03de6b36e (patch)
tree      301ed276b9eca8d18c7ffc9eae2fb0db9c9699e2
parent    098a3b417dbb3c620e423e29f194585b68882d5e (diff)
download  numpy-b16288698742e7593db61cf9a618e2d03de6b36e.tar.gz
SIMD: Replace raw SIMD of sin/cos with NPYV
The new code improves the performance of non-contiguous memory access for the output array without any reduction in performance elsewhere. On PPC64LE performance increased by 2.0-3.0x, and on aarch64 by 1.5-2.0x.
-rw-r--r--  .gitignore                                                  1
-rw-r--r--  numpy/core/code_generators/generate_umath.py                4
-rw-r--r--  numpy/core/include/numpy/npy_math.h                         9
-rw-r--r--  numpy/core/setup.py                                         1
-rw-r--r--  numpy/core/src/umath/loops.c.src                           26
-rw-r--r--  numpy/core/src/umath/loops.h.src                           13
-rw-r--r--  numpy/core/src/umath/loops_trigonometric.dispatch.c.src   230
-rw-r--r--  numpy/core/src/umath/loops_utils.h.src                     11
-rw-r--r--  numpy/core/src/umath/npy_simd_data.h                       16
-rw-r--r--  numpy/core/src/umath/simd.inc.src                         238
10 files changed, 259 insertions(+), 290 deletions(-)
diff --git a/.gitignore b/.gitignore
index b0fa037a6..5a5e464cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -218,3 +218,4 @@ numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
+numpy/core/src/umath/loops_trigonometric.dispatch.c
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 4e9a2cfec..6ee8031cb 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -676,7 +676,7 @@ defdict = {
docstrings.get('numpy.core.umath.cos'),
None,
TD('e', f='cos', astype={'e':'f'}),
- TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('f', dispatch=[('loops_trigonometric', 'f')]),
TD('fdg' + cmplx, f='cos'),
TD(P, f='cos'),
),
@@ -685,7 +685,7 @@ defdict = {
docstrings.get('numpy.core.umath.sin'),
None,
TD('e', f='sin', astype={'e':'f'}),
- TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('f', dispatch=[('loops_trigonometric', 'f')]),
TD('fdg' + cmplx, f='sin'),
TD(P, f='sin'),
),
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 7d71c36cc..f32e298f0 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -151,15 +151,6 @@ NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b);
NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b);
/*
- * avx function has a common API for both sin & cos. This enum is used to
- * distinguish between the two
- */
-typedef enum {
- npy_compute_sin,
- npy_compute_cos
-} NPY_TRIG_OP;
-
-/*
* C99 double math funcs
*/
NPY_INPLACE double npy_sin(double x);
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2e020a595..1042a1c45 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -929,6 +929,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'scalarmath.c.src'),
join('src', 'umath', 'ufunc_type_resolution.c'),
join('src', 'umath', 'override.c'),
+ join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
]
umath_deps = [
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 839d2b3ae..ba538d2ab 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1658,8 +1658,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
/**end repeat**/
/**begin repeat
- * #func = sin, cos, exp, log#
- * #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
+ * #func = exp, log#
+ * #scalarf = npy_expf, npy_logf#
*/
NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1749,28 +1749,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *step
/**end repeat1**/
-/**begin repeat1
- * #func = cos, sin#
- * #enum = npy_compute_cos, npy_compute_sin#
- * #scalarf = npy_cosf, npy_sinf#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
- UNARY_LOOP {
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
-#else
- const npy_float in1 = *(npy_float *)ip1;
- *(npy_float *)op1 = @scalarf@(in1);
-#endif
- }
- }
-}
-
-/**end repeat1**/
/**end repeat**/
NPY_NO_EXPORT NPY_GCC_OPT_3 void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index c15ff8e3b..d73c9fa7f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -225,8 +225,19 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
NPY_NO_EXPORT void
DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_trigonometric.dispatch.h"
+#endif
+/**begin repeat
+ * #func = sin, cos#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
+ char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+/**end repeat**/
+
/**begin repeat
- * #func = sin, cos, exp, log#
+ * #func = exp, log#
*/
NPY_NO_EXPORT void
FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
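For orientation, the NPY_CPU_DISPATCH_DECLARE block above emits one prototype per enabled target by suffixing the symbol name, which is what lets the runtime dispatcher later pick the best variant. A rough sketch of the expansion for FLOAT_sin follows; the exact suffix spelling is generated and the multi-feature (avx2 fma3) target gets a combined suffix, so the names here are assumptions for illustration only:

/* Illustrative expansion sketch; the real macro machinery lives in
 * numpy/core/src/common/npy_cpu_dispatch.h and the generated
 * loops_trigonometric.dispatch.h. */
NPY_NO_EXPORT void FLOAT_sin(char **args, npy_intp const *dimensions,
                             npy_intp const *steps, void *func);         /* baseline */
NPY_NO_EXPORT void FLOAT_sin_AVX512F(char **args, npy_intp const *dimensions,
                                     npy_intp const *steps, void *func); /* avx512f target */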
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
new file mode 100644
index 000000000..8c2c83e7c
--- /dev/null
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -0,0 +1,230 @@
+/*@targets
+ ** $maxopt baseline
+ ** (avx2 fma3) avx512f
+ ** vsx2
+ ** neon_vfpv4
+ **/
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/*
+ * TODO:
+ * - use a vectorized version of Payne-Hanek style reduction for large elements,
+ *   or when there's no native fused multiply-add support, instead of falling back to libc
+ */
+#if NPY_SIMD_FMA3 // native support
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exactly representable in single precision; c3 = C - c1 - c2 simulates higher precision
+ */
+NPY_FINLINE npyv_f32
+simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+{
+ npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
+ reduced_x = npyv_muladd_f32(y, c2, reduced_x);
+ reduced_x = npyv_muladd_f32(y, c3, reduced_x);
+ return reduced_x;
+}
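In exact arithmetic the three FMA steps above telescope into a single subtraction; splitting C so that the high part cancels against x without rounding is what preserves the low bits of the reduced argument:

$$x^{*} = ((x - y\,c_1) - y\,c_2) - y\,c_3 = x - y\,(c_1 + c_2 + c_3) \approx x - y\,C, \qquad C = \tfrac{\pi}{2}$$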
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+NPY_FINLINE npyv_f32
+simd_cosine_poly_f32(npyv_f32 x2)
+{
+ const npyv_f32 invf8 = npyv_setall_f32(0x1.98e616p-16f);
+ const npyv_f32 invf6 = npyv_setall_f32(-0x1.6c06dcp-10f);
+ const npyv_f32 invf4 = npyv_setall_f32(0x1.55553cp-05f);
+ const npyv_f32 invf2 = npyv_setall_f32(-0x1.000000p-01f);
+ const npyv_f32 invf0 = npyv_setall_f32(0x1.000000p+00f);
+
+ npyv_f32 r = npyv_muladd_f32(invf8, x2, invf6);
+ r = npyv_muladd_f32(r, x2, invf4);
+ r = npyv_muladd_f32(r, x2, invf2);
+ r = npyv_muladd_f32(r, x2, invf0);
+ return r;
+}
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ * Polynomial approximation based on unpublished work by T. Myklebust
+ */
+NPY_FINLINE npyv_f32
+simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2)
+{
+ const npyv_f32 invf9 = npyv_setall_f32(0x1.7d3bbcp-19f);
+ const npyv_f32 invf7 = npyv_setall_f32(-0x1.a06bbap-13f);
+ const npyv_f32 invf5 = npyv_setall_f32(0x1.11119ap-07f);
+ const npyv_f32 invf3 = npyv_setall_f32(-0x1.555556p-03f);
+
+ npyv_f32 r = npyv_muladd_f32(invf9, x2, invf7);
+ r = npyv_muladd_f32(r, x2, invf5);
+ r = npyv_muladd_f32(r, x2, invf3);
+ r = npyv_muladd_f32(r, x2, npyv_zero_f32());
+ r = npyv_muladd_f32(r, x, x);
+ return r;
+}
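For orientation, both kernels above are FMA Horner evaluations of minimax fits whose coefficients lie close to the Taylor ones, e.g. invf4 = 0x1.55553cp-05f ≈ 1/24 and invf3 = -0x1.555556p-03f ≈ -1/6; the low-order bits are perturbed by the minimax fit to reach the quoted ULP bounds:

$$\cos x \approx 1 - \frac{x^2}{2!} + \frac{x^4}{4!} - \frac{x^6}{6!} + \frac{x^8}{8!}, \qquad \sin x \approx x - \frac{x^3}{3!} + \frac{x^5}{5!} - \frac{x^7}{7!} + \frac{x^9}{9!}$$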
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute sine/cosine by calling libc
+ * in a scalar fashion.
+ * (5) The vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x (x86), 2.5-3x (Power), and 1-2x (Arm) faster than scalar implementations
+ * when the magnitude of all elements in the array is < 71476.0625f (117435.992f
+ * for sine). The worst case is when all the elements are large, leading to about
+ * a 1-2% reduction in performance.
+ */
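A minimal scalar rendering of these five steps may help when reading the vectorized kernel below. This is illustrative only and not part of the patch: sketch_sinf is a hypothetical name, rintf() stands in for the magic-number rounding trick used below, and the out-of-range fallback simply calls libc per element.

#include <math.h>

static float sketch_sinf(float x)
{
    /* (1)/(4) out-of-range or NaN inputs go to libc */
    if (!(fabsf(x) <= 117435.992f)) {
        return sinf(x);
    }
    /* (2) quadrant index y = rint(x * 2/pi) */
    float y = rintf(x * 0x1.45f306p-1f);
    /* three-step Cody-Waite reduction: r = x - y*pi/2, r in [-pi/4, pi/4] */
    float r = fmaf(y, -0x1.921fb0p+00f, x);
    r = fmaf(y, -0x1.5110b4p-22f, r);
    r = fmaf(y, -0x1.846988p-48f, r);
    float r2 = r * r;
    /* same minimax polynomials as simd_cosine_poly_f32/simd_sine_poly_f32 */
    float c = fmaf(0x1.98e616p-16f, r2, -0x1.6c06dcp-10f);
    c = fmaf(c, r2, 0x1.55553cp-05f);
    c = fmaf(c, r2, -0x1.000000p-01f);
    c = fmaf(c, r2, 0x1.000000p+00f);
    float s = fmaf(0x1.7d3bbcp-19f, r2, -0x1.a06bbap-13f);
    s = fmaf(s, r2, 0x1.11119ap-07f);
    s = fmaf(s, r2, -0x1.555556p-03f);
    s = fmaf(s, r2, 0.0f);
    s = fmaf(s, r, r);
    /* (3) odd quadrants take the cosine polynomial; quadrants 2,3 flip the sign */
    int k = (int)y;
    float res = (k & 1) ? c : s;
    return (k & 2) ? -res : res;
}

The (k & 1) test picks the polynomial and (k & 2) the sign, mirroring the sine_mask/negate_mask blending in the vectorized kernel below.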
+typedef enum
+{
+ SIMD_COMPUTE_SIN,
+ SIMD_COMPUTE_COS
+} SIMD_TRIG_OP;
+
+static void SIMD_MSVC_NOINLINE
+simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
+ npy_intp len, SIMD_TRIG_OP trig_op)
+{
+ // Load up frequently used constants
+ const npyv_f32 zerosf = npyv_zero_f32();
+ const npyv_s32 ones = npyv_setall_s32(1);
+ const npyv_s32 twos = npyv_setall_s32(2);
+ const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f);
+ const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f);
+ const npyv_f32 codyw_pio2_medf = npyv_setall_f32(-0x1.5110b4p-22f);
+ const npyv_f32 codyw_pio2_lowf = npyv_setall_f32(-0x1.846988p-48f);
+ const npyv_f32 rint_cvt_magic = npyv_setall_f32(0x1.800000p+23f);
+ // Cody-Waite's range
+ float max_codi = 117435.992f;
+ if (trig_op == SIMD_COMPUTE_COS) {
+ max_codi = 71476.0625f;
+ }
+ const npyv_f32 max_cody = npyv_setall_f32(max_codi);
+ const int vstep = npyv_nlanes_f32;
+
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_f32 x_in;
+ if (ssrc == 1) {
+ x_in = npyv_load_tillz_f32(src, len);
+ } else {
+ x_in = npyv_loadn_tillz_f32(src, ssrc, len);
+ }
+ npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
+ npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
+ /*
+ * For elements outside of this range, Cody-Waite's range reduction
+ * becomes inaccurate and we will call libc to compute sine/cosine for
+ * these numbers
+ */
+ if (simd_maski != 0) {
+ npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+ npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
+
+ npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
+ // round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22
+ quadrant = npyv_add_f32(quadrant, rint_cvt_magic);
+ quadrant = npyv_sub_f32(quadrant, rint_cvt_magic);
+
+ // Cody-Waite's range reduction algorithm
+ npyv_f32 reduced_x = simd_range_reduction_f32(
+ x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf
+ );
+ npyv_f32 reduced_x2 = npyv_square_f32(reduced_x);
+
+ // compute cosine and sine
+ npyv_f32 cos = simd_cosine_poly_f32(reduced_x2);
+ npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2);
+
+ npyv_s32 iquadrant = npyv_round_s32_f32(quadrant);
+ if (trig_op == SIMD_COMPUTE_COS) {
+ iquadrant = npyv_add_s32(iquadrant, ones);
+ }
+ // blend sin and cos based on the quadrant
+ npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32());
+ cos = npyv_select_f32(sine_mask, sin, cos);
+
+ // multiply by -1 for appropriate elements
+ npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos);
+ cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos);
+ cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF));
+
+ if (sdst == 1) {
+ npyv_store_till_f32(dst, len, cos);
+ } else {
+ npyv_storen_till_f32(dst, sdst, len, cos);
+ }
+ }
+ if (simd_maski != ((1 << vstep) - 1)) {
+ float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
+ npyv_storea_f32(ip_fback, x_in);
+
+ // process elements using libc for large elements
+ if (trig_op == SIMD_COMPUTE_COS) {
+ for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+ if ((simd_maski >> i) & 1) {
+ continue;
+ }
+ dst[sdst*i] = npy_cosf(ip_fback[i]);
+ }
+ }
+ else {
+ for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+ if ((simd_maski >> i) & 1) {
+ continue;
+ }
+ dst[sdst*i] = npy_sinf(ip_fback[i]);
+ }
+ }
+ }
+ }
+ npyv_cleanup();
+}
+#endif // NPY_SIMD_FMA3
+
+/**begin repeat
+ * #func = cos, sin#
+ * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ const float *src = (float*)args[0];
+ float *dst = (float*)args[1];
+
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ npy_intp len = dimensions[0];
+ assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+#if NPY_SIMD_FMA3
+ if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
+ !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+ ) {
+ for (; len > 0; --len, src += ssrc, dst += sdst) {
+ simd_sincos_f32(src, 1, dst, 1, 1, @enum@);
+ }
+ } else {
+ simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
+ }
+#else
+ for (; len > 0; --len, src += ssrc, dst += sdst) {
+ const float src0 = *src;
+ *dst = npy_@func@f(src0);
+ }
+#endif
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src
index dfa790ed9..1a2a5a32b 100644
--- a/numpy/core/src/umath/loops_utils.h.src
+++ b/numpy/core/src/umath/loops_utils.h.src
@@ -3,6 +3,17 @@
#include "numpy/npy_common.h" // NPY_FINLINE
#include "numpy/halffloat.h" // npy_half_to_float
+
+/**
+ * Old versions of MSVC cause ambiguous link errors when we deal with large SIMD kernels,
+ * which breaks the build. This is probably related to the following bug:
+ * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1916
+ #define SIMD_MSVC_NOINLINE __declspec(noinline)
+#else
+ #define SIMD_MSVC_NOINLINE
+#endif
/*
* nomemoverlap - returns false if two strided arrays have an overlapping
* region in memory. ip_size/op_size = size of the arrays which can be negative
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
index 45487d0a8..be9288aff 100644
--- a/numpy/core/src/umath/npy_simd_data.h
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -119,22 +119,6 @@ static npy_uint64 EXP_Table_tail[32] = {
#define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
#define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
#define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-/*
- * Constants used in vector implementation of sinf/cosf(x)
- */
-#define NPY_TWO_O_PIf 0x1.45f306p-1f
-#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
-#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
-#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
-#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
-#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
-#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
-#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
-#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
-#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
-#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
-#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
-#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
/*
* Lookup table of log(c_k)
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 3d4e6de87..e66763986 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -271,25 +271,6 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp c
/**end repeat1**/
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
-#endif
-
-static NPY_INLINE int
-run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps, NPY_TRIG_OP my_trig_op)
-{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) {
- @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
- return 1;
- }
- else
- return 0;
-#endif
- return 0;
-}
-
/**end repeat**/
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
@@ -976,19 +957,6 @@ fma_invert_mask_pd(__m256i ymask)
}
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
-{
- return _mm256_cvtepi32_ps(
- _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
-{
- return fma_should_calculate_sine(k, andop, cmp);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_exponent(__m256 x)
{
/*
@@ -1215,18 +1183,6 @@ avx512_invert_mask_pd(__mmask8 ymask)
return _mm512_knot(ymask);
}
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
-{
- return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
-{
- return avx512_should_calculate_sine(k, andop, cmp);
-}
-
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_get_exponent(__m512 x)
{
@@ -1458,40 +1414,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
return _mm@vsize@_@or@(m1,m2);
}
-/*
- * Approximate cosine algorithm for x \in [-PI/4, PI/4]
- * Maximum ULP across all 32-bit floats = 0.875
- */
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
- @vtype@ invf2, @vtype@ invf0)
-{
- @vtype@ cos = @fmadd@(invf8, x2, invf6);
- cos = @fmadd@(cos, x2, invf4);
- cos = @fmadd@(cos, x2, invf2);
- cos = @fmadd@(cos, x2, invf0);
- return cos;
-}
-
-/*
- * Approximate sine algorithm for x \in [-PI/4, PI/4]
- * Maximum ULP across all 32-bit floats = 0.647
- */
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
- @vtype@ invf5, @vtype@ invf3,
- @vtype@ zero)
-{
- @vtype@ sin = @fmadd@(invf9, x2, invf7);
- sin = @fmadd@(sin, x2, invf5);
- sin = @fmadd@(sin, x2, invf3);
- sin = @fmadd@(sin, x2, zero);
- sin = @fmadd@(sin, x, x);
- return sin;
-}
-
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_sqrt_ps(@vtype@ x)
{
@@ -2004,167 +1926,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* #cvtps_epi32 = _mm256_cvtps_epi32, #
* #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
*/
-
-/*
- * Vectorized approximate sine/cosine algorithms: The following code is a
- * vectorized version of the algorithm presented here:
- * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
- * (1) Load data in ZMM/YMM registers and generate mask for elements that are
- * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
- * 117435.992f] for sine.
- * (2) For elements within range, perform range reduction using Cody-Waite's
- * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
- * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
- * int(y).
- * (4) For elements outside that range, Cody-Waite reduction performs poorly
- * leading to catastrophic cancellation. We compute cosine by calling glibc in
- * a scalar fashion.
- * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
- * 5-7x faster than scalar implementations when magnitude of all elements in
- * the array < 71476.0625f (117435.992f for sine). Worst case performance is
- * when all the elements are large leading to about 1-2% reduction in
- * performance.
- */
-
#if defined @CHK@
-static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_sincos_FLOAT(npy_float * op,
- npy_float * ip,
- const npy_intp array_size,
- const npy_intp steps,
- NPY_TRIG_OP my_trig_op)
-{
- const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
- const npy_int num_lanes = @NUM_LANES@;
- npy_float large_number = 71476.0625f;
- if (my_trig_op == npy_compute_sin) {
- large_number = 117435.992f;
- }
-
- /* Load up frequently used constants */
- @vtype@i zeros = _mm@vsize@_set1_epi32(0);
- @vtype@i ones = _mm@vsize@_set1_epi32(1);
- @vtype@i twos = _mm@vsize@_set1_epi32(2);
- @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
- @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
- @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
- @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
- @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
- @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
- @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
- @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
- @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
- @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
- @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
- @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
- @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
- @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
- @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
- @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
- @vtype@i iquadrant;
- @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
- @mask@ load_mask = @isa@_get_full_load_mask_ps();
- npy_intp num_remaining_elements = array_size;
-
- /*
- * Note: while generally indices are npy_intp, we ensure that our maximum index
- * will fit in an int32 as a precondition for this function via
- * IS_OUTPUT_BLOCKABLE_UNARY
- */
- npy_int32 indexarr[16];
- for (npy_int32 ii = 0; ii < 16; ii++) {
- indexarr[ii] = ii*stride;
- }
- @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
-
- while (num_remaining_elements > 0) {
-
- if (num_remaining_elements < num_lanes) {
- load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
- num_lanes);
- }
-
- @vtype@ x_in;
- if (stride == 1) {
- x_in = @isa@_masked_load_ps(load_mask, ip);
- }
- else {
- x_in = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
- }
-
- /*
- * For elements outside of this range, Cody-Waite's range reduction
- * becomes inaccurate and we will call glibc to compute cosine for
- * these numbers
- */
-
- glibc_mask = @isa@_in_range_mask(x_in, large_number,-large_number);
- glibc_mask = @and_masks@(load_mask, glibc_mask);
- nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
- @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zero_f, @or_masks@(nan_mask, glibc_mask));
- npy_int iglibc_mask = @mask_to_int@(glibc_mask);
-
- if (iglibc_mask != @full_mask@) {
- quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
-
- /* round to nearest */
- quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
- quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
-
- /* Cody-Waite's range reduction algorithm */
- reduced_x = @isa@_range_reduction(x, quadrant,
- codyw_c1, codyw_c2, codyw_c3);
- reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
-
- /* compute cosine and sine */
- cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
- cos_invf2, cos_invf0);
- sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
- sin_invf5, sin_invf3, zero_f);
-
- iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
- if (my_trig_op == npy_compute_cos) {
- iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
- }
-
- /* blend sin and cos based on the quadrant */
- sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
- cos = @isa@_blend(cos, sin, sine_mask);
-
- /* multiply by -1 for appropriate elements */
- negate_mask = @isa@_should_negate(iquadrant, twos, twos);
- cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
- cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
-
- @masked_store@(op, @cvtps_epi32@(load_mask), cos);
- }
-
- /* process elements using glibc for large elements */
- if (iglibc_mask != 0) {
- float NPY_DECL_ALIGNED(@BYTES@) ip_fback[@NUM_LANES@];
- _mm@vsize@_store_ps(ip_fback, x_in);
-
- if (my_trig_op == npy_compute_cos) {
- for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_cosf(ip_fback[ii]);
- }
- }
- }
- else {
- for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_sinf(ip_fback[ii]);
- }
- }
- }
- }
- ip += num_lanes*stride;
- op += num_lanes;
- num_remaining_elements -= num_lanes;
- }
-}
-
/*
* Vectorized implementation of exp using AVX2 and AVX512:
* 1) if x >= xmax; return INF (overflow)