SIMD: Replace raw SIMD of sin/cos with NPYV

The new code improves the performance of non-contiguous memory access for the output array without any reduction in performance. For PPC64LE the performance increased by 2-3.0, and 1.5-2.0 on aarch64.
author: Sayed Adel <seiko@imavr.com> 2020-10-18 12:03:17 +0000
committer: Sayed Adel <seiko@imavr.com> 2020-12-26 16:32:04 +0000
commit: b16288698742e7593db61cf9a618e2d03de6b36e (patch)
tree: 301ed276b9eca8d18c7ffc9eae2fb0db9c9699e2
parent: 098a3b417dbb3c620e423e29f194585b68882d5e (diff)
download: numpy-b16288698742e7593db61cf9a618e2d03de6b36e.tar.gz
10 files changed, 259 insertions, 290 deletions
diff --git a/.gitignore b/.gitignore
index b0fa037a6..5a5e464cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -218,3 +218,4 @@ numpy/core/src/_simd/_simd_inc.h
 # umath module
 numpy/core/src/umath/loops_unary_fp.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
+numpy/core/src/umath/loops_trigonometric.dispatch.c
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 4e9a2cfec..6ee8031cb 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -676,7 +676,7 @@ defdict = {
           docstrings.get('numpy.core.umath.cos'),
           None,
           TD('e', f='cos', astype={'e':'f'}),
-          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+          TD('f', dispatch=[('loops_trigonometric', 'f')]),
           TD('fdg' + cmplx, f='cos'),
           TD(P, f='cos'),
           ),
@@ -685,7 +685,7 @@ defdict = {
           docstrings.get('numpy.core.umath.sin'),
           None,
           TD('e', f='sin', astype={'e':'f'}),
-          TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+          TD('f', dispatch=[('loops_trigonometric', 'f')]),
           TD('fdg' + cmplx, f='sin'),
           TD(P, f='sin'),
           ),
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index 7d71c36cc..f32e298f0 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -151,15 +151,6 @@ NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b);
 
 /*
- * avx function has a common API for both sin & cos. This enum is used to
- * distinguish between the two
- */
-typedef enum {
-    npy_compute_sin,
-    npy_compute_cos
-} NPY_TRIG_OP;
-
-/*
  * C99 double math funcs
  */
 NPY_INPLACE double npy_sin(double x);
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2e020a595..1042a1c45 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -929,6 +929,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'scalarmath.c.src'),
             join('src', 'umath', 'ufunc_type_resolution.c'),
             join('src', 'umath', 'override.c'),
+            join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             ]
 
     umath_deps = [
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 839d2b3ae..ba538d2ab 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1658,8 +1658,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
 /**end repeat**/
 
 /**begin repeat
- *  #func = sin, cos, exp, log#
- *  #scalarf = npy_sinf, npy_cosf, npy_expf, npy_logf#
+ *  #func = exp, log#
+ *  #scalarf = npy_expf, npy_logf#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
@@ -1749,28 +1749,6 @@ FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *step
 
 /**end repeat1**/
 
-/**begin repeat1
- *  #func = cos, sin#
- *  #enum = npy_compute_cos, npy_compute_sin#
- *  #scalarf = npy_cosf, npy_sinf#
- */
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-FLOAT_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
-    if (!run_unary_@isa@_sincos_FLOAT(args, dimensions, steps, @enum@)) {
-        UNARY_LOOP {
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-            @ISA@_sincos_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0], @enum@);
-#else
-	        const npy_float in1 = *(npy_float *)ip1;
-	        *(npy_float *)op1 = @scalarf@(in1);
-#endif
-        }
-    }
-}
-
-/**end repeat1**/
 /**end repeat**/
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index c15ff8e3b..d73c9fa7f 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -225,8 +225,19 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 NPY_NO_EXPORT void
 DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_trigonometric.dispatch.h"
+#endif
+/**begin repeat
+ *  #func = sin, cos#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+/**end repeat**/
+
 /**begin repeat
- *  #func = sin, cos, exp, log#
+ *  #func = exp, log#
  */
 NPY_NO_EXPORT void
 FLOAT_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
new file mode 100644
index 000000000..8c2c83e7c
--- /dev/null
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -0,0 +1,230 @@
+/*@targets
+ ** $maxopt baseline
+ ** (avx2 fma3) avx512f
+ ** vsx2
+ ** neon_vfpv4
+ **/
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/*
+ * TODO:
+ * - use vectorized version of Payne-Hanek style reduction for large elements or
+ *   when there's no native FUSED support instead of fallback to libc
+ */
+#if NPY_SIMD_FMA3 // native support
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+NPY_FINLINE npyv_f32
+simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+{
+    npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x);
+    reduced_x = npyv_muladd_f32(y, c2, reduced_x);
+    reduced_x = npyv_muladd_f32(y, c3, reduced_x);
+    return reduced_x;
+}
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+NPY_FINLINE npyv_f32
+simd_cosine_poly_f32(npyv_f32 x2)
+{
+    const npyv_f32 invf8 = npyv_setall_f32(0x1.98e616p-16f);
+    const npyv_f32 invf6 = npyv_setall_f32(-0x1.6c06dcp-10f);
+    const npyv_f32 invf4 = npyv_setall_f32(0x1.55553cp-05f);
+    const npyv_f32 invf2 = npyv_setall_f32(-0x1.000000p-01f);
+    const npyv_f32 invf0 = npyv_setall_f32(0x1.000000p+00f);
+
+    npyv_f32 r = npyv_muladd_f32(invf8, x2, invf6);
+    r = npyv_muladd_f32(r, x2, invf4);
+    r = npyv_muladd_f32(r, x2, invf2);
+    r = npyv_muladd_f32(r, x2, invf0);
+    return r;
+}
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ * Polynomial approximation based on unpublished work by T. Myklebust
+ */
+NPY_FINLINE npyv_f32
+simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2)
+{
+    const npyv_f32 invf9 = npyv_setall_f32(0x1.7d3bbcp-19f);
+    const npyv_f32 invf7 = npyv_setall_f32(-0x1.a06bbap-13f);
+    const npyv_f32 invf5 = npyv_setall_f32(0x1.11119ap-07f);
+    const npyv_f32 invf3 = npyv_setall_f32(-0x1.555556p-03f);
+
+    npyv_f32 r = npyv_muladd_f32(invf9, x2, invf7);
+    r = npyv_muladd_f32(r, x2, invf5);
+    r = npyv_muladd_f32(r, x2, invf3);
+    r = npyv_muladd_f32(r, x2, npyv_zero_f32());
+    r = npyv_muladd_f32(r, x, x);
+    return r;
+}
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x(x86) - 2.5-3x(Power) - 1-2x(Arm) faster than scalar implementations
+ * when magnitude of all elements in the array < 71476.0625f (117435.992f for sine).
+ * Worst case performance is when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
+typedef enum
+{
+    SIMD_COMPUTE_SIN,
+    SIMD_COMPUTE_COS
+} SIMD_TRIG_OP;
+
+static void SIMD_MSVC_NOINLINE
+simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
+                npy_intp len, SIMD_TRIG_OP trig_op)
+{
+    // Load up frequently used constants
+    const npyv_f32 zerosf = npyv_zero_f32();
+    const npyv_s32 ones  = npyv_setall_s32(1);
+    const npyv_s32 twos  = npyv_setall_s32(2);
+    const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f);
+    const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f);
+    const npyv_f32 codyw_pio2_medf = npyv_setall_f32(-0x1.5110b4p-22f);
+    const npyv_f32 codyw_pio2_lowf = npyv_setall_f32(-0x1.846988p-48f);
+    const npyv_f32 rint_cvt_magic = npyv_setall_f32(0x1.800000p+23f);
+    // Cody-Waite's range
+    float max_codi = 117435.992f;
+    if (trig_op == SIMD_COMPUTE_COS) {
+        max_codi = 71476.0625f;
+    }
+    const npyv_f32 max_cody = npyv_setall_f32(max_codi);
+    const int vstep = npyv_nlanes_f32;
+
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_f32 x_in;
+        if (ssrc == 1) {
+            x_in = npyv_load_tillz_f32(src, len);
+        } else {
+            x_in = npyv_loadn_tillz_f32(src, ssrc, len);
+        }
+        npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
+        npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call libc to compute cosine for
+         * these numbers
+         */
+        if (simd_maski != 0) {
+            npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+            npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
+
+            npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
+            // round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22
+            quadrant = npyv_add_f32(quadrant, rint_cvt_magic);
+            quadrant = npyv_sub_f32(quadrant, rint_cvt_magic);
+
+            // Cody-Waite's range reduction algorithm
+            npyv_f32 reduced_x = simd_range_reduction_f32(
+                x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf
+            );
+            npyv_f32 reduced_x2 = npyv_square_f32(reduced_x);
+
+            // compute cosine and sine
+            npyv_f32 cos = simd_cosine_poly_f32(reduced_x2);
+            npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2);
+
+            npyv_s32 iquadrant = npyv_round_s32_f32(quadrant);
+            if (trig_op == SIMD_COMPUTE_COS) {
+                iquadrant = npyv_add_s32(iquadrant, ones);
+            }
+            // blend sin and cos based on the quadrant
+            npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32());
+            cos = npyv_select_f32(sine_mask, sin, cos);
+
+            // multiply by -1 for appropriate elements
+            npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos);
+            cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos);
+            cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF));
+
+            if (sdst == 1) {
+                npyv_store_till_f32(dst, len, cos);
+            } else {
+                npyv_storen_till_f32(dst, sdst, len, cos);
+            }
+        }
+        if (simd_maski != ((1 << vstep) - 1)) {
+            float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
+            npyv_storea_f32(ip_fback, x_in);
+
+            // process elements using libc for large elements
+            if (trig_op == SIMD_COMPUTE_COS) {
+                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+                    if ((simd_maski >> i) & 1) {
+                        continue;
+                    }
+                    dst[sdst*i] = npy_cosf(ip_fback[i]);
+                }
+            }
+            else {
+                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+                    if ((simd_maski >> i) & 1) {
+                        continue;
+                    }
+                    dst[sdst*i] = npy_sinf(ip_fback[i]);
+                }
+            }
+        }
+    }
+    npyv_cleanup();
+}
+#endif // NPY_SIMD_FMA3
+
+/**begin repeat
+ *  #func = cos, sin#
+ *  #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    const float *src = (float*)args[0];
+          float *dst = (float*)args[1];
+
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+#if NPY_SIMD_FMA3
+    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
+        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
+    ) {
+        for (; len > 0; --len, src += ssrc, dst += sdst) {
+            simd_sincos_f32(src, 1, dst, 1, 1, @enum@);
+        }
+    } else {
+        simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
+    }
+#else
+    for (; len > 0; --len, src += ssrc, dst += sdst) {
+        const float src0 = *src;
+        *dst = npy_@func@f(src0);
+    }
+#endif
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src
index dfa790ed9..1a2a5a32b 100644
--- a/numpy/core/src/umath/loops_utils.h.src
+++ b/numpy/core/src/umath/loops_utils.h.src
@@ -3,6 +3,17 @@
 
 #include "numpy/npy_common.h" // NPY_FINLINE
 #include "numpy/halffloat.h" // npy_half_to_float
+
+/**
+ * Old versions of MSVC causes ambiguous link errors when we deal with large SIMD kernels
+ * which lead to break the build, probably releated to the following bug:
+ * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1916
+    #define SIMD_MSVC_NOINLINE __declspec(noinline)
+#else
+    #define SIMD_MSVC_NOINLINE
+#endif
 /*
  * nomemoverlap - returns false if two strided arrays have an overlapping
  * region in memory. ip_size/op_size = size of the arrays which can be negative
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
index 45487d0a8..be9288aff 100644
--- a/numpy/core/src/umath/npy_simd_data.h
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -119,22 +119,6 @@ static npy_uint64 EXP_Table_tail[32] = {
 #define NPY_COEFF_Q3_LOGf 9.864942958519418960339e-01f
 #define NPY_COEFF_Q4_LOGf 1.546476374983906719538e-01f
 #define NPY_COEFF_Q5_LOGf 5.875095403124574342950e-03f
-/*
- * Constants used in vector implementation of sinf/cosf(x)
- */
-#define NPY_TWO_O_PIf 0x1.45f306p-1f
-#define NPY_CODY_WAITE_PI_O_2_HIGHf -0x1.921fb0p+00f
-#define NPY_CODY_WAITE_PI_O_2_MEDf -0x1.5110b4p-22f
-#define NPY_CODY_WAITE_PI_O_2_LOWf -0x1.846988p-48f
-#define NPY_COEFF_INVF0_COSINEf 0x1.000000p+00f
-#define NPY_COEFF_INVF2_COSINEf -0x1.000000p-01f
-#define NPY_COEFF_INVF4_COSINEf 0x1.55553cp-05f
-#define NPY_COEFF_INVF6_COSINEf -0x1.6c06dcp-10f
-#define NPY_COEFF_INVF8_COSINEf 0x1.98e616p-16f
-#define NPY_COEFF_INVF3_SINEf -0x1.555556p-03f
-#define NPY_COEFF_INVF5_SINEf 0x1.11119ap-07f
-#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
-#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
 
 /*
  * Lookup table of log(c_k)
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 3d4e6de87..e66763986 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -271,25 +271,6 @@ run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp c
 
 /**end repeat1**/
 
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-static NPY_INLINE void
-@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
-#endif
-
-static NPY_INLINE int
-run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps, NPY_TRIG_OP my_trig_op)
-{
-#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
-    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), @REGISTER_SIZE@)) {
-        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
-        return 1;
-    }
-    else
-        return 0;
-#endif
-    return 0;
-}
-
 /**end repeat**/
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined  NPY_HAVE_SSE2_INTRINSICS
@@ -976,19 +957,6 @@ fma_invert_mask_pd(__m256i ymask)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
-{
-   return _mm256_cvtepi32_ps(
-                _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
-fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
-{
-    return fma_should_calculate_sine(k, andop, cmp);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
 fma_get_exponent(__m256 x)
 {
     /*
@@ -1215,18 +1183,6 @@ avx512_invert_mask_pd(__mmask8 ymask)
     return _mm512_knot(ymask);
 }
 
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
-{
-    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
-}
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
-avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
-{
-    return avx512_should_calculate_sine(k, andop, cmp);
-}
-
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
 avx512_get_exponent(__m512 x)
 {
@@ -1458,40 +1414,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
     return _mm@vsize@_@or@(m1,m2);
 }
 
-/*
- * Approximate cosine algorithm for x \in [-PI/4, PI/4]
- * Maximum ULP across all 32-bit floats = 0.875
- */
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
-                                                @vtype@ invf2, @vtype@ invf0)
-{
-    @vtype@ cos = @fmadd@(invf8, x2, invf6);
-    cos = @fmadd@(cos, x2, invf4);
-    cos = @fmadd@(cos, x2, invf2);
-    cos = @fmadd@(cos, x2, invf0);
-    return cos;
-}
-
-/*
- * Approximate sine algorithm for x \in [-PI/4, PI/4]
- * Maximum ULP across all 32-bit floats = 0.647
- */
-
-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
-@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
-                                          @vtype@ invf5, @vtype@ invf3,
-                                          @vtype@ zero)
-{
-    @vtype@ sin = @fmadd@(invf9, x2, invf7);
-    sin = @fmadd@(sin, x2, invf5);
-    sin = @fmadd@(sin, x2, invf3);
-    sin = @fmadd@(sin, x2, zero);
-    sin = @fmadd@(sin, x, x);
-    return sin;
-}
-
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 @isa@_sqrt_ps(@vtype@ x)
 {
@@ -2004,167 +1926,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  * #cvtps_epi32 = _mm256_cvtps_epi32, #
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
  */
-
-/*
- * Vectorized approximate sine/cosine algorithms: The following code is a
- * vectorized version of the algorithm presented here:
- * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
- * (1) Load data in ZMM/YMM registers and generate mask for elements that are
- * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
- * 117435.992f] for sine.
- * (2) For elements within range, perform range reduction using Cody-Waite's
- * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
- * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
- * int(y).
- * (4) For elements outside that range, Cody-Waite reduction performs poorly
- * leading to catastrophic cancellation. We compute cosine by calling glibc in
- * a scalar fashion.
- * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
- * 5-7x faster than scalar implementations when magnitude of all elements in
- * the array < 71476.0625f (117435.992f for sine). Worst case performance is
- * when all the elements are large leading to about 1-2% reduction in
- * performance.
- */
-
 #if defined @CHK@
-static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_sincos_FLOAT(npy_float * op,
-                   npy_float * ip,
-                   const npy_intp array_size,
-                   const npy_intp steps,
-                   NPY_TRIG_OP my_trig_op)
-{
-    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
-    const npy_int num_lanes = @NUM_LANES@;
-    npy_float large_number = 71476.0625f;
-    if (my_trig_op == npy_compute_sin) {
-        large_number = 117435.992f;
-    }
-
-    /* Load up frequently used constants */
-    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
-    @vtype@i ones = _mm@vsize@_set1_epi32(1);
-    @vtype@i twos = _mm@vsize@_set1_epi32(2);
-    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
-    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
-    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
-    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
-    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
-    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
-    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
-    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
-    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
-    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
-    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
-    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
-    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
-    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
-    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
-    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
-    @vtype@i iquadrant;
-    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
-    @mask@ load_mask = @isa@_get_full_load_mask_ps();
-    npy_intp num_remaining_elements = array_size;
-
-    /*
-     * Note: while generally indices are npy_intp, we ensure that our maximum index
-     * will fit in an int32 as a precondition for this function via
-     * IS_OUTPUT_BLOCKABLE_UNARY
-     */
-    npy_int32 indexarr[16];
-    for (npy_int32 ii = 0; ii < 16; ii++) {
-        indexarr[ii] = ii*stride;
-    }
-    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
-
-    while (num_remaining_elements > 0) {
-
-        if (num_remaining_elements < num_lanes) {
-            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
-                                                         num_lanes);
-        }
-
-        @vtype@ x_in;
-        if (stride == 1) {
-            x_in = @isa@_masked_load_ps(load_mask, ip);
-        }
-        else {
-            x_in = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
-        }
-
-        /*
-         * For elements outside of this range, Cody-Waite's range reduction
-         * becomes inaccurate and we will call glibc to compute cosine for
-         * these numbers
-         */
-
-        glibc_mask = @isa@_in_range_mask(x_in, large_number,-large_number);
-        glibc_mask = @and_masks@(load_mask, glibc_mask);
-        nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
-        @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zero_f, @or_masks@(nan_mask, glibc_mask));
-        npy_int iglibc_mask = @mask_to_int@(glibc_mask);
-
-        if (iglibc_mask != @full_mask@) {
-            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);
-
-            /* round to nearest */
-            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
-            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
-
-            /* Cody-Waite's range reduction algorithm */
-            reduced_x = @isa@_range_reduction(x, quadrant,
-                                                   codyw_c1, codyw_c2, codyw_c3);
-            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);
-
-            /* compute cosine and sine */
-            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
-                                                           cos_invf2, cos_invf0);
-            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
-                                             sin_invf5, sin_invf3, zero_f);
-
-            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
-            if (my_trig_op == npy_compute_cos) {
-                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
-            }
-
-            /* blend sin and cos based on the quadrant */
-            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
-            cos = @isa@_blend(cos, sin, sine_mask);
-
-            /* multiply by -1 for appropriate elements */
-            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
-            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
-            cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
-
-            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
-        }
-
-        /* process elements using glibc for large elements */
-        if (iglibc_mask != 0) {
-            float NPY_DECL_ALIGNED(@BYTES@) ip_fback[@NUM_LANES@];
-            _mm@vsize@_store_ps(ip_fback, x_in);
-
-            if (my_trig_op == npy_compute_cos) {
-                for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
-                    if (iglibc_mask & 0x01) {
-                        op[ii] = npy_cosf(ip_fback[ii]);
-                    }
-                }
-            }
-            else {
-                for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
-                    if (iglibc_mask & 0x01) {
-                        op[ii] = npy_sinf(ip_fback[ii]);
-                    }
-                }
-            }
-        }
-        ip += num_lanes*stride;
-        op += num_lanes;
-        num_remaining_elements -= num_lanes;
-    }
-}
-
 /*
  * Vectorized implementation of exp using AVX2 and AVX512:
  * 1) if x >= xmax; return INF (overflow)
author	Sayed Adel <seiko@imavr.com>	2020-10-18 12:03:17 +0000
committer	Sayed Adel <seiko@imavr.com>	2020-12-26 16:32:04 +0000
commit	b16288698742e7593db61cf9a618e2d03de6b36e (patch)
tree	301ed276b9eca8d18c7ffc9eae2fb0db9c9699e2
parent	098a3b417dbb3c620e423e29f194585b68882d5e (diff)
download	numpy-b16288698742e7593db61cf9a618e2d03de6b36e.tar.gz