/*@targets
 ** $maxopt baseline
 ** (avx2 fma3) avx512f
 ** vsx2 vsx3 vsx4
 ** neon_vfpv4
 ** vxe vxe2
 **/
#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "fast_loop_macros.h"

/*
 * TODO:
 * - use a vectorized version of Payne-Hanek style reduction for large elements or
 *   when there's no native FUSED support, instead of falling back to libc
 */
#if NPY_SIMD_FMA3 // native support
/**begin repeat
 * #check = F64, F32#
 * #sfx = f64, f32#
 */
#if NPY_SIMD_@check@
/*
 * Vectorized Cody-Waite range reduction technique
 * Performs the reduction step x* = x - y*C in three steps:
 * 1) x* = x - y*c1
 * 2) x* = x - y*c2
 * 3) x* = x - y*c3
 * c1, c2 are exact floating-point numbers, c3 = C - c1 - c2 simulates higher precision
 */
NPY_FINLINE npyv_@sfx@
simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3)
{
    npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x);
    reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x);
    reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x);
    return reduced_x;
}
#endif
/**end repeat**/

#if NPY_SIMD_F64
/**begin repeat
 * #op = cos, sin#
 */
#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
NPY_FINLINE npyv_f64
#else
NPY_NOINLINE npyv_f64
#endif
simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
{
    // MSVC doesn't compile with direct vector access, so we copy it here
    // as we have no npyv_get_lane/npyv_set_lane intrinsics
    npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
    npyv_storea_f64(out_copy, out);
    for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
        if (cmp_bits & (1 << i)) {
            out_copy[i] = npy_@op@(out_copy[i]);
        }
    }
    return npyv_loada_f64(out_copy);
}
/**end repeat**/

/*
 * Approximate sine algorithm for x \in [-pi/2, pi/2]
 * worst-case error is 3.5 ulp.
 * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
 */
NPY_FINLINE npyv_f64
simd_approx_sine_poly_f64(npyv_f64 r)
{
    const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41);
    const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33);
    const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26);
    const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19);
    const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13);
    const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7);
    const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3);

    npyv_f64 r2 = npyv_mul_f64(r, r);
    npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2);
    y = npyv_muladd_f64(y, r2, poly3);
    y = npyv_muladd_f64(y, r2, poly4);
    y = npyv_muladd_f64(y, r2, poly5);
    y = npyv_muladd_f64(y, r2, poly6);
    y = npyv_muladd_f64(y, r2, poly7);
    y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r);

    return y;
}

/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
NPY_FINLINE npyv_f64
simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n)
{
    const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1);
    const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53);
    const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106);

    return simd_range_reduction_f64(r, n, pi1, pi2, pi3);
}
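/*
 * Illustrative scalar sketch of the double-precision range reduction above
 * (assumption: shown only for exposition, not part of the dispatch path;
 * the function name is hypothetical and C99 fma() from <math.h> stands in
 * for npyv_muladd_f64). Given n = rint(|x|/pi), r = |x| - n*pi is
 * accumulated with three FMAs against a split -pi:
 *
 *     double reduce_pi2(double r, double n)
 *     {
 *         r = fma(n, -0x1.921fb54442d18p+1,   r);  // subtract n * pi_hi
 *         r = fma(n, -0x1.1a62633145c06p-53,  r);  // subtract n * pi_mid
 *         r = fma(n, -0x1.c1cd129024e09p-106, r);  // fold in the low-order residual
 *         return r;                                // r now lies in [-pi/2, pi/2]
 *     }
 */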
NPY_FINLINE npyv_b64
simd_sin_range_check_f64(npyv_u64 ir)
{
    const npyv_u64 tiny_bound  = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */
    const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */

    return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh);
}

NPY_FINLINE npyv_b64
simd_cos_range_check_f64(npyv_u64 ir)
{
    const npyv_f64 range_val = npyv_setall_f64(0x1p23);

    return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val));
}

NPY_FINLINE npyv_f64
simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
    const npyv_f64 inv_pi  = npyv_setall_f64(0x1.45f306dc9c883p-2);
    const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0);
    const npyv_f64 shift   = npyv_setall_f64(0x1.8p52);

    /* n = rint((|x|+pi/2)/pi) - 0.5. */
    npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift);
    npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
    n = npyv_sub_f64(n, shift);
    n = npyv_sub_f64(n, npyv_setall_f64(0.5));

    /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
    r = simd_range_reduction_pi2(r, n);

    /* sin(r) poly approx. */
    npyv_f64 y = simd_approx_sine_poly_f64(r);

    /* sign. */
    return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd));
}

NPY_FINLINE npyv_f64
simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
{
    const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2);
    const npyv_f64 shift  = npyv_setall_f64(0x1.8p52);

    /* n = rint(|x|/pi). */
    npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift);
    npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63);
    n = npyv_sub_f64(n, shift);

    /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
    r = simd_range_reduction_pi2(r, n);

    /* sin(r) poly approx. */
    npyv_f64 y = simd_approx_sine_poly_f64(r);

    /* sign. */
    return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd));
}
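/*
 * Illustrative scalar sketch of the rounding trick used in the two poly
 * helpers above (assumption: for exposition only; scalar <math.h>, <stdint.h>
 * and <string.h> calls stand in for the universal intrinsics). Adding and
 * then subtracting the magic constant 0x1.8p52 rounds |x|/pi to the nearest
 * integer, and the lowest mantissa bit of the intermediate sum carries the
 * parity of that integer, which becomes the half-period sign flip:
 *
 *     double n = fma(fabs(x), 0x1.45f306dc9c883p-2, 0x1.8p52); // |x|/pi + shift
 *     uint64_t nbits;
 *     memcpy(&nbits, &n, sizeof nbits);
 *     uint64_t odd = nbits << 63;  // parity of the nearest integer, moved to the sign bit
 *     n -= 0x1.8p52;               // n == rint(|x|/pi), valid while |x|/pi stays well below 2^52
 *
 * sin(|x| - n*pi) then only needs its sign XOR-ed with `odd` (and, for sine,
 * with the sign of x).
 */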
/**begin repeat
 * #op = cos, sin#
 */
NPY_FINLINE void
simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
{
    const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
    const int vstep = npyv_nlanes_f64;

    npyv_f64 out = npyv_zero_f64();
    npyv_f64 x_in;

    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
        if (ssrc == 1) {
            x_in = npyv_load_tillz_f64(src, len);
        } else {
            x_in = npyv_loadn_tillz_f64(src, ssrc, len);
        }

        npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
        npyv_f64 r = npyv_reinterpret_f64_u64(ir);
        npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));

        npyv_b64 cmp = simd_@op@_range_check_f64(ir);
        /* If fenv exceptions are to be triggered correctly, set any special lanes
           to 1 (which is neutral w.r.t. fenv). These lanes will be fixed up by
           the scalar loop later. */
        r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);

        // Some lanes are in range, so at least one vector calculation is useful
        if (!npyv_all_b64(cmp)) {
            out = simd_@op@_poly_f64(r, ir, sign);
        }

        if (npyv_any_b64(cmp)) {
            out = npyv_select_f64(cmp, x_in, out);
            out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp));
        }

        if (sdst == 1) {
            npyv_store_till_f64(dst, len, out);
        } else {
            npyv_storen_till_f64(dst, sdst, len, out);
        }
    }
    npyv_cleanup();
}
/**end repeat**/
#endif // NPY_SIMD_F64

#if NPY_SIMD_F32
/*
 * Approximate cosine algorithm for x \in [-PI/4, PI/4]
 * Maximum ULP across all 32-bit floats = 0.875
 */
NPY_FINLINE npyv_f32
simd_cosine_poly_f32(npyv_f32 x2)
{
    const npyv_f32 invf8 = npyv_setall_f32(0x1.98e616p-16f);
    const npyv_f32 invf6 = npyv_setall_f32(-0x1.6c06dcp-10f);
    const npyv_f32 invf4 = npyv_setall_f32(0x1.55553cp-05f);
    const npyv_f32 invf2 = npyv_setall_f32(-0x1.000000p-01f);
    const npyv_f32 invf0 = npyv_setall_f32(0x1.000000p+00f);

    npyv_f32 r = npyv_muladd_f32(invf8, x2, invf6);
    r = npyv_muladd_f32(r, x2, invf4);
    r = npyv_muladd_f32(r, x2, invf2);
    r = npyv_muladd_f32(r, x2, invf0);
    return r;
}

/*
 * Approximate sine algorithm for x \in [-PI/4, PI/4]
 * Maximum ULP across all 32-bit floats = 0.647
 * Polynomial approximation based on unpublished work by T. Myklebust
 */
NPY_FINLINE npyv_f32
simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2)
{
    const npyv_f32 invf9 = npyv_setall_f32(0x1.7d3bbcp-19f);
    const npyv_f32 invf7 = npyv_setall_f32(-0x1.a06bbap-13f);
    const npyv_f32 invf5 = npyv_setall_f32(0x1.11119ap-07f);
    const npyv_f32 invf3 = npyv_setall_f32(-0x1.555556p-03f);

    npyv_f32 r = npyv_muladd_f32(invf9, x2, invf7);
    r = npyv_muladd_f32(r, x2, invf5);
    r = npyv_muladd_f32(r, x2, invf3);
    r = npyv_muladd_f32(r, x2, npyv_zero_f32());
    r = npyv_muladd_f32(r, x, x);
    return r;
}

/*
 * Vectorized approximate sine/cosine algorithms: The following code is a
 * vectorized version of the algorithm presented here:
 * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
 * (1) Load data in registers and generate a mask for elements that are
 *     within range [-71476.0625f, 71476.0625f] for cosine and
 *     [-117435.992f, 117435.992f] for sine.
 * (2) For elements within range, perform range reduction using Cody-Waite's
 *     method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
 * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant
 *     k = int(y).
 * (4) For elements outside that range, Cody-Waite reduction performs poorly,
 *     leading to catastrophic cancellation. We compute cosine by calling
 *     glibc in a scalar fashion.
 * (5) The vectorized implementation has a max ULP of 1.49 and is at least
 *     5-7x (x86), 2.5-3x (Power), and 1-2x (Arm) faster than the scalar
 *     implementation when the magnitude of every element in the array is
 *     < 71476.0625f (117435.992f for sine). The worst case is when all
 *     elements are large, which costs about a 1-2% reduction in performance.
 */
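/*
 * Illustrative scalar sketch of steps (2)-(3) above (assumption: for
 * exposition only; sin_poly/cos_poly are hypothetical names standing in for
 * simd_sine_poly_f32/simd_cosine_poly_f32, and <math.h> scalar calls stand in
 * for the universal intrinsics):
 *
 *     float q  = rintf(x * 0x1.45f306p-1f);       // y = rint(x*2/PI)
 *     float xr = fmaf(q, -0x1.921fb0p+00f, x);    // Cody-Waite step 1
 *     xr = fmaf(q, -0x1.5110b4p-22f, xr);         //            step 2
 *     xr = fmaf(q, -0x1.846988p-48f, xr);         //            step 3
 *     int k = (int)q + (compute_cos ? 1 : 0);     // cos(x) = sin(x + pi/2)
 *     float y = (k & 1) == 0 ? sin_poly(xr, xr*xr)   // even quadrant -> sine poly
 *                            : cos_poly(xr*xr);      // odd quadrant  -> cosine poly
 *     if (k & 2) {
 *         y = -y;                                 // quadrants 2 and 3 flip the sign
 *     }
 */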
typedef enum {
    SIMD_COMPUTE_SIN,
    SIMD_COMPUTE_COS
} SIMD_TRIG_OP;

static void SIMD_MSVC_NOINLINE
simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
                npy_intp len, SIMD_TRIG_OP trig_op)
{
    // Load up frequently used constants
    const npyv_f32 zerosf = npyv_zero_f32();
    const npyv_s32 ones   = npyv_setall_s32(1);
    const npyv_s32 twos   = npyv_setall_s32(2);
    const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f);
    const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f);
    const npyv_f32 codyw_pio2_medf  = npyv_setall_f32(-0x1.5110b4p-22f);
    const npyv_f32 codyw_pio2_lowf  = npyv_setall_f32(-0x1.846988p-48f);
    const npyv_f32 rint_cvt_magic   = npyv_setall_f32(0x1.800000p+23f);
    // Cody-Waite's range
    float max_codi = 117435.992f;
    if (trig_op == SIMD_COMPUTE_COS) {
        max_codi = 71476.0625f;
    }
    const npyv_f32 max_cody = npyv_setall_f32(max_codi);
    const int vstep = npyv_nlanes_f32;

    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
        npyv_f32 x_in;
        if (ssrc == 1) {
            x_in = npyv_load_tillz_f32(src, len);
        } else {
            x_in = npyv_loadn_tillz_f32(src, ssrc, len);
        }
        npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
    #if NPY_SIMD_CMPSIGNAL
        // Eliminate NaN to avoid FP invalid exception
        x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
    #endif
        npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
        npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
        /*
         * For elements outside of this range, Cody-Waite's range reduction
         * becomes inaccurate and we will call libc to compute cosine for
         * these numbers
         */
        if (simd_maski != 0) {
            npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);

            npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
            // round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22
            quadrant = npyv_add_f32(quadrant, rint_cvt_magic);
            quadrant = npyv_sub_f32(quadrant, rint_cvt_magic);

            // Cody-Waite's range reduction algorithm
            npyv_f32 reduced_x = simd_range_reduction_f32(
                x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf
            );
            npyv_f32 reduced_x2 = npyv_square_f32(reduced_x);

            // compute cosine and sine
            npyv_f32 cos = simd_cosine_poly_f32(reduced_x2);
            npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2);

            npyv_s32 iquadrant = npyv_round_s32_f32(quadrant);
            if (trig_op == SIMD_COMPUTE_COS) {
                iquadrant = npyv_add_s32(iquadrant, ones);
            }
            // blend sin and cos based on the quadrant
            npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32());
            cos = npyv_select_f32(sine_mask, sin, cos);

            // multiply by -1 for appropriate elements
            npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos);
            cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos);
            cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF));

            if (sdst == 1) {
                npyv_store_till_f32(dst, len, cos);
            } else {
                npyv_storen_till_f32(dst, sdst, len, cos);
            }
        }
        if (simd_maski != (npy_uint64)((1 << vstep) - 1)) {
            float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
            npyv_storea_f32(ip_fback, x_in);

            // process large elements using libc
            if (trig_op == SIMD_COMPUTE_COS) {
                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
                    if ((simd_maski >> i) & 1) {
                        continue;
                    }
                    dst[sdst*i] = npy_cosf(ip_fback[i]);
                }
            }
            else {
                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
                    if ((simd_maski >> i) & 1) {
                        continue;
                    }
                    dst[sdst*i] = npy_sinf(ip_fback[i]);
                }
            }
        }
    }
    npyv_cleanup();
}
#endif // NPY_SIMD_F32
#endif // NPY_SIMD_FMA3
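/*
 * The dispatch entry points below follow the standard ufunc inner-loop
 * convention: args[0]/args[1] are the input/output buffers, dimensions[0] is
 * the element count, and steps[] holds the byte strides. A minimal scalar
 * sketch of the same contract (assumption: for exposition only; the function
 * name is hypothetical and this mirrors what the UNARY_LOOP fallback expands to):
 *
 *     static void
 *     scalar_cos_loop(char **args, npy_intp const *dimensions, npy_intp const *steps)
 *     {
 *         char *ip = args[0], *op = args[1];
 *         for (npy_intp i = 0; i < dimensions[0]; ++i, ip += steps[0], op += steps[1]) {
 *             *(npy_double *)op = npy_cos(*(npy_double *)ip);
 *         }
 *     }
 */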
/**begin repeat
 * #func = cos, sin#
 */
NPY_NO_EXPORT void
NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD_F64 && NPY_SIMD_FMA3
    const double *src = (double*)args[0];
          double *dst = (double*)args[1];
    const int lsize = sizeof(src[0]);
    const npy_intp ssrc = steps[0] / lsize;
    const npy_intp sdst = steps[1] / lsize;
    npy_intp len = dimensions[0];
    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));

    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
        !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)
    ) {
        for (; len > 0; --len, src += ssrc, dst += sdst) {
            simd_@func@_f64(src, 1, dst, 1, 1);
        }
    } else {
        simd_@func@_f64(src, ssrc, dst, sdst, len);
    }
#else
    UNARY_LOOP {
        const npy_double in1 = *(npy_double *)ip1;
        *(npy_double *)op1 = npy_@func@(in1);
    }
#endif
}
/**end repeat**/

/**begin repeat
 * #func = sin, cos#
 * #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS#
 */
NPY_NO_EXPORT void
NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
#if NPY_SIMD_F32 && NPY_SIMD_FMA3
    const npy_float *src = (npy_float*)args[0];
          npy_float *dst = (npy_float*)args[1];
    const int lsize = sizeof(src[0]);
    const npy_intp ssrc = steps[0] / lsize;
    const npy_intp sdst = steps[1] / lsize;
    npy_intp len = dimensions[0];
    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));

    if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
        !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
    ) {
        for (; len > 0; --len, src += ssrc, dst += sdst) {
            simd_sincos_f32(src, 1, dst, 1, 1, @enum@);
        }
    } else {
        simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@);
    }
#else
    UNARY_LOOP {
        const npy_float in1 = *(npy_float *)ip1;
        *(npy_float *)op1 = npy_@func@f(in1);
    }
#endif
}
/**end repeat**/
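/*
 * Illustrative sketch of what one expansion of the repeat block above looks
 * like after the .c.src template pass (assumption: for exposition only; the
 * elided body is unchanged, and the generated file spells this out once for
 * each of sin and cos, with NPY_CPU_DISPATCH_CURFX adding the per-target
 * suffix):
 *
 *     NPY_NO_EXPORT void
 *     NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
 *     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 *     {
 *         ...
 *         simd_sincos_f32(src, ssrc, dst, sdst, len, SIMD_COMPUTE_SIN);
 *         ...
 *     }
 */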