/*@targets
 ** $maxopt baseline
 ** neon asimd
 ** sse2 avx2 avx512_skx
 ** vsx2
 ** vx vxe
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "numpy/npy_math.h"
#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

/*******************************************************************************
 ** Scalar ops
 ******************************************************************************/
#define scalar_negative(X) (-X)

/*******************************************************************************
 ** extra SIMD intrinsics
 ******************************************************************************/
#if NPY_SIMD
/**begin repeat
 * #sfx  = s8, u8, s16, u16, s32, u32, s64, u64#
 * #ssfx =  8,  8,  16,  16,  32,  32,  64,  64#
 */
static NPY_INLINE npyv_@sfx@
npyv_negative_@sfx@(npyv_@sfx@ v)
{
#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
    return npyv_reinterpret_@sfx@_s@ssfx@(
        vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v))
    );
#else
    // (x ^ -1) + 1
    const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
    return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
#endif
}
/**end repeat**/

/**begin repeat
 * #sfx  = f32, f64#
 * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
 * #fd   = f, #
 */
#if @VCHK@
static NPY_INLINE npyv_@sfx@
npyv_negative_@sfx@(npyv_@sfx@ v)
{
#if defined(NPY_HAVE_NEON)
    return vnegq_@sfx@(v);
#else
    // (v ^ signmask)
    const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
    return npyv_xor_@sfx@(v, signmask);
#endif
}
#endif // @VCHK@
/**end repeat**/
#endif // NPY_SIMD

/********************************************************************************
 ** Defining the SIMD kernels
 ********************************************************************************/
/**begin repeat
 * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
 * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
 * #is_fp = 0*8, 1*2#
 * #supports_ncontig = 0*4, 1*6#
 */
/**begin repeat1
 * #kind   = negative#
 * #intrin = negative#
 * #unroll = 4#
 */
#if @simd_chk@
#if @unroll@ < 1
#error "Unroll must be at least 1"
#elif NPY_SIMD != 128 && @unroll@ > 2
// Avoid memory bandwidth bottleneck for larger SIMD
#define UNROLL 2
#else
#define UNROLL @unroll@
#endif
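/*
 * Kernel structure note (illustrative, not normative): each kernel below
 * drains `len` in three stages: an UNROLL-wide unrolled vector loop, a
 * single-vector loop, then a scalar tail. For example, with 128-bit SIMD
 * over f32 (vstep = 4) and UNROLL = 4 (wstep = 16), len = 37 is consumed
 * as two unrolled iterations (32 elements), one vector iteration
 * (4 elements), and one scalar iteration (1 element).
 */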
// contiguous input and output.
static NPY_INLINE void
simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
                             npyv_lanetype_@sfx@ *op,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * UNROLL;
    // unrolled vector loop
    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
        /**begin repeat2
         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @U@
        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
        #endif
        /**end repeat2**/
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
        npyv_@sfx@ v = npyv_load_@sfx@(ip);
        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
        npyv_store_@sfx@(op, r);
    }
    // scalar loop to finish any remaining iterations
    for (; len > 0; --len, ++ip, ++op) {
        *op = scalar_@intrin@(*ip);
    }
}

#if @supports_ncontig@
// contiguous input, non-contiguous output
static NPY_INLINE void
simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * UNROLL;
    // unrolled vector loop
    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
        /**begin repeat2
         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @U@
        npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
        #endif
        /**end repeat2**/
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
        npyv_@sfx@ v = npyv_load_@sfx@(ip);
        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
        npyv_storen_@sfx@(op, ostride, r);
    }
    // scalar loop to finish any remaining iterations
    for (; len > 0; --len, ++ip, op += ostride) {
        *op = scalar_@intrin@(*ip);
    }
}

// non-contiguous input, contiguous output
static NPY_INLINE void
simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
                             npyv_lanetype_@sfx@ *op,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * UNROLL;
    // unrolled vector loop
    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
        /**begin repeat2
         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @U@
        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
        npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
        #endif
        /**end repeat2**/
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
        npyv_store_@sfx@(op, r);
    }
    // scalar loop to finish any remaining iterations
    for (; len > 0; --len, ip += istride, ++op) {
        *op = scalar_@intrin@(*ip);
    }
}
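/*
 * Sketch of the strided intrinsics used by the non-contiguous kernels
 * (see simd/simd.h for the authoritative definitions): npyv_loadn gathers
 * lanes spaced `stride` elements apart, and npyv_storen scatters them back
 * the same way. E.g. for f32 on 128-bit SIMD:
 *
 *   npyv_f32 v = npyv_loadn_f32(p, 3);  // gathers p[0], p[3], p[6], p[9]
 *   npyv_storen_f32(q, 3, v);           // scatters to q[0], q[3], q[6], q[9]
 */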
// non-contiguous input and output
// limit unroll to 2x
#if UNROLL > 2
#undef UNROLL
#define UNROLL 2
#endif
static NPY_INLINE void
simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
                             npyv_lanetype_@sfx@ *op, npy_intp ostride,
                             npy_intp len)
{
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep * UNROLL;
    // unrolled vector loop
    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
        /**begin repeat2
         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @U@
        npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
        npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
        npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
        #endif
        /**end repeat2**/
    }
    // single vector loop
    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
        npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
        npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
        npyv_storen_@sfx@(op, ostride, r);
    }
    // scalar loop to finish any remaining iterations
    for (; len > 0; --len, ip += istride, op += ostride) {
        *op = scalar_@intrin@(*ip);
    }
}
#endif // @supports_ncontig@
#undef UNROLL
#endif // @simd_chk@
/**end repeat1**/
/**end repeat**/

/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/
/**begin repeat
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *         BYTE,  SHORT,  INT,  LONG,  LONGLONG,
 *         FLOAT, DOUBLE, LONGDOUBLE#
 *
 * #BTYPE = BYTE,  SHORT,  INT,  LONG,  LONGLONG,
 *          BYTE,  SHORT,  INT,  LONG,  LONGLONG,
 *          FLOAT, DOUBLE, LONGDOUBLE#
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_float, npy_double, npy_longdouble#
 *
 * #is_fp = 0*10, 1*3#
 * #is_unsigned = 1*5, 0*5, 0*3#
 * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
    #if @is_fp@
        #define TO_SIMD_SFX(X) X##_f@len@
        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
    #elif @is_unsigned@
        #define TO_SIMD_SFX(X) X##_u@len@
    #else
        #define TO_SIMD_SFX(X) X##_s@len@
    #endif
/**end repeat1**/
#endif

/**begin repeat1
 * #kind   = negative#
 * #intrin = negative#
 */
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    char *ip = args[0], *op = args[1];
    npy_intp istep = steps[0], ostep = steps[1],
             len = dimensions[0];
#ifdef TO_SIMD_SFX
    #undef STYPE
    #define STYPE TO_SIMD_SFX(npyv_lanetype)
    if (!is_mem_overlap(ip, istep, op, ostep, len)) {
        if (IS_UNARY_CONT(@type@, @type@)) {
            // no overlap and operands are contiguous
            TO_SIMD_SFX(simd_unary_cc_@intrin@)(
                (STYPE*)ip, (STYPE*)op, len
            );
            goto clear;
        }
    #if @supports_ncontig@
        const npy_intp istride = istep / sizeof(STYPE);
        const npy_intp ostride = ostep / sizeof(STYPE);
        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
            TO_SIMD_SFX(npyv_storable_stride)(ostride))
        {
            if (istride == 1 && ostride != 1) {
                // contiguous input, non-contiguous output
                TO_SIMD_SFX(simd_unary_cn_@intrin@)(
                    (STYPE*)ip, (STYPE*)op, ostride, len
                );
                goto clear;
            }
            else if (istride != 1 && ostride == 1) {
                // non-contiguous input, contiguous output
                TO_SIMD_SFX(simd_unary_nc_@intrin@)(
                    (STYPE*)ip, istride, (STYPE*)op, len
                );
                goto clear;
            }
        // on SSE2, the unrolled scalar path below performs better when
        // both operands are non-contiguous
        #if !defined(NPY_HAVE_SSE2)
            else if (istride != 1 && ostride != 1) {
                // non-contiguous input and output
                TO_SIMD_SFX(simd_unary_nn_@intrin@)(
                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
                );
                goto clear;
            }
        #endif
        }
    #endif // @supports_ncontig@
    }
#endif // TO_SIMD_SFX
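/*
 * Scalar fallback: reached when no SIMD path applies (no SIMD support for
 * this lane type, overlapping operands, or strides the strided intrinsics
 * cannot handle). Note the loops below step in *bytes*: e.g. negating
 * a[::2] of a contiguous double array arrives with istep = 16 bytes,
 * which the SIMD dispatch above would have expressed as istride = 2
 * elements.
 */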
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * scalar unrolls
     * 8x unroll performed best on
     *  - Apple M1 Native / arm64
     *  - Apple M1 Rosetta / SSE42
     *  - iMacPro / AVX512
     */
    #define UNROLL 8
    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
        /**begin repeat2
         * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
         */
        #if UNROLL > @U@
        const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
        *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
        #endif
        /**end repeat2**/
    }
    #undef UNROLL
#endif // NPY_DISABLE_OPTIMIZATION
    for (; len > 0; --len, ip += istep, op += ostep) {
        *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
    }
#ifdef TO_SIMD_SFX
clear:
    npyv_cleanup();
#endif
#if @is_fp@
    npy_clear_floatstatus_barrier((char*)dimensions);
#endif
}
/**end repeat1**/
/**end repeat**/
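/*
 * Expansion note (illustrative): for each @TYPE@ the template above emits
 * one CPU-dispatchable inner loop; e.g. the FLOAT instance expands roughly
 * to
 *
 *   NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative)
 *   (char **args, npy_intp const *dimensions, npy_intp const *steps,
 *    void *NPY_UNUSED(func))
 *
 * built once per target listed in the @targets block at the top of this
 * file, with the best available variant selected at runtime by NumPy's
 * CPU-dispatch machinery.
 */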