/* -*- c -*- */
/*@targets
 ** $maxopt baseline
 ** sse2 sse42 xop avx2 avx512_skx
 ** vsx2
 ** neon asimd
 ** vx vxe
 **/
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "numpy/npy_math.h"

#include "arraytypes.h"

#define MIN(a,b) (((a)<(b))?(a):(b))

#if NPY_SIMD
#if NPY_SIMD > 512 || NPY_SIMD < 0
    #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD"
    // TODO: add a special loop for large SIMD widths,
    // i.e. avoiding the x4 unroll should be numerically safe up to a 2048-bit
    // SIMD width, or the indices could be expanded to 32/64-bit vectors (slower).
#endif
/**begin repeat
 * #sfx = u8, s8, u16, s16#
 * #usfx = u8, u8, u16, u16#
 * #bsfx = b8, b8, b16, b16#
 * #idx_max = NPY_MAX_UINT8*2, NPY_MAX_UINT16*2#
 */
/**begin repeat1
 * #intrin = cmpgt, cmplt#
 * #func = argmax, argmin#
 * #op = >, <#
 */
static inline npy_intp
simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
{
    npyv_lanetype_@sfx@ s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep*4;
    npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices);
    const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep);
    const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2);
    const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3);

    const npy_intp max_block = @idx_max@*wstep & -wstep;
    npy_intp len0 = len & -wstep;

    while (i < len0) {
        npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc);
        npyv_@usfx@ acc_indices = npyv_zero_@usfx@();
        npyv_@usfx@ acc_indices_scale = npyv_zero_@usfx@();

        npy_intp n = i + MIN(len0 - i, max_block);
        npy_intp ik = i, i2 = 0;
        for (; i < n; i += wstep, ++i2) {
            npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i2);
            npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
            npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep);
            npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2);
            npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3);
            // reverse to put the lowest index first in case of matched values
            npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a);
            npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c);
            npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a);
            npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c);
            npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba);
            npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba);

            npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0);
            npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2);
            npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba);

            npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc);
            acc = npyv_select_@sfx@(m_acc, x_dcba, acc);
            acc_indices = npyv_select_@usfx@(m_acc, idx_dcba, acc_indices);
            acc_indices_scale = npyv_select_@usfx@(m_acc, vi, acc_indices_scale);
        }
        // reduce
        npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@];
        npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@];
        npyv_lanetype_@usfx@ dacc_s[npyv_nlanes_@sfx@];
        npyv_store_@sfx@(dacc, acc);
        npyv_store_@usfx@(dacc_i, acc_indices);
        npyv_store_@usfx@(dacc_s, acc_indices_scale);

        for (int vi = 0; vi < vstep; ++vi) {
            if (dacc[vi] @op@ s_acc) {
                s_acc = dacc[vi];
                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
            }
        }
        // get the lowest index in case of matched values
        for (int vi = 0; vi < vstep; ++vi) {
            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
            if (s_acc == dacc[vi] && ret_idx > idx) {
                ret_idx = idx;
            }
        }
    }
    for (; i < len; ++i) {
        npyv_lanetype_@sfx@ a = ip[i];
        if (a @op@ s_acc) {
            s_acc = a;
            ret_idx = i;
        }
    }
    return ret_idx;
}
/**end repeat1**/
/**end repeat**/
#endif // NPY_SIMD
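/*
 * Note on the 8/16-bit kernels above: both the in-block lane index
 * (acc_indices) and the unrolled-step counter (acc_indices_scale) are kept in
 * 8/16-bit lanes, so a block is limited to max_block elements. The outer
 * while-loop restarts the accumulators for every block, and the reduction
 * rebuilds the absolute position as ik + scale*wstep + lane_index.
 */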
/**begin repeat
 * #sfx = u32, s32, u64, s64, f32, f64#
 * #usfx = u32, u32, u64, u64, u32, u64#
 * #bsfx = b32, b32, b64, b64, b32, b64#
 * #is_fp = 0*4, 1*2#
 * #is_idx32 = 1*2, 0*2, 1, 0#
 * #chk_simd = NPY_SIMD*4, NPY_SIMD_F32, NPY_SIMD_F64#
 */
#if @chk_simd@
/**begin repeat1
 * #intrin = cmpgt, cmplt#
 * #func = argmax, argmin#
 * #op = >, <#
 * #iop = <, >#
 */
static inline npy_intp
simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len)
{
    npyv_lanetype_@sfx@ s_acc = *ip;
    npy_intp ret_idx = 0, i = 0;
    const int vstep = npyv_nlanes_@sfx@;
    const int wstep = vstep*4;
    // a plain scalar loop performs better for small arrays
    if (len < wstep) {
        goto scalar_loop;
    }
    npy_intp len0 = len;
    // guard against wraparound in the vector index addition for 32-bit indices,
    // in case the array is larger than 16 GB
#if @is_idx32@
    if (len0 > NPY_MAX_UINT32) {
        len0 = NPY_MAX_UINT32;
    }
#endif
    // set up the per-lane index constants
    npyv_lanetype_@usfx@ d_vindices[npyv_nlanes_@sfx@*4];
    for (int vi = 0; vi < wstep; ++vi) {
        d_vindices[vi] = vi;
    }
    const npyv_@usfx@ vindices_0 = npyv_load_@usfx@(d_vindices);
    const npyv_@usfx@ vindices_1 = npyv_load_@usfx@(d_vindices + vstep);
    const npyv_@usfx@ vindices_2 = npyv_load_@usfx@(d_vindices + vstep*2);
    const npyv_@usfx@ vindices_3 = npyv_load_@usfx@(d_vindices + vstep*3);
    // initialize the vector accumulators for the current best values and their indices
    npyv_@usfx@ acc_indices = npyv_zero_@usfx@();
    npyv_@sfx@ acc = npyv_setall_@sfx@(s_acc);

    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {
        npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i);
        npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
        npyv_@sfx@ b = npyv_load_@sfx@(ip + i + vstep);
        npyv_@sfx@ c = npyv_load_@sfx@(ip + i + vstep*2);
        npyv_@sfx@ d = npyv_load_@sfx@(ip + i + vstep*3);
        // reverse to put the lowest index first in case of matched values
        npyv_@bsfx@ m_ba = npyv_@intrin@_@sfx@(b, a);
        npyv_@bsfx@ m_dc = npyv_@intrin@_@sfx@(d, c);
        npyv_@sfx@ x_ba = npyv_select_@sfx@(m_ba, b, a);
        npyv_@sfx@ x_dc = npyv_select_@sfx@(m_dc, d, c);
        npyv_@bsfx@ m_dcba = npyv_@intrin@_@sfx@(x_dc, x_ba);
        npyv_@sfx@ x_dcba = npyv_select_@sfx@(m_dcba, x_dc, x_ba);

        npyv_@usfx@ idx_ba = npyv_select_@usfx@(m_ba, vindices_1, vindices_0);
        npyv_@usfx@ idx_dc = npyv_select_@usfx@(m_dc, vindices_3, vindices_2);
        npyv_@usfx@ idx_dcba = npyv_select_@usfx@(m_dcba, idx_dc, idx_ba);

        npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(x_dcba, acc);
        acc = npyv_select_@sfx@(m_acc, x_dcba, acc);
        acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, idx_dcba), acc_indices);

    #if @is_fp@
        npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
        npyv_@bsfx@ nnan_b = npyv_notnan_@sfx@(b);
        npyv_@bsfx@ nnan_c = npyv_notnan_@sfx@(c);
        npyv_@bsfx@ nnan_d = npyv_notnan_@sfx@(d);
        npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b);
        npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d);
        npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd));
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            npy_uint64 nnan_4[4];
            nnan_4[0] = npyv_tobits_@bsfx@(nnan_a);
            nnan_4[1] = npyv_tobits_@bsfx@(nnan_b);
            nnan_4[2] = npyv_tobits_@bsfx@(nnan_c);
            nnan_4[3] = npyv_tobits_@bsfx@(nnan_d);
            for (int ni = 0; ni < 4; ++ni) {
                for (int vi = 0; vi < vstep; ++vi) {
                    if (!((nnan_4[ni] >> vi) & 1)) {
                        return i + ni*vstep + vi;
                    }
                }
            }
        }
    #endif
    }
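    // handle the tail that no longer fills a full 4x-unrolled block,
    // one vector at a time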
    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {
        npyv_@usfx@ vi = npyv_setall_@usfx@((npyv_lanetype_@usfx@)i);
        npyv_@sfx@ a = npyv_load_@sfx@(ip + i);
        npyv_@bsfx@ m_acc = npyv_@intrin@_@sfx@(a, acc);
        acc = npyv_select_@sfx@(m_acc, a, acc);
        acc_indices = npyv_select_@usfx@(m_acc, npyv_add_@usfx@(vi, vindices_0), acc_indices);
    #if @is_fp@
        npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a);
        npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a);
        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
            for (int vi = 0; vi < vstep; ++vi) {
                if (!((nnan >> vi) & 1)) {
                    return i + vi;
                }
            }
        }
    #endif
    }

    // reduce
    npyv_lanetype_@sfx@ dacc[npyv_nlanes_@sfx@];
    npyv_lanetype_@usfx@ dacc_i[npyv_nlanes_@sfx@];
    npyv_store_@usfx@(dacc_i, acc_indices);
    npyv_store_@sfx@(dacc, acc);

    s_acc = dacc[0];
    ret_idx = dacc_i[0];
    for (int vi = 1; vi < vstep; ++vi) {
        if (dacc[vi] @op@ s_acc) {
            s_acc = dacc[vi];
            ret_idx = (npy_intp)dacc_i[vi];
        }
    }
    // get the lowest index in case of matched values
    for (int vi = 0; vi < vstep; ++vi) {
        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
            ret_idx = dacc_i[vi];
        }
    }
scalar_loop:
    for (; i < len; ++i) {
        npyv_lanetype_@sfx@ a = ip[i];
    #if @is_fp@
        if (!(a @iop@= s_acc)) {  // negated, for correct nan handling
    #else
        if (a @op@ s_acc) {
    #endif
            s_acc = a;
            ret_idx = i;
        #if @is_fp@
            if (npy_isnan(s_acc)) {
                // nan encountered, it's maximal|minimal
                return ret_idx;
            }
        #endif
        }
    }
    return ret_idx;
}
/**end repeat1**/
#endif // chk_simd
/**end repeat**/

/**begin repeat
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *         BYTE, SHORT, INT, LONG, LONGLONG,
 *         FLOAT, DOUBLE, LONGDOUBLE#
 *
 * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
 *          BYTE, SHORT, INT, LONG, LONGLONG,
 *          FLOAT, DOUBLE, LONGDOUBLE#
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
 *         npy_float, npy_double, npy_longdouble#
 *
 * #is_fp = 0*10, 1*3#
 * #is_unsigned = 1*5, 0*5, 0*3#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
    #if @is_fp@
        #define TO_SIMD_SFX(X) X##_f@len@
        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
            #undef TO_SIMD_SFX
        #endif
        #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
            #undef TO_SIMD_SFX
        #endif
    #elif @is_unsigned@
        #define TO_SIMD_SFX(X) X##_u@len@
    #else
        #define TO_SIMD_SFX(X) X##_s@len@
    #endif
/**end repeat1**/
#endif

/**begin repeat1
 * #func = argmax, argmin#
 * #op = >, <#
 * #iop = <, >#
 */
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
(@type@ *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
#if @is_fp@
    if (npy_isnan(*ip)) {
        // nan encountered; it's maximal|minimal
        *mindx = 0;
        return 0;
    }
#endif
#ifdef TO_SIMD_SFX
    *mindx = TO_SIMD_SFX(simd_@func@)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
    npyv_cleanup();
#else
    @type@ mp = *ip;
    *mindx = 0;
    npy_intp i = 1;

    for (; i < n; ++i) {
        @type@ a = ip[i];
    #if @is_fp@
        if (!(a @iop@= mp)) {  // negated, for correct nan handling
    #else
        if (a @op@ mp) {
    #endif
            mp = a;
            *mindx = i;
        #if @is_fp@
            if (npy_isnan(mp)) {
                // nan encountered, it's maximal|minimal
                break;
            }
        #endif
        }
    }
#endif // TO_SIMD_SFX
    return 0;
}
/**end repeat1**/
/**end repeat**/
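/*
 * BOOL_argmax: any nonzero byte counts as True, so the vector loop only looks
 * for the first 4-vector block that contains a nonzero element; the trailing
 * scalar loop then pins down its exact index.
 */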
NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax)
(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
{
    npy_intp i = 0;
#if NPY_SIMD
    const npyv_u8 zero = npyv_zero_u8();
    const int vstep = npyv_nlanes_u8;
    const int wstep = vstep * 4;
    for (npy_intp n = len & -wstep; i < n; i += wstep) {
        npyv_u8 a = npyv_load_u8(ip + i + vstep*0);
        npyv_u8 b = npyv_load_u8(ip + i + vstep*1);
        npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
        npyv_u8 d = npyv_load_u8(ip + i + vstep*3);
        npyv_b8 m_a = npyv_cmpeq_u8(a, zero);
        npyv_b8 m_b = npyv_cmpeq_u8(b, zero);
        npyv_b8 m_c = npyv_cmpeq_u8(c, zero);
        npyv_b8 m_d = npyv_cmpeq_u8(d, zero);
        npyv_b8 m_ab = npyv_and_b8(m_a, m_b);
        npyv_b8 m_cd = npyv_and_b8(m_c, m_d);
        npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd));
    #if NPY_SIMD == 512
        if (m != NPY_MAX_UINT64) {
    #else
        if ((npy_int64)m != ((1LL << vstep) - 1)) {
    #endif
            break;
        }
    }
    npyv_cleanup();
#endif // NPY_SIMD
    for (; i < len; ++i) {
        if (ip[i]) {
            *mindx = i;
            return 0;
        }
    }
    *mindx = 0;
    return 0;
}
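/*
 * Rough caller-side sketch (illustrative only, assumed shape, not part of this
 * file): once CPU dispatch has resolved an entry point defined above, it is
 * called with the signature shown here, e.g. for a float input:
 *
 *     npy_intp idx = 0;
 *     FLOAT_argmax(data, n, &idx, NULL);  // idx receives the position of the
 *                                         // first maximal element (NaN counts
 *                                         // as maximal)
 *
 * The actual resolution and call path live in NumPy's dispatch machinery.
 */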