#ifndef _NPY_UMATH_LOOPS_UTILS_H_
#define _NPY_UMATH_LOOPS_UTILS_H_

#include "numpy/npy_common.h" // NPY_FINLINE
#include "numpy/halffloat.h" // npy_half_to_float
#include <assert.h> // assert, used by the complex pairwise summation below

/**
 * Old versions of MSVC cause ambiguous link errors when we deal with large
 * SIMD kernels, which breaks the build. This is probably related to the
 * following bug:
 * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html
 */
#if defined(_MSC_VER) && _MSC_VER < 1916
    #define SIMD_MSVC_NOINLINE __declspec(noinline)
#else
    #define SIMD_MSVC_NOINLINE
#endif

/*
 * nomemoverlap - returns false if two strided arrays have an overlapping
 * region in memory. ip_size/op_size = size of the arrays, which can be
 * negative, indicating negative steps. Two arrays that occupy exactly the
 * same region (same start and end) are treated as non-overlapping.
 */
NPY_FINLINE npy_bool
nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
{
    char *ip_start, *ip_end, *op_start, *op_end;
    if (ip_size < 0) {
        ip_start = ip + ip_size;
        ip_end = ip;
    }
    else {
        ip_start = ip;
        ip_end = ip + ip_size;
    }
    if (op_size < 0) {
        op_start = op + op_size;
        op_end = op;
    }
    else {
        op_start = op;
        op_end = op + op_size;
    }
    return (ip_start == op_start && op_end == ip_end) ||
           (ip_start > op_end) || (op_start > ip_end);
}

// returns true if two strided arrays have an overlapping region in memory
// same as `nomemoverlap()` but takes the array length and step sizes
NPY_FINLINE npy_bool
is_mem_overlap(const void *src, npy_intp src_step, const void *dst,
               npy_intp dst_step, npy_intp len)
{
    return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
}
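/*
 * Illustrative sketch (not part of this header's API): how a loop body might
 * use `is_mem_overlap()` to decide whether a vectorized kernel may write the
 * output while still reading the input. The signature is simplified, and
 * `run_scalar_fallback` / `run_simd_kernel` are hypothetical placeholders,
 * not NumPy functions.
 *
 *     void example_unary_loop(char **args, npy_intp const *dimensions,
 *                             npy_intp const *steps)
 *     {
 *         char *src = args[0], *dst = args[1];
 *         npy_intp len = dimensions[0];
 *         if (is_mem_overlap(src, steps[0], dst, steps[1], len)) {
 *             // input and output partially alias: use a scalar loop that is
 *             // safe for overlapping operands
 *             run_scalar_fallback(src, steps[0], dst, steps[1], len);
 *         }
 *         else {
 *             // disjoint (or exactly aliased) operands: SIMD path is safe
 *             run_simd_kernel(src, steps[0], dst, steps[1], len);
 *         }
 *     }
 */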
/*
 * Cutoff blocksize for pairwise summation.
 * Decreasing it decreases errors slightly as more pairs are summed, but it
 * also lowers performance. As the inner loop is unrolled eight times, the
 * effective blocksize is 16.
 */
#define PW_BLOCKSIZE    128

/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble, npy_float#
 *  #dtype = npy_float, npy_double, npy_longdouble, npy_half#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF#
 *  #c = f, , l, #
 *  #C = F, , L, #
 *  #trf = , , , npy_half_to_float#
 */

/*
 * Pairwise summation, rounding error O(lg n) instead of O(n).
 * The recursion depth is O(lg n) as well.
 * For HALF, @trf@ (npy_half_to_float) widens each element before accumulation.
 * When updating, also update the similar complex floats summation below.
 */
static inline @type@
@TYPE@_pairwise_sum(char *a, npy_intp n, npy_intp stride)
{
    if (n < 8) {
        npy_intp i;
        /*
         * Start with -0 to preserve -0 values. The reason is that summing
         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
         */
        @type@ res = -0.0;
        for (i = 0; i < n; i++) {
            res += @trf@(*((@dtype@*)(a + i * stride)));
        }
        return res;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @type@ r[8], res;

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization
         * with avx without changing summation ordering
         */
        r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
        r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
        r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
        r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
        r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
        r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
        r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
        r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seem to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@dtype@))*stride, 0, 3);
            r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
            r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
            r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
            r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
            r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
            r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
            r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
            r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        res = ((r[0] + r[1]) + (r[2] + r[3])) +
              ((r[4] + r[5]) + (r[6] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i++) {
            res += @trf@(*((@dtype@ *)(a + i * stride)));
        }
        return res;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        npy_intp n2 = n / 2;
        n2 -= n2 % 8;
        return @TYPE@_pairwise_sum(a, n2, stride) +
               @TYPE@_pairwise_sum(a + n2 * stride, n - n2, stride);
    }
}

/**end repeat**/
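/*
 * A minimal sketch of the same pairwise scheme for a contiguous double array,
 * kept here only to make the recursion structure concrete. It is illustrative
 * and not used anywhere; the templated functions above and below are the real
 * implementations and additionally handle strides, half floats and the
 * 8-accumulator block. The name `pairwise_sketch` is hypothetical.
 *
 *     static double
 *     pairwise_sketch(const double *a, npy_intp n)
 *     {
 *         if (n <= 8) {                       // small leaf: plain loop
 *             double res = -0.0;
 *             for (npy_intp i = 0; i < n; i++) {
 *                 res += a[i];
 *             }
 *             return res;
 *         }
 *         npy_intp n2 = n / 2;
 *         n2 -= n2 % 8;                       // keep splits unroll-aligned
 *         return pairwise_sketch(a, n2) + pairwise_sketch(a + n2, n - n2);
 *     }
 *
 * Each element passes through O(lg n) additions, so the worst-case rounding
 * error grows like O(lg n) rather than the O(n) of a naive running sum.
 */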
/**begin repeat
 * complex types
 *  #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE#
 *  #ftype = npy_float, npy_double, npy_longdouble#
 *  #c = f, , l#
 *  #C = F, , L#
 *  #SIMD = 1, 1, 0#
 */

/* similar to pairwise sum of real floats */
static inline void
@TYPE@_pairwise_sum(@ftype@ *rr, @ftype@ *ri, char *a, npy_intp n,
                    npy_intp stride)
{
    assert(n % 2 == 0);
    if (n < 8) {
        npy_intp i;

        *rr = -0.0;
        *ri = -0.0;
        for (i = 0; i < n; i += 2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else if (n <= PW_BLOCKSIZE) {
        npy_intp i;
        @ftype@ r[8];

        /*
         * sum a block with 8 accumulators
         * 8 times unroll reduces blocksize to 16 and allows vectorization
         * with avx without changing summation ordering
         * even-indexed accumulators hold real parts, odd-indexed ones
         * imaginary parts
         */
        r[0] = *((@ftype@ *)(a + 0 * stride));
        r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
        r[2] = *((@ftype@ *)(a + 2 * stride));
        r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
        r[4] = *((@ftype@ *)(a + 4 * stride));
        r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
        r[6] = *((@ftype@ *)(a + 6 * stride));
        r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));

        for (i = 8; i < n - (n % 8); i += 8) {
            /* small blocksizes seem to mess with hardware prefetch */
            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(@ftype@))*stride, 0, 3);
            r[0] += *((@ftype@ *)(a + (i + 0) * stride));
            r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
            r[2] += *((@ftype@ *)(a + (i + 2) * stride));
            r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
            r[4] += *((@ftype@ *)(a + (i + 4) * stride));
            r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
            r[6] += *((@ftype@ *)(a + (i + 6) * stride));
            r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
        }

        /* accumulate now to avoid stack spills for single peel loop */
        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
        *ri = ((r[1] + r[3]) + (r[5] + r[7]));

        /* do non multiple of 8 rest */
        for (; i < n; i += 2) {
            *rr += *((@ftype@ *)(a + i * stride + 0));
            *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
        }
        return;
    }
    else {
        /* divide by two but avoid non-multiples of unroll factor */
        @ftype@ rr1, ri1, rr2, ri2;
        npy_intp n2 = n / 2;

        n2 -= n2 % 8;
        @TYPE@_pairwise_sum(&rr1, &ri1, a, n2, stride);
        @TYPE@_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
        *rr = rr1 + rr2;
        *ri = ri1 + ri2;
        return;
    }
}

/**end repeat**/

#endif // _NPY_UMATH_LOOPS_UTILS_H_