/*@targets
 ** baseline vsx4
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

#define DIVIDEBYZERO_OVERFLOW_CHECK(x, y, min_val, signed) \
    (NPY_UNLIKELY(                                         \
        (signed) ?                                         \
        ((y == 0) || ((x == min_val) && (y == -1))) :      \
        (y == 0))                                          \
    )

#define FLAG_IF_DIVIDEBYZERO(x) do {     \
    if (NPY_UNLIKELY(x == 0)) {          \
        npy_set_floatstatus_divbyzero(); \
    }                                    \
} while (0)

#if NPY_SIMD && defined(NPY_HAVE_VSX4)
typedef struct {
    npyv_u32x2 hi;
    npyv_u32x2 lo;
} vsx4_u32x4;

typedef struct {
    npyv_s32x2 hi;
    npyv_s32x2 lo;
} vsx4_s32x4;

// Converts 1 8-bit vector into 2 16-bit vectors
NPY_FINLINE npyv_s16x2
vsx4_expand_s16_s8(npyv_s8 data)
{
    npyv_s16x2 r;
    r.val[0] = vec_unpackh(data);
    r.val[1] = vec_unpackl(data);
    return r;
}

// Converts 1 16-bit vector into 2 32-bit vectors
NPY_FINLINE npyv_s32x2
vsx4_expand_s32_s16(npyv_s16 data)
{
    npyv_s32x2 r;
    r.val[0] = vec_unpackh(data);
    r.val[1] = vec_unpackl(data);
    return r;
}

/**begin repeat
 * #t = u, s#
 * #expand = npyv_expand, vsx4_expand#
 */
// Converts 1 8-bit vector into 4 32-bit vectors
NPY_FINLINE vsx4_@t@32x4
vsx4_expand_@t@32_@t@8(npyv_@t@8 data)
{
    vsx4_@t@32x4 r;
    npyv_@t@16x2 expand = @expand@_@t@16_@t@8(data);
    r.hi = @expand@_@t@32_@t@16(expand.val[0]);
    r.lo = @expand@_@t@32_@t@16(expand.val[1]);
    return r;
}

/**begin repeat1
 * #simd = div, mod#
 */
/*
 * Computes division/modulo of 2 8-bit signed/unsigned integer vectors
 *
 * As Power10 only supports integer vector division/modulo for data of 32 bits
 * or greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
 * vector division/modulo instruction, and then convert the result back to
 * npyv_u8.
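 *
 * For example, for 8-bit lanes (a sketch; lane counts assume 128-bit VSX
 * vectors):
 *   a: npyv_u8 (16 lanes) --vsx4_expand_u32_u8--> 4x npyv_u32 (4 lanes each)
 *   b: npyv_u8 (16 lanes) --vsx4_expand_u32_u8--> 4x npyv_u32 (4 lanes each)
 *   4x vec_div/vec_mod on the widened pairs, then
 *   2x vec_pack (u32 -> u16) and 1x vec_pack (u16 -> u8) to narrow back.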
 */
NPY_FINLINE npyv_@t@8
vsx4_@simd@_@t@8(npyv_@t@8 a, npyv_@t@8 b)
{
    vsx4_@t@32x4 a_expand = vsx4_expand_@t@32_@t@8(a);
    vsx4_@t@32x4 b_expand = vsx4_expand_@t@32_@t@8(b);
    npyv_@t@32 v1 = vec_@simd@(a_expand.hi.val[0], b_expand.hi.val[0]);
    npyv_@t@32 v2 = vec_@simd@(a_expand.hi.val[1], b_expand.hi.val[1]);
    npyv_@t@32 v3 = vec_@simd@(a_expand.lo.val[0], b_expand.lo.val[0]);
    npyv_@t@32 v4 = vec_@simd@(a_expand.lo.val[1], b_expand.lo.val[1]);
    npyv_@t@16 hi = vec_pack(v1, v2);
    npyv_@t@16 lo = vec_pack(v3, v4);
    return vec_pack(hi, lo);
}

NPY_FINLINE npyv_@t@8
vsx4_@simd@_scalar_@t@8(npyv_@t@8 a, const vsx4_@t@32x4 b_expand)
{
    vsx4_@t@32x4 a_expand = vsx4_expand_@t@32_@t@8(a);
    npyv_@t@32 v1 = vec_@simd@(a_expand.hi.val[0], b_expand.hi.val[0]);
    npyv_@t@32 v2 = vec_@simd@(a_expand.hi.val[1], b_expand.hi.val[1]);
    npyv_@t@32 v3 = vec_@simd@(a_expand.lo.val[0], b_expand.lo.val[0]);
    npyv_@t@32 v4 = vec_@simd@(a_expand.lo.val[1], b_expand.lo.val[1]);
    npyv_@t@16 hi = vec_pack(v1, v2);
    npyv_@t@16 lo = vec_pack(v3, v4);
    return vec_pack(hi, lo);
}

NPY_FINLINE npyv_@t@16
vsx4_@simd@_@t@16(npyv_@t@16 a, npyv_@t@16 b)
{
    npyv_@t@32x2 a_expand = @expand@_@t@32_@t@16(a);
    npyv_@t@32x2 b_expand = @expand@_@t@32_@t@16(b);
    npyv_@t@32 v1 = vec_@simd@(a_expand.val[0], b_expand.val[0]);
    npyv_@t@32 v2 = vec_@simd@(a_expand.val[1], b_expand.val[1]);
    return vec_pack(v1, v2);
}

NPY_FINLINE npyv_@t@16
vsx4_@simd@_scalar_@t@16(npyv_@t@16 a, const npyv_@t@32x2 b_expand)
{
    npyv_@t@32x2 a_expand = @expand@_@t@32_@t@16(a);
    npyv_@t@32 v1 = vec_@simd@(a_expand.val[0], b_expand.val[0]);
    npyv_@t@32 v2 = vec_@simd@(a_expand.val[1], b_expand.val[1]);
    return vec_pack(v1, v2);
}

#define vsx4_@simd@_@t@32 vec_@simd@
#define vsx4_@simd@_@t@64 vec_@simd@
#define vsx4_@simd@_scalar_@t@32 vec_@simd@
#define vsx4_@simd@_scalar_@t@64 vec_@simd@
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #sfx    = u8,  u16, s8,  s16#
 * #osfx   = u32, u32, s32, s32#
 * #otype  = vsx4_u32x4, npyv_u32x2, vsx4_s32x4, npyv_s32x2#
 * #expand = vsx4_expand, npyv_expand, vsx4_expand, vsx4_expand#
 */
// Generates the divisor for the division/modulo operations
NPY_FINLINE @otype@
vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar)
{
    return @expand@_@osfx@_@sfx@(vscalar);
}
/**end repeat**/

/**begin repeat
 * #sfx = u32, u64, s32, s64#
 */
NPY_FINLINE npyv_@sfx@
vsx4_divisor_@sfx@(const npyv_@sfx@ vscalar)
{
    return vscalar;
}
/**end repeat**/

/**begin repeat
 * Unsigned types
 * #sfx = u8, u16, u32, u64#
 * #len = 8,  16,  32,  64#
 * #divtype = vsx4_u32x4, npyv_u32x2, npyv_u32, npyv_u64#
 */
/**begin repeat1
 * #func = fmod, remainder, divmod#
 * #id   = 0,    1,         2#
 */
static inline void
vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2];
    const npyv_@sfx@ vzero    = npyv_zero_@sfx@();
    const int vstep           = npyv_nlanes_@sfx@;
#if @id@ == 2 /* divmod */
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];
    npyv_b@len@ warn          = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep, dst2 += vstep) {
        npyv_@sfx@ a      = npyv_load_@sfx@(src1);
        npyv_@sfx@ b      = npyv_load_@sfx@(src2);
        npyv_@sfx@ quo    = vsx4_div_@sfx@(a, b);
        npyv_@sfx@ rem    = npyv_sub_@sfx@(a, vec_mul(b, quo));
        npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero);
        // when b is 0, forces the remainder to be 0 too
        rem  = npyv_select_@sfx@(bzero, vzero, rem);
        warn = npyv_or_@sfx@(bzero, warn);
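        // `warn` accumulates zero-divisor lanes so the divide-by-zero FP
        // status is raised once after the vector loop, not per iteration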
        npyv_store_@sfx@(dst1, quo);
        npyv_store_@sfx@(dst2, rem);
    }

    if (!vec_all_eq(warn, vzero)) {
        npy_set_floatstatus_divbyzero();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
            *dst2 = 0;
        }
        else {
            *dst1 = a / b;
            *dst2 = a % b;
        }
    }
#else /* fmod and remainder */
    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_@sfx@ a = npyv_load_@sfx@(src1);
        npyv_@sfx@ b = npyv_load_@sfx@(src2);
        npyv_@sfx@ c = vsx4_mod_@sfx@(a, b);
        npyv_store_@sfx@(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        }
        else {
            *dst1 = a % b;
        }
    }
#endif
    npyv_cleanup();
}

static inline void
vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1  = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1  = (npyv_lanetype_@sfx@ *) args[2];
    const int vstep            = npyv_nlanes_@sfx@;
    const npyv_@sfx@ vscalar   = npyv_setall_@sfx@(scalar);
    const @divtype@ divisor    = vsx4_divisor_@sfx@(vscalar);
#if @id@ == 2 /* divmod */
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
         dst2 += vstep) {
        npyv_@sfx@ a   = npyv_load_@sfx@(src1);
        npyv_@sfx@ quo = vsx4_div_scalar_@sfx@(a, divisor);
        npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(vscalar, quo));
        npyv_store_@sfx@(dst1, quo);
        npyv_store_@sfx@(dst2, rem);
    }

    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
        *dst1 = a / scalar;
        *dst2 = a % scalar;
    }
#else /* fmod and remainder */
    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
        npyv_@sfx@ a = npyv_load_@sfx@(src1);
        npyv_@sfx@ c = vsx4_mod_scalar_@sfx@(a, divisor);
        npyv_store_@sfx@(dst1, c);
    }

    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        *dst1 = a % scalar;
    }
#endif
    npyv_cleanup();
}
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * Signed types
 * #sfx = s8, s16, s32, s64#
 * #len = 8,  16,  32,  64#
 * #divtype = vsx4_s32x4, npyv_s32x2, npyv_s32, npyv_s64#
 */
/**begin repeat1
 * #func = fmod, remainder, divmod#
 * #id   = 0,    1,         2#
 */
static inline void
vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2];
    const npyv_@sfx@ vzero    = npyv_zero_@sfx@();
    const int vstep           = npyv_nlanes_@sfx@;
#if @id@ == 2 /* divmod */
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];
    const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
    const npyv_@sfx@ vmin     = npyv_setall_@sfx@(NPY_MIN_INT@len@);
    npyv_b@len@ warn_zero     = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
    npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep, dst2 += vstep) {
#else /* fmod and remainder */
    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
#endif
        npyv_@sfx@ a = npyv_load_@sfx@(src1);
        npyv_@sfx@ b = npyv_load_@sfx@(src2);
#if @id@ <= 1 /* fmod and remainder */
        npyv_@sfx@ rem = vsx4_mod_@sfx@(a, b);
#else /* divmod */
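        // derive the remainder from the quotient (rem = a - b * quo) so each
        // widened lane needs only one hardware division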
        npyv_@sfx@ quo = vsx4_div_@sfx@(a, b);
        npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1))
        npyv_b@len@ bzero    = npyv_cmpeq_@sfx@(b, vzero);
        npyv_b@len@ amin     = npyv_cmpeq_@sfx@(a, vmin);
        npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one);
        npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
        warn_zero     = npyv_or_@sfx@(bzero, warn_zero);
        warn_overflow = npyv_or_@sfx@(overflow, warn_overflow);
#endif
#if @id@ >= 1 /* remainder and divmod */
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b@len@ a_gt_zero  = npyv_cmpgt_@sfx@(a, vzero);
        npyv_b@len@ b_gt_zero  = npyv_cmpgt_@sfx@(b, vzero);
        npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero);
        npyv_b@len@ rem_zero   = npyv_cmpeq_@sfx@(rem, vzero);
        npyv_b@len@ or         = npyv_or_@sfx@(ab_eq_cond, rem_zero);
        npyv_@sfx@ to_add      = npyv_select_@sfx@(or, vzero, b);
        rem = npyv_add_@sfx@(rem, to_add);
#endif
#if @id@ == 2 /* divmod */
        npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one);
        quo = npyv_add_@sfx@(quo, to_sub);
        // Divide by zero
        quo = npyv_select_@sfx@(bzero, vzero, quo);
        rem = npyv_select_@sfx@(bzero, vzero, rem);
        // Overflow
        quo = npyv_select_@sfx@(overflow, vmin, quo);
        rem = npyv_select_@sfx@(overflow, vzero, rem);
        npyv_store_@sfx@(dst1, quo);
        npyv_store_@sfx@(dst2, rem);
#else /* fmod and remainder */
        npyv_store_@sfx@(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
#endif
    }

#if @id@ == 2 /* divmod */
    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT@len@, NPY_TRUE)) {
            if (b == 0) {
                npy_set_floatstatus_divbyzero();
                *dst1 = 0;
                *dst2 = 0;
            }
            else {
                npy_set_floatstatus_overflow();
                *dst1 = NPY_MIN_INT@len@;
                *dst2 = 0;
            }
        }
        else {
            *dst1 = a / b;
            *dst2 = a % b;
            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
                *dst1 -= 1;
                *dst2 += b;
            }
        }
    }
#else /* fmod and remainder */
    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT@len@, NPY_TRUE)) {
            FLAG_IF_DIVIDEBYZERO(b);
            *dst1 = 0;
        }
        else {
            *dst1 = a % b;
#if @id@ == 1 /* remainder */
            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
                *dst1 += b;
            }
#endif
        }
    }
#endif
    npyv_cleanup();
}
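/*
 * Worked example of the mixed-sign fixup above: C's truncated division
 * gives -7 / 3 == -2 and -7 % 3 == -1, while Python's floored division
 * gives -7 // 3 == -3 and -7 % 3 == 2. When the signs of a and b differ
 * and the remainder is nonzero, subtracting 1 from the quotient and adding
 * b to the remainder converts the former into the latter:
 * (-2 - 1, -1 + 3) == (-3, 2).
 */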
static inline void
vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1  = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1  = (npyv_lanetype_@sfx@ *) args[2];
    const npyv_@sfx@ vscalar   = npyv_setall_@sfx@(scalar);
    const @divtype@ divisor    = vsx4_divisor_@sfx@(vscalar);
    const int vstep            = npyv_nlanes_@sfx@;
#if @id@ >= 1 /* remainder and divmod */
    const npyv_@sfx@ vzero = npyv_zero_@sfx@();
    npyv_b@len@ b_gt_zero  = npyv_cmpgt_@sfx@(vscalar, vzero);
#endif
#if @id@ == 2 /* divmod */
    npyv_b@len@ warn          = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
    const npyv_@sfx@ vmin     = npyv_setall_@sfx@(NPY_MIN_INT@len@);
    const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
    npyv_b@len@ bneg_one      = npyv_cmpeq_@sfx@(vscalar, vneg_one);
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];

    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
         dst2 += vstep) {
#else /* fmod and remainder */
    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
#endif
        npyv_@sfx@ a = npyv_load_@sfx@(src1);
#if @id@ <= 1 /* fmod and remainder */
        npyv_@sfx@ rem = vsx4_mod_scalar_@sfx@(a, divisor);
#else /* divmod */
        npyv_@sfx@ quo = vsx4_div_scalar_@sfx@(a, divisor);
        npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(vscalar, quo));
        // (a == NPY_MIN_INT@len@ && b == -1)
        npyv_b@len@ amin     = npyv_cmpeq_@sfx@(a, vmin);
        npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
        warn = npyv_or_@sfx@(overflow, warn);
#endif
#if @id@ >= 1 /* remainder and divmod */
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b@len@ a_gt_zero  = npyv_cmpgt_@sfx@(a, vzero);
        npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero);
        npyv_b@len@ rem_zero   = npyv_cmpeq_@sfx@(rem, vzero);
        npyv_b@len@ or         = npyv_or_@sfx@(ab_eq_cond, rem_zero);
        npyv_@sfx@ to_add      = npyv_select_@sfx@(or, vzero, vscalar);
        rem = npyv_add_@sfx@(rem, to_add);
#endif
#if @id@ == 2 /* divmod */
        npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one);
        quo = npyv_add_@sfx@(quo, to_sub);
        // Overflow: set quo to minimum and rem to 0
        quo = npyv_select_@sfx@(overflow, vmin, quo);
        rem = npyv_select_@sfx@(overflow, vzero, rem);
        npyv_store_@sfx@(dst1, quo);
        npyv_store_@sfx@(dst2, rem);
#else /* fmod and remainder */
        npyv_store_@sfx@(dst1, rem);
#endif
    }

#if @id@ == 2 /* divmod */
    if (!vec_all_eq(warn, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
        if (NPY_UNLIKELY(a == NPY_MIN_INT@len@ && scalar == -1)) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT@len@;
            *dst2 = 0;
        }
        else {
            *dst1 = a / scalar;
            *dst2 = a % scalar;
            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
                *dst1 -= 1;
                *dst2 += scalar;
            }
        }
    }
#else /* fmod and remainder */
    for (; len > 0; --len, ++src1, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        *dst1 = a % scalar;
#if @id@ == 1 /* remainder */
        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
            *dst1 += scalar;
        }
#endif
    }
#endif
    npyv_cleanup();
}
/**end repeat1**/
/**end repeat**/
#endif // NPY_SIMD && defined(NPY_HAVE_VSX4)

/*****************************************************************************
 ** Defining ufunc inner functions
 *****************************************************************************/

/**begin repeat
 * Signed and Unsigned types
 *  #type  = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
 *           npy_byte,  npy_short,  npy_int,  npy_long,  npy_longlong#
 *  #TYPE  = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
 *           BYTE,  SHORT,  INT,  LONG,  LONGLONG#
 *  #STYPE = BYTE, SHORT, INT, LONG, LONGLONG,
 *           BYTE, SHORT, INT, LONG, LONGLONG#
 *  #signed = 0, 0, 0, 0, 0, 1, 1, 1, 1, 1#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_BITSOF_@STYPE@ == @len@
    #if @signed@
        #define TO_SIMD_SFX(X) X##_s@len@
    #else
        #define TO_SIMD_SFX(X) X##_u@len@
    #endif
/**end repeat1**/
#endif

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_fmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    // both arguments are arrays of the same size
    if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
        return;
    }
    // for contiguous blocks of memory where the divisor is a nonzero scalar
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
             (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
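    // scalar fallback for strided or otherwise non-blockable layouts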
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
#if @signed@
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
#else
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
#endif
            FLAG_IF_DIVIDEBYZERO(in2);
            *((@type@ *)op1) = 0;
        }
        else {
            *((@type@ *)op1) = in1 % in2;
        }
    }
}

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_remainder)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    // both arguments are arrays of the same size
    if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
        return;
    }
    // for contiguous blocks of memory where the divisor is a nonzero scalar
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
             (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
#if @signed@
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
#else
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
#endif
            FLAG_IF_DIVIDEBYZERO(in2);
            *((@type@ *)op1) = 0;
        }
        else {
#if @signed@
            /* handle mixed case the way Python does */
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = rem;
            }
            else {
                *((@type@ *)op1) = rem + in2;
            }
#else
            *((@type@ *)op1) = in1 % in2;
#endif
        }
    }
}

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divmod)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
    // both arguments are arrays of the same size
    if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
        return;
    }
    // for contiguous blocks of memory where the divisor is a nonzero scalar
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
             (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
        return;
    }
#endif
#if @signed@
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /* see FIXME note for divide above */
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
            if (in2 == 0) {
                npy_set_floatstatus_divbyzero();
                *((@type@ *)op1) = 0;
                *((@type@ *)op2) = 0;
            }
            else {
                npy_set_floatstatus_overflow();
                *((@type@ *)op1) = NPY_MIN_@TYPE@;
                *((@type@ *)op2) = 0;
            }
        }
        else {
            /* handle mixed case the way Python does */
            const @type@ quo = in1 / in2;
            const @type@ rem = in1 % in2;
            if ((in1 > 0) == (in2 > 0) || rem == 0) {
                *((@type@ *)op1) = quo;
                *((@type@ *)op2) = rem;
            }
            else {
                *((@type@ *)op1) = quo - 1;
                *((@type@ *)op2) = rem + in2;
            }
        }
    }
#else
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;
        }
        else {
            *((@type@ *)op1) = in1 / in2;
            *((@type@ *)op2) = in1 % in2;
        }
    }
#endif
}
/**end repeat**/