/*@targets
 ** $maxopt baseline
 ** sse2 sse41 avx2 avx512f avx512_skx
 ** vsx2 vsx4
 ** neon
 ** vx
 **/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

//###############################################################################
//## Division
//###############################################################################
/********************************************************************************
 ** Defining the SIMD kernels
 *
 * Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
 * "Division by invariant integers using multiplication" (see Figure 6.1,
 * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
 * For details on TRUNC division, see simd/intdiv.h.
 ***********************************************************************************
 ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
 ***********************************************************************************
 * For q = FLOOR(a/d), all sword:
 *     sword -dsign = SRL(d, N - 1);
 *     uword -nsign = (n < -dsign);
 *     uword -qsign = EOR(-nsign, -dsign);
 *     q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
 ********************************************************************************/

#if NPY_SIMD
/**begin repeat
 * Signed types
 * #sfx = s8, s16, s32, s64#
 * #len = 8, 16, 32, 64#
 */
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
    const int vstep            = npyv_nlanes_@sfx@;
    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);

    if (scalar == -1) {
        npyv_b@len@ noverflow  = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
        const npyv_@sfx@ vzero = npyv_zero_@sfx@();
        const npyv_@sfx@ vmin  = npyv_setall_@sfx@(NPY_MIN_INT@len@);
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_@sfx@ a       = npyv_load_@sfx@(src);
            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
            noverflow          = npyv_and_b@len@(noverflow, gt_min);
            npyv_@sfx@ neg     = npyv_ifsub_@sfx@(gt_min, vzero, a, vmin);
            npyv_store_@sfx@(dst, neg);
        }

        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
        for (; len > 0; --len, ++src, ++dst) {
            npyv_lanetype_@sfx@ a = *src;
            if (a == NPY_MIN_INT@len@) {
                raise_err = 1;
                *dst = NPY_MIN_INT@len@;
            }
            else {
                *dst = -a;
            }
        }
        if (raise_err) {
            npy_set_floatstatus_overflow();
        }
    }
    else {
        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
            npyv_@sfx@ nsign_d   = npyv_setall_@sfx@(scalar < 0);
            npyv_@sfx@ a         = npyv_load_@sfx@(src);
            npyv_@sfx@ nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
            nsign_a              = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
            npyv_@sfx@ to_ninf   = npyv_xor_@sfx@(nsign_a, nsign_d);
            npyv_@sfx@ trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
            npyv_@sfx@ floor     = npyv_sub_@sfx@(trunc, to_ninf);
            npyv_store_@sfx@(dst, floor);
        }

        for (; len > 0; --len, ++src, ++dst) {
            const npyv_lanetype_@sfx@ a = *src;
            npyv_lanetype_@sfx@ r = a / scalar;
            // Negative quotients need to be rounded down
            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
                r--;
            }
            *dst = r;
        }
    }
    npyv_cleanup();
}
/**end repeat**/
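/*
 * Illustrative note (not part of the original kernel): a worked example of the
 * Figure 6.1 rounding fix used above, assuming a = -7 and scalar = 3 on any
 * signed lane type:
 *
 *   nsign_d   = (scalar < 0)             = 0
 *   nsign_a   = (a < nsign_d) & 1        = 1
 *   diff_sign = nsign_a - nsign_d        = 1
 *   to_ninf   = nsign_a ^ nsign_d        = 1
 *   trunc     = (a + diff_sign) / scalar = (-7 + 1) / 3 = -2
 *   floor     = trunc - to_ninf          = -3
 *
 * which matches FLOOR(-7/3) = -3, while plain TRUNC division would give -2.
 * The scalar tail loop reaches the same result by computing r = a / scalar and
 * decrementing r when the operands have opposite signs and the division is
 * inexact.
 */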
/**begin repeat
 * Unsigned types
 * #sfx = u8, u16, u32, u64#
 * #len = 8, 16, 32, 64#
 */
static inline void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
    const int vstep            = npyv_nlanes_@sfx@;
    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);

    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
        npyv_@sfx@ a = npyv_load_@sfx@(src);
        npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
        npyv_store_@sfx@(dst, c);
    }

    for (; len > 0; --len, ++src, ++dst) {
        const npyv_lanetype_@sfx@ a = *src;
        *dst = a / scalar;
    }
    npyv_cleanup();
}
/**end repeat**/

#if defined(NPY_HAVE_VSX4)
/**begin repeat
 * #t = u, s#
 * #signed = 0, 1#
 */
/*
 * Computes division of 2 8-bit signed/unsigned integer vectors
 *
 * As Power10 only supports integer vector division for data of 32 bits or
 * greater, we have to convert npyv_u8 into 4x npyv_u32, execute the integer
 * vector division instruction, and then convert the result back to npyv_u8.
 */
NPY_FINLINE npyv_@t@8
vsx4_div_@t@8(npyv_@t@8 a, npyv_@t@8 b)
{
#if @signed@
    npyv_s16x2 ta, tb;
    npyv_s32x2 ahi, alo, bhi, blo;
    ta.val[0]  = vec_unpackh(a);
    ta.val[1]  = vec_unpackl(a);
    tb.val[0]  = vec_unpackh(b);
    tb.val[1]  = vec_unpackl(b);
    ahi.val[0] = vec_unpackh(ta.val[0]);
    ahi.val[1] = vec_unpackl(ta.val[0]);
    alo.val[0] = vec_unpackh(ta.val[1]);
    alo.val[1] = vec_unpackl(ta.val[1]);
    bhi.val[0] = vec_unpackh(tb.val[0]);
    bhi.val[1] = vec_unpackl(tb.val[0]);
    blo.val[0] = vec_unpackh(tb.val[1]);
    blo.val[1] = vec_unpackl(tb.val[1]);
#else
    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
#endif
    npyv_@t@32 v1 = vec_div(ahi.val[0], bhi.val[0]);
    npyv_@t@32 v2 = vec_div(ahi.val[1], bhi.val[1]);
    npyv_@t@32 v3 = vec_div(alo.val[0], blo.val[0]);
    npyv_@t@32 v4 = vec_div(alo.val[1], blo.val[1]);
    npyv_@t@16 hi = vec_pack(v1, v2);
    npyv_@t@16 lo = vec_pack(v3, v4);
    return vec_pack(hi, lo);
}

NPY_FINLINE npyv_@t@16
vsx4_div_@t@16(npyv_@t@16 a, npyv_@t@16 b)
{
#if @signed@
    npyv_s32x2 a_expand;
    npyv_s32x2 b_expand;
    a_expand.val[0] = vec_unpackh(a);
    a_expand.val[1] = vec_unpackl(a);
    b_expand.val[0] = vec_unpackh(b);
    b_expand.val[1] = vec_unpackl(b);
#else
    npyv_u32x2 a_expand = npyv_expand_@t@32_@t@16(a);
    npyv_u32x2 b_expand = npyv_expand_@t@32_@t@16(b);
#endif
    npyv_@t@32 v1 = vec_div(a_expand.val[0], b_expand.val[0]);
    npyv_@t@32 v2 = vec_div(a_expand.val[1], b_expand.val[1]);
    return vec_pack(v1, v2);
}

#define vsx4_div_@t@32 vec_div
#define vsx4_div_@t@64 vec_div
/**end repeat**/
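/*
 * Illustrative note (not part of the original helpers): on VSX the vector
 * registers are 128 bits wide, so one npyv_u8 holds 16 lanes. A sketch of the
 * widening performed by vsx4_div_u8, assuming that register width:
 *
 *   a (16 x u8) -> a_expand (2 vectors of 8 x u16) -> ahi, alo (4 vectors of 4 x u32)
 *
 * Each of the four u32 vectors is divided with vec_div, then vec_pack narrows
 * the four quotient vectors back to two u16 vectors and finally to a single
 * u8 vector in the original element order. The signed (s8/s16) paths use
 * vec_unpackh/vec_unpackl instead so that the widening is sign-extending.
 */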
/**begin repeat
 * Unsigned types
 * #sfx = u8, u16, u32, u64#
 * #len = 8, 16, 32, 64#
 */
static inline void
vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2];
    const npyv_@sfx@ vzero    = npyv_zero_@sfx@();
    const int vstep           = npyv_nlanes_@sfx@;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_@sfx@ a = npyv_load_@sfx@(src1);
        npyv_@sfx@ b = npyv_load_@sfx@(src2);
        npyv_@sfx@ c = vsx4_div_@sfx@(a, b);
        npyv_store_@sfx@(dst1, c);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
            npy_set_floatstatus_divbyzero();
        }
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        }
        else {
            *dst1 = a / b;
        }
    }
    npyv_cleanup();
}
/**end repeat**/

/**begin repeat
 * Signed types
 * #sfx = s8, s16, s32, s64#
 * #len = 8, 16, 32, 64#
 */
static inline void
vsx4_simd_divide_contig_@sfx@(char **args, npy_intp len)
{
    npyv_lanetype_@sfx@ *src1 = (npyv_lanetype_@sfx@ *) args[0];
    npyv_lanetype_@sfx@ *src2 = (npyv_lanetype_@sfx@ *) args[1];
    npyv_lanetype_@sfx@ *dst1 = (npyv_lanetype_@sfx@ *) args[2];
    const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
    const npyv_@sfx@ vzero    = npyv_zero_@sfx@();
    const npyv_@sfx@ vmin     = npyv_setall_@sfx@(NPY_MIN_INT@len@);
    npyv_b@len@ warn_zero     = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
    npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
    const int vstep           = npyv_nlanes_@sfx@;

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
         dst1 += vstep) {
        npyv_@sfx@ a   = npyv_load_@sfx@(src1);
        npyv_@sfx@ b   = npyv_load_@sfx@(src2);
        npyv_@sfx@ quo = vsx4_div_@sfx@(a, b);
        npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo));
        // (b == 0 || (a == NPY_MIN_INT@len@ && b == -1))
        npyv_b@len@ bzero    = npyv_cmpeq_@sfx@(b, vzero);
        npyv_b@len@ amin     = npyv_cmpeq_@sfx@(a, vmin);
        npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one);
        npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
        warn_zero     = npyv_or_@sfx@(bzero, warn_zero);
        warn_overflow = npyv_or_@sfx@(overflow, warn_overflow);
        // handle mixed case the way Python does
        // ((a > 0) == (b > 0) || rem == 0)
        npyv_b@len@ a_gt_zero  = npyv_cmpgt_@sfx@(a, vzero);
        npyv_b@len@ b_gt_zero  = npyv_cmpgt_@sfx@(b, vzero);
        npyv_b@len@ ab_eq_cond = npyv_cmpeq_@sfx@(a_gt_zero, b_gt_zero);
        npyv_b@len@ rem_zero   = npyv_cmpeq_@sfx@(rem, vzero);
        npyv_b@len@ or         = npyv_or_@sfx@(ab_eq_cond, rem_zero);
        npyv_@sfx@ to_sub      = npyv_select_@sfx@(or, vzero, vneg_one);
        quo = npyv_add_@sfx@(quo, to_sub);
        // Divide by zero
        quo = npyv_select_@sfx@(bzero, vzero, quo);
        // Overflow
        quo = npyv_select_@sfx@(overflow, vmin, quo);
        npyv_store_@sfx@(dst1, quo);
    }

    if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
    if (!vec_all_eq(warn_overflow, vzero)) {
        npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
        if (NPY_UNLIKELY(b == 0)) {
            npy_set_floatstatus_divbyzero();
            *dst1 = 0;
        }
        else if (NPY_UNLIKELY((a == NPY_MIN_INT@len@) && (b == -1))) {
            npy_set_floatstatus_overflow();
            *dst1 = NPY_MIN_INT@len@;
        }
        else {
            *dst1 = a / b;
            if (((a > 0) != (b > 0)) && ((*dst1 * b) != a)) {
                *dst1 -= 1;
            }
        }
    }
    npyv_cleanup();
}
/**end repeat**/
#endif // NPY_HAVE_VSX4
#endif // NPY_SIMD

/********************************************************************************
 ** Defining ufunc inner functions
 ********************************************************************************/

/**begin repeat
 * Signed types
 * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
 * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_BITSOF_@TYPE@ == @len@
    #define TO_SIMD_SFX(X) X##_s@len@
/**end repeat1**/
#endif

#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif
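/*
 * Illustrative note (not part of the original dispatch machinery): the repeat1
 * block above maps each C type to the NPYV suffix of matching width. For
 * example, on a platform where NPY_BITSOF_INT == 32, the INT instantiation
 * defines TO_SIMD_SFX(X) as X##_s32, so
 * TO_SIMD_SFX(simd_divide_by_scalar_contig) expands to
 * simd_divide_by_scalar_contig_s32. The #undef for 64-bit lanes on VSX/NEON
 * builds without VSX4 drops the macro entirely, so those builds fall back to
 * the scalar floor_div_@TYPE@ path below.
 */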
NPY_FINLINE @type@
floor_div_@TYPE@(const @type@ n, const @type@ d)
{
    /*
     * FIXME: On x86 at least, dividing the smallest representable integer
     * by -1 causes a SIGFPE (division overflow). We treat this case here
     * (to avoid a SIGFPE crash at python level), but a good solution would
     * be to treat integer division problems separately from FPU exceptions
     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
     */
    if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
        if (d == 0) {
            npy_set_floatstatus_divbyzero();
            return 0;
        }
        else {
            npy_set_floatstatus_overflow();
            return NPY_MIN_@TYPE@;
        }
    }
    @type@ r = n / d;
    // Negative quotients need to be rounded down
    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
        r--;
    }
    return r;
}

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
        }
        *((@type@ *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
             (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        BINARY_LOOP {
            *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
        }
    }
}

NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions,
 npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *ip1 = args[0];
    char *indx = args[1];
    char *value = args[2];
    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
    npy_intp n = dimensions[0];
    npy_intp i;
    @type@ *indexed;
    for (i = 0; i < n; i++, indx += isindex, value += isb) {
        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
        *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value);
    }
    return 0;
}
/**end repeat**/
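/*
 * Illustrative note (not part of the original loops): a rough sketch of how
 * the signed *_divide dispatchers generated above pick a path, assuming a
 * SIMD-enabled build where TO_SIMD_SFX is defined:
 *
 *   - reduction (IS_BINARY_REDUCE)                       -> BINARY_REDUCE_LOOP + floor_div_*
 *   - two same-sized contiguous arrays on Power10/VSX4   -> vsx4_simd_divide_contig_*
 *   - contiguous data with a non-zero scalar divisor     -> simd_divide_by_scalar_contig_*
 *   - anything else (strided data, zero scalar divisor)  -> BINARY_LOOP + floor_div_*
 *
 * The IS_BLOCKABLE_* checks are what decide whether the memory layout is
 * suitable for the vector paths.
 */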
/**begin repeat
 * Unsigned types
 * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
 * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
 * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
 */
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
 * #len = 8, 16, 32, 64#
 */
#elif NPY_BITSOF_@STYPE@ == @len@
    #define TO_SIMD_SFX(X) X##_u@len@
/**end repeat1**/
#endif
/*
 * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to
 * scalar division because emulating multiply-high on these architectures is
 * expensive compared to the native scalar dividers.
 * Therefore it's better to disable NPYV in this special case to avoid any
 * unnecessary shuffles. Power10 (VSX4) is an exception here since it has
 * native support for integer vector division.
 */
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
    #undef TO_SIMD_SFX
#endif

NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
    if (IS_BINARY_REDUCE) {
        BINARY_REDUCE_LOOP(@type@) {
            const @type@ d = *(@type@ *)ip2;
            if (NPY_UNLIKELY(d == 0)) {
                npy_set_floatstatus_divbyzero();
                io1 = 0;
            }
            else {
                io1 /= d;
            }
        }
        *((@type@ *)iop1) = io1;
    }
#if NPY_SIMD && defined(TO_SIMD_SFX)
#if defined(NPY_HAVE_VSX4)
    // both arguments are arrays of the same size
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
    }
#endif
    // for contiguous block of memory, divisor is a scalar and not 0
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
             (*(@type@ *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
    }
#endif
    else {
        BINARY_LOOP {
            const @type@ in1 = *(@type@ *)ip1;
            const @type@ in2 = *(@type@ *)ip2;
            if (NPY_UNLIKELY(in2 == 0)) {
                npy_set_floatstatus_divbyzero();
                *((@type@ *)op1) = 0;
            }
            else {
                *((@type@ *)op1) = in1 / in2;
            }
        }
    }
}

NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions,
 npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
{
    char *ip1 = args[0];
    char *indx = args[1];
    char *value = args[2];
    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
    npy_intp n = dimensions[0];
    npy_intp i;
    @type@ *indexed;
    for (i = 0; i < n; i++, indx += isindex, value += isb) {
        indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx);
        @type@ in2 = *(@type@ *)value;
        if (NPY_UNLIKELY(in2 == 0)) {
            npy_set_floatstatus_divbyzero();
            *indexed = 0;
        }
        else {
            *indexed = *indexed / in2;
        }
    }
    return 0;
}
/**end repeat**/