| field | value | date |
|---|---|---|
| author | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-04-13 21:08:48 +0530 |
| committer | Sayed Adel <seiko@imavr.com> | 2021-05-20 23:19:50 +0200 |
| commit | 0b8838ef0c2e4c5d9e66163d260dc30902cc6170 (patch) | |
| tree | 14d43fce4ad05b2bd6bc8ab89c33bfc569ebaf35 /numpy | |
| parent | 7f9d342324a730185cdf215c66d73530033436ab (diff) | |
| download | numpy-0b8838ef0c2e4c5d9e66163d260dc30902cc6170.tar.gz | |
SIMD: Added floor divide logic for signed
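Python's floor division (`//`) rounds toward negative infinity, while C's integer `/` truncates toward zero; the two disagree exactly when the operands have opposite signs and the division is inexact. Below is a minimal scalar sketch of the semantics this commit vectorizes (an illustration, not code from the commit; `floor_div_s32` is a hypothetical name). It mirrors the kernel's convention, visible in the diff, of raising the divide-by-zero flag and returning 0 both for division by zero and for the overflowing `INT_MIN / -1`:

```c
#include <limits.h>
#include <stdio.h>

/* Hypothetical scalar helper mirroring the tail loop in the diff below. */
static int floor_div_s32(int a, int b, int *flag)
{
    /* Division by zero and the overflowing INT_MIN / -1 both raise the
     * divide-by-zero flag and yield 0, matching the kernel's guard. */
    if (b == 0 || (a == INT_MIN && b == -1)) {
        *flag = 1;   /* stands in for npy_set_floatstatus_divbyzero() */
        return 0;
    }
    int q = a / b;   /* C division truncates toward zero */
    /* Negative quotients need to be rounded down -- the same correction
     * the scalar fallback in the diff applies. */
    if (((a > 0) != (b > 0)) && (q * b != a)) {
        q -= 1;
    }
    return q;
}

int main(void)
{
    int flag = 0;
    printf("%d\n", floor_div_s32(-7, 2, &flag));   /* -4 (truncation alone gives -3) */
    printf("%d\n", floor_div_s32(7, -2, &flag));   /* -4 */
    printf("%d\n", floor_div_s32(-8, 2, &flag));   /* -4, exact: no correction */
    printf("%d %d\n", floor_div_s32(INT_MIN, -1, &flag), flag); /* 0 1 */
    return 0;
}
```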
Diffstat (limited to 'numpy')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 107 |

1 file changed, 81 insertions, 26 deletions
```diff
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index c071edb3b..55066589f 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -23,9 +23,53 @@
  ********************************************************************************/
 #if NPY_SIMD
 /**begin repeat
- * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64, 8, 16, 32, 64#
  * #signed = 0*4, 1*4#
  */
+#if @signed@
+static NPY_INLINE npyv_@sfx@
+simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
+{
+    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor;
+    npyv_b@len@ greater_min, noverflow;
+
+    nsign_d   = npyv_setall_@sfx@(scalar < 0);
+    a         = npyv_load_@sfx@(src);
+    nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+    nsign_a   = npyv_and_@sfx@((npyv_@sfx@)nsign_a, npyv_setall_@sfx@(1));
+    diff_sign = npyv_sub_@sfx@((npyv_@sfx@)nsign_a, nsign_d);
+    to_ninf   = npyv_xor_@sfx@((npyv_@sfx@)nsign_a, nsign_d);
+    trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+
+    if (NPY_UNLIKELY(-1 == scalar)) {
+        greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+        noverflow   = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        noverflow   = npyv_and_b@len@(noverflow, greater_min);
+        if (npyv_tobits_b@len@(noverflow) != (((npy_uint64)1) << npyv_nlanes_@sfx@)-1) {
+            npy_set_floatstatus_divbyzero();
+        }
+        floor = npyv_ifsub_@sfx@(greater_min, trunc, to_ninf, npyv_zero_@sfx@());
+    }
+    else {
+        floor = npyv_sub_@sfx@(trunc, to_ninf);
+    }
+
+    return floor;
+}
+#else
+static NPY_INLINE npyv_@sfx@
+simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
+{
+    npyv_@sfx@ a, c;
+
+    a = npyv_load_@sfx@(src);
+    c = npyv_divc_@sfx@(a, divisor);
+
+    return c;
+}
+#endif
+
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
@@ -36,20 +80,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_@sfx@ a = npyv_load_@sfx@(src);
-        npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
+        npyv_@sfx@ c = simd_floor_divide_@sfx@(src, divisor, scalar);
         npyv_store_@sfx@(dst, c);
     }
 
     for (; len > 0; --len, ++src, ++dst) {
         const npyv_lanetype_@sfx@ a = *src;
-        *dst = a / scalar;
+        if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
+            npy_set_floatstatus_divbyzero();
+            *dst = 0;
+        } else {
+            *dst = a / scalar;
 #if @signed@
-        /* Negative quotients needs to be rounded down */
-        if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
-            *dst = *dst - 1;
-        }
+            /* Negative quotients needs to be rounded down */
+            if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
+                *dst = *dst - 1;
+            }
 #endif
+        }
     }
     npyv_cleanup();
@@ -68,19 +116,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  */
 /**begin repeat1
- * #s = , u#
- * #S = , U#
- * #slen = s, u#
  * #signed = 1, 0#
  */
 #undef TO_SIMD_SFX
+#undef SIMD_TYPE
+#undef SIMD_DIVIDE
 #if 0
 /**begin repeat2
  * #len = 8, 16, 32, 64#
  */
+#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@
+    #define TO_SIMD_SFX(X) X##_s@len@
+    #define SIMD_TYPE npy_@type@
+    #define SIMD_DIVIDE @TYPE@_divide
 #elif NPY_BITSOF_@TYPE@ == @len@
-    #define TO_SIMD_SFX(X) X##_@slen@@len@
+    #define TO_SIMD_SFX(X) X##_u@len@
+    #define SIMD_TYPE npy_u@type@
+    #define SIMD_DIVIDE U@TYPE@_divide
 /**end repeat2**/
 #endif
 /*
@@ -93,42 +146,44 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  */
 #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
+    #undef SIMD_TYPE
+    #undef SIMD_DIVIDE
 #endif
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@S@@TYPE@_divide)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(npy_@s@@type@) {
-            const npy_@s@@type@ d = *(npy_@s@@type@ *)ip2;
-            if (NPY_UNLIKELY(d == 0)) {
+        BINARY_REDUCE_LOOP(SIMD_TYPE) {
+            const SIMD_TYPE d = *(SIMD_TYPE *)ip2;
+            if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) {
                 npy_set_floatstatus_divbyzero();
                 io1 = 0;
             } else {
                 io1 /= d;
             }
         }
-        *((npy_@s@@type@ *)iop1) = io1;
+        *((SIMD_TYPE *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
     // for contiguous block of memory, divisor is a scalar and not 0
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@s@@type@), NPY_SIMD_WIDTH) &&
-             (*(npy_@s@@type@ *)args[1]) != 0) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) &&
+             (*(SIMD_TYPE *)args[1]) != 0) {
        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
     }
 #endif
     else {
         BINARY_LOOP {
-            const npy_@s@@type@ in1 = *(npy_@s@@type@ *)ip1;
-            const npy_@s@@type@ in2 = *(npy_@s@@type@ *)ip2;
-            if (NPY_UNLIKELY(in2 == 0)) {
+            const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1;
+            const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2;
+            if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) {
                 npy_set_floatstatus_divbyzero();
-                *((npy_@s@@type@ *)op1) = 0;
+                *((SIMD_TYPE *)op1) = 0;
             } else{
-                *((npy_@s@@type@ *)op1) = in1 / in2;
+                *((SIMD_TYPE *)op1) = in1 / in2;
 #if @signed@
                 /* Negative quotients needs to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) {
-                    *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1;
+                if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) {
+                    *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1;
                 }
 #endif
             }
```
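The vectorized path above cannot branch per lane, so `simd_floor_divide_@sfx@` folds the floor correction into the division itself: it biases the dividend by `diff_sign` before the truncating `npyv_divc_@sfx@`, then subtracts `to_ninf` wherever the quotient must round toward negative infinity; the `-1 == scalar` branch exists only because `INT_MIN / -1` would overflow, so those lanes are zeroed via `npyv_ifsub_@sfx@` and the divide-by-zero flag is raised. A scalar model of that branchless identity (a sketch, assuming truncating division as in C; `floor_div_model` is a hypothetical name), checked against the straightforward fixup used in the scalar loops:

```c
#include <assert.h>
#include <stdint.h>

/* Branchless floor division, modeling the nsign/diff_sign/to_ninf steps
 * of simd_floor_divide_@sfx@. Assumes b != 0 and not the overflowing
 * INT32_MIN / -1 case, which the kernel guards separately. */
static int32_t floor_div_model(int32_t a, int32_t b)
{
    int32_t nsign_d   = (b < 0);              /* 1 iff divisor is negative */
    int32_t nsign_a   = (a < nsign_d);        /* 1 iff a < 0 (b > 0) or a <= 0 (b < 0) */
    int32_t diff_sign = nsign_a - nsign_d;    /* bias applied to the dividend */
    int32_t to_ninf   = nsign_a ^ nsign_d;    /* 1 iff the quotient must be floored */
    int32_t trunc     = (a + diff_sign) / b;  /* truncating division, like npyv_divc */
    return trunc - to_ninf;
}

int main(void)
{
    /* Spot-check against the correction form used in the scalar loops. */
    for (int32_t a = -100; a <= 100; a++) {
        for (int32_t b = -9; b <= 9; b++) {
            if (b == 0) continue;
            int32_t q = a / b;
            if (((a > 0) != (b > 0)) && (q * b != a)) {
                q -= 1;
            }
            assert(floor_div_model(a, b) == q);
        }
    }
    return 0;
}
```

Note that the `a + diff_sign` bias cannot overflow: it is +1 only when `a` is negative and -1 only when `a` is positive (or zero with a negative divisor).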
