| Field | Value | Date |
|---|---|---|
| author | Ganesh Kathiresan <ganesh3597@gmail.com> | 2021-04-21 22:01:17 +0530 |
| committer | Sayed Adel <seiko@imavr.com> | 2021-05-20 23:19:50 +0200 |
| commit | b6b32674d634b6dfe9d92212e8a6ced0f1e14319 (patch) | |
| tree | e64e44993d75cd31d47f0f0669b9b3b08ea11271 /numpy | |
| parent | b1c3c98bfa13699dda51642723e3ce849d5950eb (diff) | |
| download | numpy-b6b32674d634b6dfe9d92212e8a6ced0f1e14319.tar.gz | |
SIMD: Refined signed and unsigned floor divide
Diffstat (limited to 'numpy')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 108 |

1 file changed, 68 insertions, 40 deletions
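The patch folds the old `simd_floor_divide_@sfx@` helper into per-signedness copies of `simd_divide_by_scalar_contig_@sfx@`. The rounding rule behind both the vector path and the scalar tail is that C's `/` truncates toward zero, so an inexact quotient whose operands have opposite signs must be stepped down by one to floor it. A minimal stand-alone C sketch of that rule (the helper name `floor_div_i32` and the `main` driver are illustrative, not part of the patch):

```c
#include <stdio.h>

/* Illustrative helper (not part of the patch): floor division for signed
 * 32-bit operands. C's `/` truncates toward zero, so an inexact quotient
 * whose operands have opposite signs must be stepped down by one, the same
 * correction the patch's scalar fallback applies. Assumes b != 0 and
 * not (a == INT_MIN && b == -1); the patch routes those cases to the
 * divide-by-zero status and stores 0 instead. */
static int floor_div_i32(int a, int b)
{
    int q = a / b;
    if (((a > 0) != (b > 0)) && (q * b != a)) {
        q -= 1;
    }
    return q;
}

int main(void)
{
    printf("%d\n", floor_div_i32(-7, 2));  /* -4, not the truncated -3 */
    printf("%d\n", floor_div_i32(7, 2));   /*  3 */
    printf("%d\n", floor_div_i32(-6, 2));  /* -3, exact, so no correction */
    return 0;
}
```

The vector path in the diff implements the same rule branch-free: it biases the dividend by `diff_sign` before `npyv_divc_@sfx@` and subtracts `to_ninf` from the truncated quotient.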
```diff
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 30d7a2a99..5e54a45de 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -41,48 +41,82 @@
  * #signed = 0*4, 1*4#
  */
 #if @signed@
-static NPY_INLINE npyv_@sfx@
-simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
-    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor;
+    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor, neg, vzero;
     npyv_b@len@ greater_min, noverflow;
+    npy_bool raise;
+    npy_uint64 tobits;
 
-    nsign_d   = npyv_setall_@sfx@(scalar < 0);
-    a         = npyv_load_@sfx@(src);
-    nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
-    nsign_a   = npyv_and_@sfx@((npyv_@sfx@)nsign_a, npyv_setall_@sfx@(1));
-    diff_sign = npyv_sub_@sfx@((npyv_@sfx@)nsign_a, nsign_d);
-    to_ninf   = npyv_xor_@sfx@((npyv_@sfx@)nsign_a, nsign_d);
-    trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep            = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
     if (NPY_UNLIKELY(-1 == scalar)) {
-        greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
-        noverflow   = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
-        noverflow   = npyv_and_b@len@(noverflow, greater_min);
-        if (npyv_tobits_b@len@(noverflow) != (((npy_uint64)1) << npyv_nlanes_@sfx@)-1) {
+        noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        vzero     = npyv_zero_@sfx@();
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            a           = npyv_load_@sfx@(src);
+            greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow   = npyv_and_b@len@(noverflow, greater_min);
+            neg         = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
+
+            npyv_store_@sfx@(dst, neg);
+        }
+        tobits = npyv_tobits_b@len@(noverflow);
+    #if npyv_nlanes_@sfx@ == 64
+        raise  = (~tobits) != 0;
+    #else
+        raise  = tobits != (1ULL << vstep)-1;
+    #endif
+
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_@sfx@ a = *src;
+            if (a == NPY_MIN_INT@len@) {
+                raise = NPY_TRUE;
+                *dst  = 0;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise) {
             npy_set_floatstatus_divbyzero();
         }
-        floor = npyv_ifsub_@sfx@(greater_min, trunc, to_ninf, npyv_zero_@sfx@());
-    }
-    else {
-        floor = npyv_sub_@sfx@(trunc, to_ninf);
-    }
+    } else {
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            nsign_d   = npyv_setall_@sfx@(scalar < 0);
+            a         = npyv_load_@sfx@(src);
+            nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a   = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            to_ninf   = npyv_xor_@sfx@(nsign_a, nsign_d);
+            trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            floor     = npyv_sub_@sfx@(trunc, to_ninf);
 
-    return floor;
-}
-#else
-static NPY_INLINE npyv_@sfx@
-simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
-{
-    npyv_@sfx@ a, c;
+            npyv_store_@sfx@(dst, floor);
+        }
 
-    a = npyv_load_@sfx@(src);
-    c = npyv_divc_@sfx@(a, divisor);
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_@sfx@ a = *src;
+            if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
+                npy_set_floatstatus_divbyzero();
+                *dst = 0;
+            } else {
+                *dst = a / scalar;
+                /* Negative quotients needs to be rounded down */
+                if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
+                    *dst = *dst - 1;
+                }
+            }
+        }
+    }
 
-    return c;
+    npyv_cleanup();
 }
-#endif
-
+#else
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 {
@@ -93,7 +127,8 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
     for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-        npyv_@sfx@ c = simd_floor_divide_@sfx@(src, divisor, scalar);
+        npyv_@sfx@ a = npyv_load_@sfx@(src);
+        npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
         npyv_store_@sfx@(dst, c);
     }
 
@@ -104,17 +139,12 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
             *dst = 0;
         } else {
             *dst = a / scalar;
-#if @signed@
-            /* Negative quotients needs to be rounded down */
-            if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
-                *dst = *dst - 1;
-            }
-#endif
         }
     }
 
     npyv_cleanup();
 }
+#endif
 /**end repeat**/
 #endif
@@ -159,8 +189,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
  */
 #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
-    #undef SIMD_TYPE
-    #undef SIMD_DIVIDE
 #endif
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
```
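Dividing by -1 gets its own branch in the signed kernel because negating the minimum integer overflows. The vector loop negates each lane with `npyv_ifsub_@sfx@`, accumulates an overflow mask in `noverflow`, and the scalar tail stores 0 for `NPY_MIN_INT@len@` inputs; if any overflow was seen, the divide-by-zero floating-point status is raised. Below is a scalar model of that behavior, with a stub standing in for NumPy's `npy_set_floatstatus_divbyzero()` and an illustrative helper name `divide_by_minus_one_i32`; it is a sketch of the semantics, not the SIMD code itself.

```c
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stub standing in for NumPy's npy_set_floatstatus_divbyzero() so the
 * sketch compiles on its own. */
static void npy_set_floatstatus_divbyzero(void) { puts("divbyzero raised"); }

/* Illustrative scalar model of the patch's divisor == -1 path: negate
 * every element, store 0 where the negation would overflow (INT_MIN),
 * and raise the status once at the end if that happened. */
static void divide_by_minus_one_i32(const int *src, int *dst, size_t len)
{
    bool raise = false;
    for (size_t i = 0; i < len; ++i) {
        if (src[i] == INT_MIN) {   /* -INT_MIN is not representable */
            raise  = true;
            dst[i] = 0;
        } else {
            dst[i] = -src[i];
        }
    }
    if (raise) {
        npy_set_floatstatus_divbyzero();
    }
}

int main(void)
{
    int src[4] = {5, -3, INT_MIN, 0}, dst[4];
    divide_by_minus_one_i32(src, dst, 4);
    for (int i = 0; i < 4; ++i) {
        printf("%d ", dst[i]);     /* prints: -5 3 0 0 */
    }
    putchar('\n');
    return 0;
}
```

The `npyv_nlanes_@sfx@ == 64` special case in the diff checks `(~tobits) != 0` instead of comparing against `(1ULL << vstep) - 1`, presumably because `1ULL << 64` is undefined in C.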
