| field | value |
|---|---|
| author | Matti Picus <matti.picus@gmail.com> (2021-05-21 16:25:59 +0300) |
| committer | GitHub <noreply@github.com> (2021-05-21 16:25:59 +0300) |
| commit | 7de0fa959e476900725d8a654775e0a38745de08 |
| tree | 9d610d8a640960b672ee4cb63c425afc5b1c5d8d /numpy |
| parent | 5f3c3181b5d8db0e430e5f605cc45c4392f04934 |
| parent | f74f5003090a4c7ec83dead46567bd96de6dfce8 |
| download | numpy-7de0fa959e476900725d8a654775e0a38745de08.tar.gz |
Merge pull request #18766 from ganesh-k13/enh_simd_signed_division
ENH, SIMD: Replace libdivide functions of signed integer division with universal intrinsics
Diffstat (limited to 'numpy')
| file | lines changed |
|---|---|
| numpy/core/code_generators/generate_umath.py | 2 |
| numpy/core/src/common/simd/intdiv.h | 24 |
| numpy/core/src/umath/loops.c.src | 86 |
| numpy/core/src/umath/loops.h.src | 6 |
| numpy/core/src/umath/loops_arithmetic.dispatch.c.src | 148 |
5 files changed, 161 insertions, 105 deletions
```diff
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6b6a0fe64..9e94f9ccc 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -328,7 +328,7 @@ defdict = {
           docstrings.get('numpy.core.umath.floor_divide'),
           'PyUFunc_DivisionTypeResolver',
           TD(ints, cfunc_alias='divide',
-             dispatch=[('loops_arithmetic', 'BHILQ')]),
+             dispatch=[('loops_arithmetic', 'bBhHiIlLqQ')]),
           TD(flts + cmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
```

```diff
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 1ce3b4df8..f6ea9abf2 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -368,18 +368,18 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
 {
     npy_int32 d1 = abs(d);
     npy_int32 sh, m;
-    if (d1 > 1) {
+    // Handle abs overflow
+    if ((npy_uint32)d == 0x80000000U) {
+        m  = 0x80000001;
+        sh = 30;
+    }
+    else if (d1 > 1) {
         sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
         m = (1ULL << (32 + sh)) / d1 + 1;     // multiplier
     }
     else if (d1 == 1) {
         sh = 0; m = 1;
     }
-    // fix abs overflow
-    else if (d == (1 << 31)) {
-        m = d + 1;
-        sh = 30;
-    }
     else {
         // raise arithmetic exception for d == 0
         sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE
@@ -445,18 +445,18 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
 #else
     npy_int64 d1 = llabs(d);
     npy_int64 sh, m;
-    if (d1 > 1) {
+    // Handle abs overflow
+    if ((npy_uint64)d == 0x8000000000000000ULL) {
+        m  = 0x8000000000000001LL;
+        sh = 62;
+    }
+    else if (d1 > 1) {
         sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1
         m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier
     }
     else if (d1 == 1) {
         sh = 0; m = 1;
     }
-    // fix abs overflow
-    else if (d == (1LL << 63)) {
-        m = d + 1;
-        sh = 62;
-    }
     else {
         // raise arithmetic exception for d == 0
         sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
```
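The branch reordering in `intdiv.h` matters because `abs()`/`llabs()` cannot represent the absolute value of the most negative two's-complement integer, so the old code only checked for the overflow after `abs(d)` had already been evaluated. A minimal standalone sketch of the failure mode and the new bit-pattern check (illustrative only; the `main` scaffolding and messages are not from the patch):

```c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int d = INT_MIN;
    /* In two's complement, -INT_MIN is not representable, so abs(INT_MIN)
     * is undefined behavior; it must not be the first thing evaluated.
     * The patched npyv_divisor_s32 therefore tests the bit pattern first. */
    if ((unsigned)d == 0x80000000U) {
        /* the same constants the patch precomputes for this divisor */
        printf("d is INT_MIN: multiplier = 0x80000001, shift = 30\n");
    }
    else {
        printf("abs(d) = %d\n", abs(d));  /* safe only on this branch */
    }
    return 0;
}
```

The next two hunks drop the libdivide-based scalar loops that the new SIMD path replaces: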
```diff
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 04665dc52..683bd0178 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -843,92 +843,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
 }
 
-/* Libdivide only supports 32 and 64 bit types
- * We try to pick the best possible one */
-#if NPY_BITSOF_@TYPE@ <= 32
-#define libdivide_@type@_t libdivide_s32_t
-#define libdivide_@type@_gen libdivide_s32_gen
-#define libdivide_@type@_do libdivide_s32_do
-#else
-#define libdivide_@type@_t libdivide_s64_t
-#define libdivide_@type@_gen libdivide_s64_gen
-#define libdivide_@type@_do libdivide_s64_do
-#endif
-
-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_DEFS
-
-    /* When the divisor is a constant, use libdivide for faster division */
-    if (steps[1] == 0) {
-        /* In case of empty array, just return */
-        if (n == 0) {
-            return;
-        }
-
-        const @type@ in2 = *(@type@ *)ip2;
-
-        /* If divisor is 0, we need not compute anything */
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            BINARY_LOOP_SLIDING {
-                *((@type@ *)op1) = 0;
-            }
-        }
-        else {
-            struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
-            BINARY_LOOP_SLIDING {
-                const @type@ in1 = *(@type@ *)ip1;
-                /*
-                 * FIXME: On x86 at least, dividing the smallest representable integer
-                 * by -1 causes a SIFGPE (division overflow). We treat this case here
-                 * (to avoid a SIGFPE crash at python level), but a good solution would
-                 * be to treat integer division problems separately from FPU exceptions
-                 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
-                 */
-                if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
-                    npy_set_floatstatus_divbyzero();
-                    *((@type@ *)op1) = 0;
-                }
-                else {
-                    *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
-
-                    /* Negative quotients needs to be rounded down */
-                    if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
-                        *((@type@ *)op1) = *((@type@ *)op1) - 1;
-                    }
-                }
-            }
-        }
-    }
-    else {
-        BINARY_LOOP_SLIDING {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /*
-             * FIXME: On x86 at least, dividing the smallest representable integer
-             * by -1 causes a SIFGPE (division overflow). We treat this case here
-             * (to avoid a SIGFPE crash at python level), but a good solution would
-             * be to treat integer division problems separately from FPU exceptions
-             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
-             */
-            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
-                npy_set_floatstatus_divbyzero();
-                *((@type@ *)op1) = 0;
-            }
-            else {
-                *((@type@ *)op1) = in1/in2;
-
-                /* Negative quotients needs to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
-                    *((@type@ *)op1) = *((@type@ *)op1) - 1;
-                }
-            }
-        }
-    }
-}
-
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
```

```diff
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 0301aa5ed..bb07e047c 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -58,7 +58,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 #endif
 
 /**begin repeat
- * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+           BYTE, SHORT, INT, LONG, LONGLONG#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
@@ -152,9 +153,6 @@ NPY_NO_EXPORT void
 @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
 @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
```
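Both the deleted libdivide loop and its replacement carry the same quotient fix-up, because C's `/` truncates toward zero while `floor_divide` must round toward negative infinity. A small self-contained illustration of that correction (a sketch, not NumPy code; `floor_div` is a hypothetical helper):

```c
#include <stdio.h>

/* Truncating division rounds -7/2 to -3; floor division wants -4.
 * The fix: when the operand signs differ and the division was inexact,
 * step the quotient down by one. */
static int floor_div(int n, int d)
{
    int q = n / d;                    /* C truncates toward zero */
    if (((n > 0) != (d > 0)) && q * d != n) {
        q--;                          /* round inexact negative quotients down */
    }
    return q;
}

int main(void)
{
    printf("%d %d\n", -7 / 2, floor_div(-7, 2));  /* -3 -4 */
    printf("%d %d\n", 7 / -2, floor_div(7, -2));  /* -3 -4 */
    printf("%d %d\n", -6 / 2, floor_div(-6, 2));  /* -3 -3 (exact, no fix-up) */
    return 0;
}
```

The new dispatch source adds signed SIMD kernels that perform this correction with vector arithmetic: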
```diff
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 7e9f46463..19e05f2b5 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -20,10 +20,91 @@
 //###############################################################################
 /********************************************************************************
  ** Defining the SIMD kernels
+ *
+ * Signed integer floor division is based on T. Granlund and P. L. Montgomery,
+ * "Division by invariant integers using multiplication" (see Figure 6.1,
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
+ * For details on TRUNC division, see simd/intdiv.h.
+ ***********************************************************************************
+ ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
+ ***********************************************************************************
+ * For q = FLOOR(a/d), all sword:
+ *     sword -dsign = SRL(d, N - 1);
+ *     uword -nsign = (n < -dsign);
+ *     uword -qsign = EOR(-nsign, -dsign);
+ *     q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
 ********************************************************************************/
+
 #if NPY_SIMD
 /**begin repeat
- * #sfx = u8, u16, u32, u64#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
+ */
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep            = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
+    if (scalar == -1) {
+        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        npyv_@sfx@ vzero      = npyv_zero_@sfx@();
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@  a      = npyv_load_@sfx@(src);
+            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow          = npyv_and_b@len@(noverflow, gt_min);
+            npyv_@sfx@  neg    = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
+            npyv_store_@sfx@(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_@sfx@ a = *src;
+            if (a == NPY_MIN_INT@len@) {
+                raise_err = 1;
+                *dst = 0;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_divbyzero();
+        }
+    } else {
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ nsign_d   = npyv_setall_@sfx@(scalar < 0);
+            npyv_@sfx@ a         = npyv_load_@sfx@(src);
+            npyv_@sfx@ nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a              = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ to_ninf   = npyv_xor_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            npyv_@sfx@ floor     = npyv_sub_@sfx@(trunc, to_ninf);
+            npyv_store_@sfx@(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_@sfx@ a = *src;
+            npyv_lanetype_@sfx@ r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
  */
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
@@ -44,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
         const npyv_lanetype_@sfx@ a = *src;
         *dst = a / scalar;
     }
-    npyv_cleanup();
 }
 /**end repeat**/
 
```
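The lane-wise arithmetic in the signed kernel is easier to read in scalar form. Below is a branchless scalar rendition of Figure 6.1 using the same names as the kernel's intermediates (`nsign_d`, `nsign_a`, `diff_sign`, `to_ninf`); the function name is invented for illustration and the SIMD kernel additionally replaces the `/` with the precomputed-multiplier division from `intdiv.h`:

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar rendition of Figure 6.1: floor division built from one truncating
 * division plus sign bookkeeping, with no data-dependent branches. */
static int32_t floor_div_s32_branchless(int32_t n, int32_t d)
{
    int32_t nsign_d   = (d < 0);            /* -dsign: 1 iff divisor < 0   */
    int32_t nsign_a   = (n < nsign_d);      /* -nsign: per Figure 6.1      */
    int32_t diff_sign = nsign_a - nsign_d;  /* bias so TRUNC equals FLOOR  */
    int32_t to_ninf   = nsign_a ^ nsign_d;  /* -qsign: post-division fixup */
    return (n + diff_sign) / d - to_ninf;
}

int main(void)
{
    printf("%d\n", floor_div_s32_branchless(-7, 2));  /* -4 */
    printf("%d\n", floor_div_s32_branchless(7, -2));  /* -4 */
    printf("%d\n", floor_div_s32_branchless(7, 2));   /*  3 */
    return 0;
}
```

The final hunk of loops_arithmetic.dispatch.c.src wires these kernels into the ufunc inner loops: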
```diff
@@ -55,6 +135,70 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 ********************************************************************************/
 
 /**begin repeat
+ * Signed types
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+
+NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    @type@ r = n / d;
+    // Negative quotients need to be rounded down
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
+        }
+        *((@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
+        }
+    }
+}
+/**end repeat**/
+
+/**begin repeat
  * Unsigned types
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
  * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
```
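The guard at the top of `floor_div_@TYPE@` exists because neither special case can be left to the hardware: on x86, division by zero and `INT_MIN / -1` both raise a SIGFPE (the latter because the true quotient overflows). A minimal sketch of the failure mode the guard prevents (illustrative, not NumPy code):

```c
#include <limits.h>
#include <stdio.h>

int main(void)
{
    int n = INT_MIN, d = -1;
    /* The two guarded cases; both would be fatal without a check. */
    if (d == 0 || (n == INT_MIN && d == -1)) {
        /* NumPy returns 0 here and reports the condition through
         * npy_set_floatstatus_divbyzero(), surfacing as a Python warning. */
        puts("overflow/zero-division guarded: result set to 0");
    }
    else {
        printf("%d\n", n / d);  /* unguarded INT_MIN / -1 traps on x86 */
    }
    return 0;
}
```

Mapping the trap onto the existing floating-point error machinery is the compromise the FIXME comment in the hunk above acknowledges: it keeps Python-level code from crashing at the cost of reporting an integer overflow as a divide-by-zero status.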