Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/src/umath/loops_arithmetic.dispatch.c.src  124
1 file changed, 61 insertions(+), 63 deletions(-)
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index a52bb36b7..19e05f2b5 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -34,6 +34,7 @@
  * uword -qsign = EOR(-nsign, -dsign);
  * q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
  ********************************************************************************/
+
 #if NPY_SIMD
 /**begin repeat
  * Signed types
@@ -49,34 +50,28 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
     const int vstep = npyv_nlanes_@sfx@;
     const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
 
-    if (scalar == (npyv_lanetype_@sfx@)-1) {
+    if (scalar == -1) {
         npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
         npyv_@sfx@ vzero = npyv_zero_@sfx@();
         for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
-            npyv_@sfx@ a = npyv_load_@sfx@(src);
-            npyv_b@len@ greater_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
-            noverflow = npyv_and_b@len@(noverflow, greater_min);
-            npyv_@sfx@ neg = npyv_ifsub_@sfx@(greater_min, vzero, a, vzero);
-
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow = npyv_and_b@len@(noverflow, gt_min);
+            npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
             npyv_store_@sfx@(dst, neg);
         }
-        npy_uint64 tobits = npyv_tobits_b@len@(noverflow);
-    #if npyv_nlanes_@sfx@ == 64
-        int raise = (~tobits) != 0;
-    #else
-        int raise = tobits != (1ULL << vstep)-1;
-    #endif
+        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
 
         for (; len > 0; --len, ++src, ++dst) {
             npyv_lanetype_@sfx@ a = *src;
             if (a == NPY_MIN_INT@len@) {
-                raise = NPY_TRUE;
+                raise_err = 1;
                 *dst = 0;
             } else {
                 *dst = -a;
             }
         }
-        if (raise) {
+        if (raise_err) {
            npy_set_floatstatus_divbyzero();
         }
     } else {
@@ -89,20 +84,19 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
             npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
             npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
             npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
-
             npyv_store_@sfx@(dst, floor);
         }
 
         for (; len > 0; --len, ++src, ++dst) {
             const npyv_lanetype_@sfx@ a = *src;
-            *dst = a / scalar;
-            /* Negative quotients needs to be rounded down */
-            if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
-                *dst = *dst - 1;
+            npyv_lanetype_@sfx@ r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
             }
+            *dst = r;
         }
     }
-
     npyv_cleanup();
 }
 /**end repeat**/
@@ -131,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
         const npyv_lanetype_@sfx@ a = *src;
         *dst = a / scalar;
     }
-
     npyv_cleanup();
 }
 /**end repeat**/
@@ -143,8 +136,8 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 
 /**begin repeat
  * Signed types
- * #type = byte, short, int, long, longlong#
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
 #undef TO_SIMD_SFX
 #if 0
@@ -159,42 +152,47 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 #if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
 #endif
+
+NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    @type@ r = n / d;
+    // Negative quotients need to be rounded down
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(npy_@type@) {
-            const npy_@type@ d = *(npy_@type@ *)ip2;
-            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_@type@)NPY_MIN_@TYPE@ && d == (npy_@type@)-1))) {
-                npy_set_floatstatus_divbyzero();
-                io1 = 0;
-            } else {
-                io1 /= d;
-            }
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
         }
-        *((npy_@type@ *)iop1) = io1;
+        *((@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
     // for contiguous block of memory, divisor is a scalar and not 0
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@type@), NPY_SIMD_WIDTH) &&
-             (*(npy_@type@ *)args[1]) != 0) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
         TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
     }
 #endif
     else {
         BINARY_LOOP {
-            const npy_@type@ in1 = *(npy_@type@ *)ip1;
-            const npy_@type@ in2 = *(npy_@type@ *)ip2;
-            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_@type@)NPY_MIN_@TYPE@ && in2 == (npy_@type@)-1))) {
-                npy_set_floatstatus_divbyzero();
-                *((npy_@type@ *)op1) = 0;
-            } else{
-                *((npy_@type@ *)op1) = in1 / in2;
-                /* Negative quotients needs to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) {
-                    *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1;
-                }
-            }
+            *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
         }
     }
 }
@@ -202,16 +200,16 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
 
 /**begin repeat
  * Unsigned types
- * #type = byte, short, int, long, longlong#
- * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
-
 #undef TO_SIMD_SFX
 #if 0
 /**begin repeat1
  * #len = 8, 16, 32, 64#
  */
-#elif NPY_BITSOF_@TYPE@ == @len@
+#elif NPY_BITSOF_@STYPE@ == @len@
     #define TO_SIMD_SFX(X) X##_u@len@
 /**end repeat1**/
 #endif
@@ -223,40 +221,40 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
  * Power10(VSX4) is an exception here since it has native support for integer vector division, note neither infrastructure nor NPYV has supported VSX4 yet.
  */
-#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
     #undef TO_SIMD_SFX
 #endif
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(U@TYPE@_divide)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     if (IS_BINARY_REDUCE) {
-        BINARY_REDUCE_LOOP(npy_u@type@) {
-            const npy_u@type@ d = *(npy_u@type@ *)ip2;
-            if (NPY_UNLIKELY(d == 0 || (io1 == (npy_u@type@)NPY_MIN_@TYPE@ && d == (npy_u@type@)-1))) {
+        BINARY_REDUCE_LOOP(@type@) {
+            const @type@ d = *(@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {
                 npy_set_floatstatus_divbyzero();
                 io1 = 0;
             } else {
                 io1 /= d;
             }
         }
-        *((npy_u@type@ *)iop1) = io1;
+        *((@type@ *)iop1) = io1;
     }
 #if NPY_SIMD && defined(TO_SIMD_SFX)
     // for contiguous block of memory, divisor is a scalar and not 0
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_u@type@), NPY_SIMD_WIDTH) &&
-             (*(npy_u@type@ *)args[1]) != 0) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
         TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
     }
 #endif
     else {
         BINARY_LOOP {
-            const npy_u@type@ in1 = *(npy_u@type@ *)ip1;
-            const npy_u@type@ in2 = *(npy_u@type@ *)ip2;
-            if (NPY_UNLIKELY(in2 == 0 || (in1 == (npy_u@type@)NPY_MIN_@TYPE@ && in2 == (npy_u@type@)-1))) {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {
                 npy_set_floatstatus_divbyzero();
-                *((npy_u@type@ *)op1) = 0;
+                *((@type@ *)op1) = 0;
             } else{
-                *((npy_u@type@ *)op1) = in1 / in2;
+                *((@type@ *)op1) = in1 / in2;
             }
         }
     }
 }
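
The core of the patch is the shared floor_div_@TYPE@ helper. Below is a rough standalone sketch of what it expands to for @type@ = npy_int: the npy_* types, NPY_UNLIKELY(), and the FPU-status call are swapped for plain C so it compiles outside NumPy, and the name floor_div_int is ours, not NumPy's.

/*
 * Standalone model of floor_div_@TYPE@ for @type@ = npy_int.
 */
#include <limits.h>
#include <stdio.h>

static int
floor_div_int(int n, int d)
{
    /* d == 0 and INT_MIN / -1 would both trap on x86; return 0 instead
     * (NumPy additionally raises the divide-by-zero FPU flag here). */
    if (d == 0 || (n == INT_MIN && d == -1)) {
        return 0;
    }
    int r = n / d;
    /* C division truncates toward zero; round negative quotients down. */
    if (((n > 0) != (d > 0)) && (r * d != n)) {
        r--;
    }
    return r;
}

int main(void)
{
    printf("%d\n", floor_div_int(-7, 2));       /* -4, not the truncated -3 */
    printf("%d\n", floor_div_int(7, -2));       /* -4 */
    printf("%d\n", floor_div_int(INT_MIN, -1)); /* 0, instead of a SIGFPE */
    return 0;
}

Factoring this into one NPY_FINLINE helper lets the reduce loop and the generic strided loop share the exact same overflow and rounding behavior instead of duplicating it inline, which is what the removed lines in both branches used to do.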
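The SIMD path relies on the branchless identity quoted in the header comment at the top of the diff (Granlund and Montgomery's Figure 6.1). A scalar model of that identity is sketched below, assuming 32-bit lanes and a nonzero divisor; the variables hold the "-dsign", "-nsign", and "-qsign" quantities from the comment, and the name floor_div_branchless is ours, not NumPy's.

/*
 * Scalar model of: q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign)
 */
#include <stdint.h>
#include <stdio.h>

static int32_t
floor_div_branchless(int32_t n, int32_t d)
{
    int32_t dsign = (int32_t)((uint32_t)d >> 31); /* -dsign = SRL(d, N-1): 1 iff d < 0 */
    int32_t nsign = n < dsign;                    /* -nsign = (n < -dsign) */
    int32_t qsign = nsign ^ dsign;                /* -qsign = EOR(-nsign, -dsign) */
    /* Bias the numerator so truncation lands on the floor, then correct. */
    return (n - dsign + nsign) / d - qsign;
}

int main(void)
{
    printf("%d %d %d\n",
           floor_div_branchless(-7, 2),   /* -4 */
           floor_div_branchless(7, -2),   /* -4 */
           floor_div_branchless(-7, -2)); /*  3 */
    return 0;
}

Note that the identity still overflows for INT32_MIN / -1, which is why the vector routine keeps scalar == -1 in a separate branch that only negates, zeroes the overflowing lane, and raises the divide-by-zero flag.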
