summary refs log tree commit diff
path: root/numpy
diff options
context:
space:
mode:
author Ganesh Kathiresan <ganesh3597@gmail.com> 2021-04-13 21:08:48 +0530
committer Sayed Adel <seiko@imavr.com> 2021-05-20 23:19:50 +0200
commit 0b8838ef0c2e4c5d9e66163d260dc30902cc6170 (patch)
tree 14d43fce4ad05b2bd6bc8ab89c33bfc569ebaf35 /numpy
parent 7f9d342324a730185cdf215c66d73530033436ab (diff)
download numpy-0b8838ef0c2e4c5d9e66163d260dc30902cc6170.tar.gz
SIMD: Added floor divide logic for signed
Diffstat (limited to 'numpy')
-rw-r--r-- numpy/core/src/umath/loops_arithmetic.dispatch.c.src 107
1 files changed, 81 insertions, 26 deletions
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index c071edb3b..55066589f 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -23,9 +23,53 @@
********************************************************************************/
#if NPY_SIMD
/**begin repeat
- * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64, 8, 16, 32, 64#
* #signed = 0*4, 1*4#
*/
+#if @signed@
+/*
+ * Floor division of one vector of signed lanes by a scalar.
+ *
+ * src     - lanes to divide; one full vector is read with npyv_load_@sfx@.
+ * divisor - precomputed divisor parameters consumed by npyv_divc_@sfx@.
+ * scalar  - the plain divisor value; only its sign and the -1 special case
+ *           are inspected here. A zero scalar is not handled; callers are
+ *           expected to filter it out.
+ *
+ * Returns the quotients rounded toward negative infinity. When scalar is -1,
+ * lanes holding NPY_MIN_INT@len@ produce 0 and the divide-by-zero
+ * floating-point status flag is raised.
+ *
+ * The quotient from npyv_divc_@sfx@ is treated as rounding toward zero, so
+ * in the lanes where dividend and divisor have opposite signs the dividend
+ * is moved one step toward zero and the quotient is lowered by one:
+ *   floor(a / d) = trunc((a + diff_sign) / d) - to_ninf
+ */
+static NPY_INLINE npyv_@sfx@
+simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
+{
+    npyv_@sfx@ a, nsign_d, nsign_a, diff_sign, to_ninf, trunc, floor;
+
+    /* 1 in every lane when the divisor is negative, otherwise 0 */
+    nsign_d = npyv_setall_@sfx@(scalar < 0);
+    a = npyv_load_@sfx@(src);
+    /* 1 where a < 0 (non-negative divisor) or a <= 0 (negative divisor) */
+    nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+    nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+    /* +1, 0 or -1: nudges the dividend toward zero where the flags differ */
+    diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+    /* 1 where the two sign flags differ, i.e. the quotient must be lowered */
+    to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
+    trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+
+    if (NPY_UNLIKELY(-1 == scalar)) {
+        /* NPY_MIN_INT@len@ / -1 is not representable; find the safe lanes */
+        const npyv_b@len@ all_lanes = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        const npyv_b@len@ greater_min =
+            npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+
+        /*
+         * Compare against the bits of an all-true mask instead of computing
+         * (1 << nlanes) - 1: that shift is undefined once the lane count
+         * reaches 64 (e.g. 8-bit lanes in a 512-bit vector).
+         */
+        if (npyv_tobits_b@len@(greater_min) != npyv_tobits_b@len@(all_lanes)) {
+            npy_set_floatstatus_divbyzero();
+        }
+        /* overflowing lanes fall back to zero, the rest get trunc - to_ninf */
+        floor = npyv_ifsub_@sfx@(greater_min, trunc, to_ninf, npyv_zero_@sfx@());
+    }
+    else {
+        floor = npyv_sub_@sfx@(trunc, to_ninf);
+    }
+
+    return floor;
+}
+#else
+/*
+ * Unsigned counterpart of the vector floor division helper.
+ *
+ * Reads one full vector from src and divides it by the precomputed
+ * divisor. No rounding correction is applied in this variant, and
+ * scalar is accepted only so both variants share one signature.
+ */
+static NPY_INLINE npyv_@sfx@
+simd_floor_divide_@sfx@(npyv_lanetype_@sfx@ *src, const npyv_@sfx@x3 divisor, npyv_lanetype_@sfx@ scalar)
+{
+    const npyv_@sfx@ dividend = npyv_load_@sfx@(src);
+
+    return npyv_divc_@sfx@(dividend, divisor);
+}
+#endif
+
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
@@ -36,20 +80,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
- npyv_@sfx@ a = npyv_load_@sfx@(src);
- npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
+ npyv_@sfx@ c = simd_floor_divide_@sfx@(src, divisor, scalar);
npyv_store_@sfx@(dst, c);
}
for (; len > 0; --len, ++src, ++dst) {
const npyv_lanetype_@sfx@ a = *src;
- *dst = a / scalar;
+ if (scalar == 0 || (a == (npyv_lanetype_@sfx@)NPY_MIN_INT@len@ && scalar == (npyv_lanetype_@sfx@)-1)) {
+ npy_set_floatstatus_divbyzero();
+ *dst = 0;
+ } else {
+ *dst = a / scalar;
#if @signed@
- /* Negative quotients needs to be rounded down */
- if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
- *dst = *dst - 1;
- }
+ /* Negative quotients needs to be rounded down */
+ if (((a > 0) != (scalar > 0)) && (*dst * scalar != a)) {
+ *dst = *dst - 1;
+ }
#endif
+ }
}
npyv_cleanup();
@@ -68,19 +116,24 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
*/
/**begin repeat1
- * #s = , u#
- * #S = , U#
- * #slen = s, u#
* #signed = 1, 0#
*/
#undef TO_SIMD_SFX
+#undef SIMD_TYPE
+#undef SIMD_DIVIDE
#if 0
/**begin repeat2
* #len = 8, 16, 32, 64#
*/
+#elif NPY_BITSOF_@TYPE@ == @len@ && @signed@
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #define SIMD_TYPE npy_@type@
+ #define SIMD_DIVIDE @TYPE@_divide
#elif NPY_BITSOF_@TYPE@ == @len@
- #define TO_SIMD_SFX(X) X##_@slen@@len@
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #define SIMD_TYPE npy_u@type@
+ #define SIMD_DIVIDE U@TYPE@_divide
/**end repeat2**/
#endif
/*
@@ -93,42 +146,44 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
*/
#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#undef TO_SIMD_SFX
+ #undef SIMD_TYPE
+ #undef SIMD_DIVIDE
#endif
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@S@@TYPE@_divide)
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SIMD_DIVIDE)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (IS_BINARY_REDUCE) {
- BINARY_REDUCE_LOOP(npy_@s@@type@) {
- const npy_@s@@type@ d = *(npy_@s@@type@ *)ip2;
- if (NPY_UNLIKELY(d == 0)) {
+ BINARY_REDUCE_LOOP(SIMD_TYPE) {
+ const SIMD_TYPE d = *(SIMD_TYPE *)ip2;
+ if (NPY_UNLIKELY(d == 0 || (io1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && d == (SIMD_TYPE)-1))) {
npy_set_floatstatus_divbyzero();
io1 = 0;
} else {
io1 /= d;
}
}
- *((npy_@s@@type@ *)iop1) = io1;
+ *((SIMD_TYPE *)iop1) = io1;
}
#if NPY_SIMD && defined(TO_SIMD_SFX)
// for contiguous block of memory, divisor is a scalar and not 0
- else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_@s@@type@), NPY_SIMD_WIDTH) &&
- (*(npy_@s@@type@ *)args[1]) != 0) {
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(SIMD_TYPE), NPY_SIMD_WIDTH) &&
+ (*(SIMD_TYPE *)args[1]) != 0) {
TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
}
#endif
else {
BINARY_LOOP {
- const npy_@s@@type@ in1 = *(npy_@s@@type@ *)ip1;
- const npy_@s@@type@ in2 = *(npy_@s@@type@ *)ip2;
- if (NPY_UNLIKELY(in2 == 0)) {
+ const SIMD_TYPE in1 = *(SIMD_TYPE *)ip1;
+ const SIMD_TYPE in2 = *(SIMD_TYPE *)ip2;
+ if (NPY_UNLIKELY(in2 == 0 || (in1 == (SIMD_TYPE)NPY_MIN_@TYPE@ && in2 == (SIMD_TYPE)-1))) {
npy_set_floatstatus_divbyzero();
- *((npy_@s@@type@ *)op1) = 0;
+ *((SIMD_TYPE *)op1) = 0;
} else{
- *((npy_@s@@type@ *)op1) = in1 / in2;
+ *((SIMD_TYPE *)op1) = in1 / in2;
#if @signed@
/* Negative quotients needs to be rounded down */
- if (((in1 > 0) != (in2 > 0)) && (*((npy_@type@ *)op1) * in2 != in1)) {
- *((npy_@type@ *)op1) = *((npy_@type@ *)op1) - 1;
+ if (((in1 > 0) != (in2 > 0)) && (*((SIMD_TYPE *)op1) * in2 != in1)) {
+ *((SIMD_TYPE *)op1) = *((SIMD_TYPE *)op1) - 1;
}
#endif
}