Diffstat (limited to 'numpy'):
 numpy/core/code_generators/generate_umath.py          |   2
 numpy/core/src/common/simd/intdiv.h                   |  24
 numpy/core/src/umath/loops.c.src                      |  86
 numpy/core/src/umath/loops.h.src                      |   6
 numpy/core/src/umath/loops_arithmetic.dispatch.c.src  | 148
 5 files changed, 161 insertions, 105 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 6b6a0fe64..9e94f9ccc 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -328,7 +328,7 @@ defdict = {
docstrings.get('numpy.core.umath.floor_divide'),
'PyUFunc_DivisionTypeResolver',
TD(ints, cfunc_alias='divide',
- dispatch=[('loops_arithmetic', 'BHILQ')]),
+ dispatch=[('loops_arithmetic', 'bBhHiIlLqQ')]),
TD(flts + cmplx),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 1ce3b4df8..f6ea9abf2 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -368,18 +368,18 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
{
npy_int32 d1 = abs(d);
npy_int32 sh, m;
- if (d1 > 1) {
+ // Handle abs overflow
+ if ((npy_uint32)d == 0x80000000U) {
+ m = 0x80000001;
+ sh = 30;
+ }
+ else if (d1 > 1) {
sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
m = (1ULL << (32 + sh)) / d1 + 1; // multiplier
}
else if (d1 == 1) {
sh = 0; m = 1;
}
- // fix abs overflow
- else if (d == (1 << 31)) {
- m = d + 1;
- sh = 30;
- }
else {
// raise arithmetic exception for d == 0
sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE
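
As a standalone sanity check on the constants chosen in the overflow branch (m = 0x80000001, sh = 30 for d == INT32_MIN), the sketch below models signed division by a precomputed (m, sh, dsign) triple. The step order is paraphrased from the multiply-high/shift scheme this header implements and should be read as an assumption; divc_s32_model is a made-up name, not a NumPy API, and the code assumes arithmetic right shifts on signed integers:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical scalar model of signed division by a precomputed
     * (m, sh, dsign) triple; assumes arithmetic right shift on signed ints. */
    static int32_t divc_s32_model(int32_t a, int32_t m, int32_t sh, int32_t dsign)
    {
        int32_t mulhi = (int32_t)(((int64_t)a * m) >> 32); // high half of a*m
        int32_t q = ((a + mulhi) >> sh) - (a >> 31);       // q = trunc(a/|d|)
        return (q ^ dsign) - dsign;                        // negate when d < 0
    }

    int main(void)
    {
        // Parameters from the overflow branch: d == INT32_MIN, so dsign == -1.
        const int32_t m = (int32_t)0x80000001, sh = 30, dsign = -1;
        printf("%d\n", divc_s32_model(INT32_MIN, m, sh, dsign)); // 1
        printf("%d\n", divc_s32_model(INT32_MAX, m, sh, dsign)); // 0
        printf("%d\n", divc_s32_model(-1, m, sh, dsign));        // 0
        return 0;
    }

All three probes print the truncated quotient of a / INT32_MIN, which is what the vector kernels must produce for this divisor.
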
@@ -445,18 +445,18 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
#else
npy_int64 d1 = llabs(d);
npy_int64 sh, m;
- if (d1 > 1) {
+ // Handle abs overflow
+ if ((npy_uint64)d == 0x8000000000000000ULL) {
+ m = 0x8000000000000001LL;
+ sh = 62;
+ }
+ else if (d1 > 1) {
sh = npyv__bitscan_revnz_u64(d1 - 1); // ceil(log2(abs(d))) - 1
m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier
}
else if (d1 == 1) {
sh = 0; m = 1;
}
- // fix abs overflow
- else if (d == (1LL << 63)) {
- m = d + 1;
- sh = 62;
- }
else {
// raise arithmetic exception for d == 0
sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
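
Mirroring the 32-bit branch, the 64-bit multiplier for d1 > 1 is m = 2**(64 + sh) / d1 + 1, which exceeds native integer arithmetic; npyv__divh128_u64 exists to perform that 128-bit division portably. Where the compiler provides the unsigned __int128 extension (GCC/Clang), the same value can be modeled directly. This is only an illustrative cross-check, not how intdiv.h computes it:

    #include <stdint.h>

    /* Model of the 64-bit multiplier for d1 > 1: m = 2**(64 + sh) / d1 + 1.
     * sh = ceil(log2(d1)) - 1 is at most 62, so the shift stays below 128.
     * Assumes unsigned __int128 support (GCC/Clang extension). */
    static uint64_t magic64_model(uint64_t d1, unsigned sh)
    {
        unsigned __int128 pow2 = (unsigned __int128)1 << (64 + sh);
        return (uint64_t)(pow2 / d1) + 1;
    }

For instance, magic64_model(3, 1) yields 0xAAAAAAAAAAAAAAAB, the familiar divide-by-three constant.
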
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 04665dc52..683bd0178 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -843,92 +843,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}
-/* Libdivide only supports 32 and 64 bit types
- * We try to pick the best possible one */
-#if NPY_BITSOF_@TYPE@ <= 32
-#define libdivide_@type@_t libdivide_s32_t
-#define libdivide_@type@_gen libdivide_s32_gen
-#define libdivide_@type@_do libdivide_s32_do
-#else
-#define libdivide_@type@_t libdivide_s64_t
-#define libdivide_@type@_gen libdivide_s64_gen
-#define libdivide_@type@_do libdivide_s64_do
-#endif
-
-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- BINARY_DEFS
-
- /* When the divisor is a constant, use libdivide for faster division */
- if (steps[1] == 0) {
- /* In case of empty array, just return */
- if (n == 0) {
- return;
- }
-
- const @type@ in2 = *(@type@ *)ip2;
-
- /* If divisor is 0, we need not compute anything */
- if (in2 == 0) {
- npy_set_floatstatus_divbyzero();
- BINARY_LOOP_SLIDING {
- *((@type@ *)op1) = 0;
- }
- }
- else {
- struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
- BINARY_LOOP_SLIDING {
- const @type@ in1 = *(@type@ *)ip1;
- /*
- * FIXME: On x86 at least, dividing the smallest representable integer
- * by -1 causes a SIFGPE (division overflow). We treat this case here
- * (to avoid a SIGFPE crash at python level), but a good solution would
- * be to treat integer division problems separately from FPU exceptions
- * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
- */
- if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
- npy_set_floatstatus_divbyzero();
- *((@type@ *)op1) = 0;
- }
- else {
- *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
-
- /* Negative quotients needs to be rounded down */
- if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
- *((@type@ *)op1) = *((@type@ *)op1) - 1;
- }
- }
- }
- }
- }
- else {
- BINARY_LOOP_SLIDING {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ in2 = *(@type@ *)ip2;
- /*
- * FIXME: On x86 at least, dividing the smallest representable integer
- * by -1 causes a SIFGPE (division overflow). We treat this case here
- * (to avoid a SIGFPE crash at python level), but a good solution would
- * be to treat integer division problems separately from FPU exceptions
- * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
- */
- if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
- npy_set_floatstatus_divbyzero();
- *((@type@ *)op1) = 0;
- }
- else {
- *((@type@ *)op1) = in1/in2;
-
- /* Negative quotients needs to be rounded down */
- if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
- *((@type@ *)op1) = *((@type@ *)op1) - 1;
- }
- }
- }
- }
-}
-
NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 0301aa5ed..bb07e047c 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -58,7 +58,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
#endif
/**begin repeat
- * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ BYTE, SHORT, INT, LONG, LONGLONG#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
@@ -152,9 +153,6 @@ NPY_NO_EXPORT void
@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
index 7e9f46463..19e05f2b5 100644
--- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -20,10 +20,91 @@
//###############################################################################
/********************************************************************************
** Defining the SIMD kernels
+ *
+ * Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
+ * "Division by Invariant Integers Using Multiplication" (see Figure 6.1,
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
+ * For details on TRUNC division, see simd/intdiv.h.
+ ***********************************************************************************
+ ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
+ ***********************************************************************************
+ * For q = FLOOR(a/d), all sword:
+ *     sword -dsign = SRL(d, N - 1);
+ *     uword -nsign = (n < -dsign);
+ *     uword -qsign = EOR(-nsign, -dsign);
+ *     q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
********************************************************************************/
+
#if NPY_SIMD
/**begin repeat
- * #sfx = u8, u16, u32, u64#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
+ */
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
+{
+ npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
+ npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+ npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+ const int vstep = npyv_nlanes_@sfx@;
+ const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
+ if (scalar == -1) {
+ npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+ npyv_@sfx@ vzero = npyv_zero_@sfx@();
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+ npyv_@sfx@ a = npyv_load_@sfx@(src);
+ npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+ noverflow = npyv_and_b@len@(noverflow, gt_min);
+ npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
+ npyv_store_@sfx@(dst, neg);
+ }
+
+ int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
+ for (; len > 0; --len, ++src, ++dst) {
+ npyv_lanetype_@sfx@ a = *src;
+ if (a == NPY_MIN_INT@len@) {
+ raise_err = 1;
+ *dst = 0;
+ } else {
+ *dst = -a;
+ }
+ }
+ if (raise_err) {
+ npy_set_floatstatus_divbyzero();
+ }
+ } else {
+ for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+ npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0);
+ npyv_@sfx@ a = npyv_load_@sfx@(src);
+ npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+ nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+ npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+ npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
+ npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+ npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
+ npyv_store_@sfx@(dst, floor);
+ }
+
+ for (; len > 0; --len, ++src, ++dst) {
+ const npyv_lanetype_@sfx@ a = *src;
+ npyv_lanetype_@sfx@ r = a / scalar;
+ // Negative quotients need to be rounded down
+ if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+ r--;
+ }
+ *dst = r;
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
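
The vector loop above transcribes the Figure 6.1 identity lane by lane (nsign_d, nsign_a, to_ninf, then a single truncated division via npyv_divc). A scalar rendering can make the sign bookkeeping easier to follow; floor_div_model below is an illustrative sketch with a made-up name, using plain int for brevity:

    #include <stdio.h>

    /* Scalar rendering of the Figure 6.1 identity:
     * floor(n/d) from one truncated division plus sign bookkeeping. */
    static int floor_div_model(int n, int d)
    {
        int dsign = (d < 0);       /* "-dsign" in the paper's notation */
        int nsign = (n < dsign);   /* "-nsign" = (n < -dsign) */
        int qsign = nsign ^ dsign; /* set when n and d have opposite signs */
        return (n - dsign + nsign) / d - qsign;
    }

    int main(void)
    {
        printf("%d\n", floor_div_model(-7, 3)); /* -3; truncation alone gives -2 */
        printf("%d\n", floor_div_model( 7, -3)); /* -3 */
        printf("%d\n", floor_div_model(-6, 3)); /* -2; exact case needs no fix-up */
        return 0;
    }

The shifted numerator makes the truncated quotient land exactly one above the floor whenever qsign is set, so the final subtraction never over-corrects an exact division.
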
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
*/
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
@@ -44,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
const npyv_lanetype_@sfx@ a = *src;
*dst = a / scalar;
}
-
npyv_cleanup();
}
/**end repeat**/
@@ -55,6 +135,70 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
********************************************************************************/
/**begin repeat
+ * Signed types
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+ #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+ #undef TO_SIMD_SFX
+#endif
+
+NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
+{
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+ * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
+ npy_set_floatstatus_divbyzero();
+ return 0;
+ }
+ @type@ r = n / d;
+ // Negative quotients need to be rounded down
+ if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+ r--;
+ }
+ return r;
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (IS_BINARY_REDUCE) {
+ BINARY_REDUCE_LOOP(@type@) {
+ io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
+ }
+ *((@type@ *)iop1) = io1;
+ }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+ // for a contiguous block of memory with a scalar (stride-0), nonzero divisor
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+ (*(@type@ *)args[1]) != 0) {
+ TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+ }
+#endif
+ else {
+ BINARY_LOOP {
+ *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
+ }
+ }
+}
+/**end repeat**/
+
+/**begin repeat
* Unsigned types
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#