3 files changed, 68 insertions, 8 deletions
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 02920014b..4a75e3293 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -793,7 +793,7 @@ NPY_NO_EXPORT PyArray_StridedUnaryOp *
 
 #endif
 
-static void
+static NPY_GCC_OPT_3 void
 @prefix@_cast_@name1@_to_@name2@(
                         char *dst, npy_intp dst_stride,
                         char *src, npy_intp src_stride,
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3f5048592..ee7e7652d 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -52,6 +52,19 @@
         && (steps[0] == steps[2])\
         && (steps[0] == 0))
 
+/* binary loop: both inputs and the output are contiguous */
+#define IS_BINARY_CONT(tin, tout) (steps[0] == sizeof(tin) && \
+                                   steps[1] == sizeof(tin) && \
+                                   steps[2] == sizeof(tout))
+/* binary loop: first input is a scalar (stride 0), the rest contiguous */
+#define IS_BINARY_CONT_S1(tin, tout) (steps[0] == 0 && \
+                                   steps[1] == sizeof(tin) && \
+                                   steps[2] == sizeof(tout))
+/* binary loop: second input is a scalar (stride 0), the rest contiguous */
+#define IS_BINARY_CONT_S2(tin, tout) (steps[0] == sizeof(tin) && \
+                                   steps[1] == 0 && \
+                                   steps[2] == sizeof(tout))
+
 #define OUTPUT_LOOP\
     char *op1 = args[1];\
     npy_intp os1 = steps[1];\
@@ -803,13 +816,45 @@ NPY_NO_EXPORT void
  * #OP =  ==, !=, >, >=, <, <=, &&, ||#
  */
 
-NPY_NO_EXPORT void
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
 @TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        *((npy_bool *)op1) = in1 @OP@ in2;
+    /* Fast paths for contiguous operands use plain indexed loops.
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    if (IS_BINARY_CONT(@type@, npy_bool)) {
+        npy_intp i, n = dimensions[0];
+        @type@ * a = (@type@ *)args[0], * b = (@type@ *)args[1];
+        npy_bool * o = (npy_bool *)args[2];
+        for (i = 0; i < n; i++) {
+            o[i] = a[i] @OP@ b[i];
+        }
+    }
+    else if (IS_BINARY_CONT_S1(@type@, npy_bool)) {
+        npy_intp i, n = dimensions[0];
+        @type@ a = *(@type@ *)args[0]; /* scalar operand, read once */
+        @type@ * b = (@type@ *)args[1];
+        npy_bool * o = (npy_bool *)args[2];
+        for (i = 0; i < n; i++) {
+            o[i] = a @OP@ b[i];
+        }
+    }
+    else if (IS_BINARY_CONT_S2(@type@, npy_bool)) {
+        npy_intp i, n = dimensions[0];
+        @type@ * a = (@type@ *)args[0];
+        @type@ b = *(@type@*)args[1]; /* scalar operand, read once */
+        npy_bool * o = (npy_bool *)args[2];
+        for (i = 0; i < n; i++) {
+            o[i] = a[i] @OP@ b;
+        }
+    }
+    else { /* any other stride layout: the original loop */
+        BINARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            *((npy_bool *)op1) = in1 @OP@ in2;
+        }
     }
 }
 
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 5a0c0e7ee..20b5cdd67 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -11,10 +11,24 @@ __all__ = ['diag', 'diagflat', 'eye', 'fliplr', 'flipud', 'rot90', 'tri',
 
 from numpy.core.numeric import (
     asanyarray, subtract, arange, zeros, greater_equal, multiply, ones,
-    asarray, where, dtype as np_dtype, less
+    asarray, where, dtype as np_dtype, less, int8, int16, int32, int64
     )
+from numpy.core import iinfo
 
 
+i1 = iinfo(int8)  # module-level integer limits read by _min_int
+i2 = iinfo(int16)
+i4 = iinfo(int32)
+def _min_int(low, high):
+    """ smallest of int8/int16/int32 that holds low and high, else int64 """
+    if high <= i1.max and low >= i1.min:
+        return int8
+    if high <= i2.max and low >= i2.min:
+        return int16
+    if high <= i4.max and low >= i4.min:
+        return int32
+    return int64
+
 
 def fliplr(m):
     """
@@ -396,7 +410,8 @@ def tri(N, M=None, k=0, dtype=float):
     if M is None:
         M = N
 
-    m = greater_equal.outer(arange(N), arange(-k, M-k))
+    m = greater_equal.outer(arange(N, dtype=_min_int(0, N)),
+                            arange(-k, M-k, dtype=_min_int(-k, M - k)))
 
     # Avoid making a copy if the requested type is already bool
     if np_dtype(dtype) != np_dtype(bool):