Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/fromnumeric.py                        |   6
-rw-r--r--  numpy/core/src/npymath/npy_math_internal.h.src   |  94
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src            |  10
-rw-r--r--  numpy/core/tests/test_umath.py                   |  37
-rw-r--r--  numpy/linalg/linalg.py                           | 124
-rw-r--r--  numpy/linalg/tests/test_linalg.py                |  71
-rw-r--r--  numpy/linalg/umath_linalg.c.src                  | 750
7 files changed, 929 insertions(+), 163 deletions(-)
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 7164a2c28..764377bc9 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -1408,9 +1408,9 @@ def resize(a, new_shape):
See Also
--------
- np.reshape : Reshape an array without changing the total size.
- np.pad : Enlarge and pad an array.
- np.repeat : Repeat elements of an array.
+ numpy.reshape : Reshape an array without changing the total size.
+ numpy.pad : Enlarge and pad an array.
+ numpy.repeat : Repeat elements of an array.
ndarray.resize : resize an array in-place.
Notes
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index ff4663dc3..1e46a2303 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -398,8 +398,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,copysign#
- * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
+ * #kind = atan2,hypot,pow,fmod,copysign#
+ * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
*/
#ifdef @kind@@c@
#undef @kind@@c@
@@ -412,32 +412,6 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
-/**begin repeat1
- * #kind = fmod#
- * #KIND = FMOD#
- */
-#ifdef @kind@@c@
-#undef @kind@@c@
-#endif
-#ifndef HAVE_MODF@C@
-NPY_INPLACE @type@
-npy_@kind@@c@(@type@ x, @type@ y)
-{
- int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
- /* force set invalid flag, doesnt raise by default on gcc < 8 */
- if (npy_isnan(x) || npy_isnan(y)) {
- npy_set_floatstatus_invalid();
- }
- if (are_inputs_inf || !y) {
- if (!npy_isnan(x)) {
- npy_set_floatstatus_invalid();
- }
- }
- return (@type@) npy_@kind@((double)x, (double) y);
-}
-#endif
-/**end repeat1**/
-
#ifdef modf@c@
#undef modf@c@
#endif
@@ -499,8 +473,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,copysign#
- * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
+ * #kind = atan2,hypot,pow,fmod,copysign#
+ * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
*/
#ifdef HAVE_@KIND@@C@
NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
@@ -510,29 +484,6 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
-/**begin repeat1
- * #kind = fmod#
- * #KIND = FMOD#
- */
-#ifdef HAVE_FMOD@C@
-NPY_INPLACE @type@
-npy_@kind@@c@(@type@ x, @type@ y)
-{
- int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
- /* force set invalid flag, doesnt raise by default on gcc < 8 */
- if (npy_isnan(x) || npy_isnan(y)) {
- npy_set_floatstatus_invalid();
- }
- if (are_inputs_inf || !y) {
- if (!npy_isnan(x)) {
- npy_set_floatstatus_invalid();
- }
- }
- return @kind@@c@(x, y);
-}
-#endif
-/**end repeat1**/
-
#ifdef HAVE_MODF@C@
NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
{
@@ -682,8 +633,14 @@ npy_remainder@c@(@type@ a, @type@ b)
{
@type@ mod;
if (NPY_UNLIKELY(!b)) {
+ /*
+ * in2 == 0 (and not NaN): normal fmod will give the correct
+ * result (always NaN). `divmod` may set additional FPE for the
+ * division by zero creating an inf.
+ */
mod = npy_fmod@c@(a, b);
- } else {
+ }
+ else {
npy_divmod@c@(a, b, &mod);
}
return mod;
@@ -693,13 +650,14 @@ NPY_INPLACE @type@
npy_floor_divide@c@(@type@ a, @type@ b) {
@type@ div, mod;
if (NPY_UNLIKELY(!b)) {
+ /*
+ * in2 == 0 (and not NaN): normal division will give the correct
+ * result (Inf or NaN). `divmod` may set additional FPE for the modulo
+ * evaluating to NaN.
+ */
div = a / b;
- if (!a || npy_isnan(a)) {
- npy_set_floatstatus_invalid();
- } else {
- npy_set_floatstatus_divbyzero();
- }
- } else {
+ }
+ else {
div = npy_divmod@c@(a, b, &mod);
}
return div;
@@ -715,19 +673,11 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
{
@type@ div, mod, floordiv;
- /* force set invalid flag, doesnt raise by default on gcc < 8 */
- if (npy_isnan(a) || npy_isnan(b)) {
- npy_set_floatstatus_invalid();
- }
mod = npy_fmod@c@(a, b);
if (NPY_UNLIKELY(!b)) {
- div = a / b;
- if (a && !npy_isnan(a)) {
- npy_set_floatstatus_divbyzero();
- }
- /* If b == 0, return result of fmod. For IEEE is nan */
+ /* b == 0 (not NaN): return result of fmod. For IEEE is nan */
*modulus = mod;
- return div;
+ return a / b;
}
/* a - mod should be very nearly an integer multiple of b */
@@ -735,7 +685,7 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
/* adjust fmod result to conform to Python convention of remainder */
if (mod) {
- if ((b < 0) != (mod < 0)) {
+ if (isless(b, 0) != isless(mod, 0)) {
mod += b;
div -= 1.0@c@;
}
@@ -748,7 +698,7 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
/* snap quotient to nearest integral value */
if (div) {
floordiv = npy_floor@c@(div);
- if (div - floordiv > 0.5@c@)
+ if (isgreater(div - floordiv, 0.5@c@))
floordiv += 1.0@c@;
}
else {
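
For reference, a minimal Python sketch (not part of this patch) of the floating-point-error behaviour the rewritten npy_floor_divide/npy_remainder/npy_divmod helpers are meant to produce, assuming a NumPy build that includes these changes:

import numpy as np

one, zero, nan = np.float64(1.0), np.float64(0.0), np.float64(np.nan)

# 1 // 0 sets only the divide-by-zero flag, no longer 'invalid'.
with np.errstate(divide='ignore', invalid='raise'):
    np.floor_divide(one, zero)            # inf, no FloatingPointError

# Operands that already contain a NaN propagate it without raising any flag.
with np.errstate(all='raise'):
    np.floor_divide(nan, one)             # nan, no FloatingPointError

# remainder(1, 0) is computed via fmod and sets 'invalid'
# (macOS may not, see the xfail added in test_umath.py below).
with np.errstate(invalid='raise'):
    try:
        np.remainder(one, zero)
    except FloatingPointError:
        pass                              # expected on most platforms
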
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 66f97a831..5836545f8 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -283,19 +283,13 @@ static void
static void
@name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
- @type@ mod;
-
- if (!b) {
- *out = a / b;
- } else {
- *out = npy_divmod@c@(a, b, &mod);
- }
+ *out = npy_floor_divide@c@(a, b);
}
static void
@name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) {
- npy_divmod@c@(a, b, out);
+ *out = npy_remainder@c@(a, b);
}
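
A small sketch (assumption: a build containing this patch) of the consistency this delegation is after -- scalar % and // on NumPy floats now route through the same npy_remainder/npy_floor_divide helpers as the array ufuncs:

import numpy as np

a, b = np.float64(1.0), np.float64(0.0)
with np.errstate(all='ignore'):
    assert np.isnan(a % b)                  # scalar remainder by zero -> nan
    assert np.isnan(np.remainder(a, b))     # matches the array ufunc
    assert np.isinf(a // b)                 # scalar floor division by zero -> inf
    assert np.isinf(np.floor_divide(a, b))
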
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 6fd4b4659..ebd61d199 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -458,10 +458,15 @@ class TestDivision:
# divide by zero error check
with np.errstate(divide='raise', invalid='ignore'):
assert_raises(FloatingPointError, np.floor_divide, fone, fzer)
- with np.errstate(invalid='raise'):
- assert_raises(FloatingPointError, np.floor_divide, fnan, fone)
- assert_raises(FloatingPointError, np.floor_divide, fone, fnan)
- assert_raises(FloatingPointError, np.floor_divide, fnan, fzer)
+ with np.errstate(divide='ignore', invalid='raise'):
+ np.floor_divide(fone, fzer)
+
+ # The following already contain a NaN and should not warn
+ with np.errstate(all='raise'):
+ np.floor_divide(fnan, fone)
+ np.floor_divide(fone, fnan)
+ np.floor_divide(fnan, fzer)
+ np.floor_divide(fzer, fnan)
@pytest.mark.parametrize('dtype', np.typecodes['Float'])
def test_floor_division_corner_cases(self, dtype):
@@ -558,6 +563,9 @@ class TestRemainder:
else:
assert_(b > rem >= 0, msg)
+ @pytest.mark.xfail(sys.platform.startswith("darwin"),
+ reason="MacOS seems to not give the correct 'invalid' warning for "
+ "`fmod`. Hopefully, others always do.")
@pytest.mark.parametrize('dtype', np.typecodes['Float'])
def test_float_divmod_errors(self, dtype):
# Check valid errors raised for divmod and remainder
@@ -578,8 +586,12 @@ class TestRemainder:
with np.errstate(divide='ignore', invalid='raise'):
assert_raises(FloatingPointError, np.divmod, finf, fzero)
with np.errstate(divide='raise', invalid='ignore'):
- assert_raises(FloatingPointError, np.divmod, finf, fzero)
+ # inf / 0 does not set any flags, only the modulo creates a NaN
+ np.divmod(finf, fzero)
+ @pytest.mark.xfail(sys.platform.startswith("darwin"),
+ reason="MacOS seems to not give the correct 'invalid' warning for "
+ "`fmod`. Hopefully, others always do.")
@pytest.mark.parametrize('dtype', np.typecodes['Float'])
@pytest.mark.parametrize('fn', [np.fmod, np.remainder])
def test_float_remainder_errors(self, dtype, fn):
@@ -587,11 +599,16 @@ class TestRemainder:
fone = np.array(1.0, dtype=dtype)
finf = np.array(np.inf, dtype=dtype)
fnan = np.array(np.nan, dtype=dtype)
- with np.errstate(invalid='raise'):
- assert_raises(FloatingPointError, fn, fone, fzero)
- assert_raises(FloatingPointError, fn, fnan, fzero)
- assert_raises(FloatingPointError, fn, fone, fnan)
- assert_raises(FloatingPointError, fn, fnan, fone)
+
+ # The following already contain a NaN and should not warn.
+ with np.errstate(all='raise'):
+ with pytest.raises(FloatingPointError,
+ match="invalid value"):
+ fn(fone, fzero)
+ fn(fnan, fzero)
+ fn(fzero, fnan)
+ fn(fone, fnan)
+ fn(fnan, fone)
def test_float_remainder_overflow(self):
a = np.finfo(np.float64).tiny
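
The divmod corner case exercised above, spelled out as a standalone sketch (assuming this patch): inf // 0 itself raises no divide-by-zero, only the modulo term produces the NaN that trips 'invalid':

import numpy as np

finf, fzero = np.float64(np.inf), np.float64(0.0)
with np.errstate(divide='raise', invalid='ignore'):
    q, r = np.divmod(finf, fzero)    # no divide-by-zero error is raised
assert np.isinf(q) and np.isnan(r)
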
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 10f7f9ae0..2b686839a 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -99,6 +99,10 @@ def _raise_linalgerror_svd_nonconvergence(err, flag):
def _raise_linalgerror_lstsq(err, flag):
raise LinAlgError("SVD did not converge in Linear Least Squares")
+def _raise_linalgerror_qr(err, flag):
+ raise LinAlgError("Incorrect argument found while performing "
+ "QR factorization")
+
def get_linalg_error_extobj(callback):
extobj = list(_linalg_error_extobj) # make a copy
extobj[2] = callback
@@ -776,15 +780,16 @@ def qr(a, mode='reduced'):
Parameters
----------
- a : array_like, shape (M, N)
- Matrix to be factored.
+ a : array_like, shape (..., M, N)
+ An array-like object with the dimensionality of at least 2.
mode : {'reduced', 'complete', 'r', 'raw'}, optional
If K = min(M, N), then
- * 'reduced' : returns q, r with dimensions (M, K), (K, N) (default)
- * 'complete' : returns q, r with dimensions (M, M), (M, N)
- * 'r' : returns r only with dimensions (K, N)
- * 'raw' : returns h, tau with dimensions (N, M), (K,)
+ * 'reduced' : returns q, r with dimensions
+ (..., M, K), (..., K, N) (default)
+ * 'complete' : returns q, r with dimensions (..., M, M), (..., M, N)
+ * 'r' : returns r only with dimensions (..., K, N)
+ * 'raw' : returns h, tau with dimensions (..., N, M), (..., K,)
The options 'reduced', 'complete, and 'raw' are new in numpy 1.8,
see the notes for more information. The default is 'reduced', and to
@@ -803,9 +808,13 @@ def qr(a, mode='reduced'):
A matrix with orthonormal columns. When mode = 'complete' the
result is an orthogonal/unitary matrix depending on whether or not
a is real/complex. The determinant may be either +/- 1 in that
- case.
+ case. In case the number of dimensions in the input array is
+ greater than 2 then a stack of the matrices with above properties
+ is returned.
r : ndarray of float or complex, optional
- The upper-triangular matrix.
+ The upper-triangular matrix or a stack of upper-triangular
+ matrices if the number of dimensions in the input array is greater
+ than 2.
(h, tau) : ndarrays of np.double or np.cdouble, optional
The array h contains the Householder reflectors that generate q
along with r. The tau array contains scaling factors for the
@@ -853,6 +862,14 @@ def qr(a, mode='reduced'):
>>> r2 = np.linalg.qr(a, mode='r')
>>> np.allclose(r, r2) # mode='r' returns the same r as mode='full'
True
+ >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input
+ >>> q, r = np.linalg.qr(a)
+ >>> q.shape
+ (3, 2, 2)
+ >>> r.shape
+ (3, 2, 2)
+ >>> np.allclose(a, np.matmul(q, r))
+ True
Example illustrating a common use of `qr`: solving of least squares
problems
@@ -900,83 +917,58 @@ def qr(a, mode='reduced'):
raise ValueError(f"Unrecognized mode '{mode}'")
a, wrap = _makearray(a)
- _assert_2d(a)
- m, n = a.shape
+ _assert_stacked_2d(a)
+ m, n = a.shape[-2:]
t, result_t = _commonType(a)
- a = _fastCopyAndTranspose(t, a)
+ a = a.astype(t, copy=True)
a = _to_native_byte_order(a)
mn = min(m, n)
- tau = zeros((mn,), t)
- if isComplexType(t):
- lapack_routine = lapack_lite.zgeqrf
- routine_name = 'zgeqrf'
+ if m <= n:
+ gufunc = _umath_linalg.qr_r_raw_m
else:
- lapack_routine = lapack_lite.dgeqrf
- routine_name = 'dgeqrf'
-
- # calculate optimal size of work data 'work'
- lwork = 1
- work = zeros((lwork,), t)
- results = lapack_routine(m, n, a, max(1, m), tau, work, -1, 0)
- if results['info'] != 0:
- raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
- # do qr decomposition
- lwork = max(1, n, int(abs(work[0])))
- work = zeros((lwork,), t)
- results = lapack_routine(m, n, a, max(1, m), tau, work, lwork, 0)
- if results['info'] != 0:
- raise LinAlgError('%s returns %d' % (routine_name, results['info']))
+ gufunc = _umath_linalg.qr_r_raw_n
+
+ signature = 'D->D' if isComplexType(t) else 'd->d'
+ extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
+ tau = gufunc(a, signature=signature, extobj=extobj)
# handle modes that don't return q
if mode == 'r':
- r = _fastCopyAndTranspose(result_t, a[:, :mn])
- return wrap(triu(r))
+ r = triu(a[..., :mn, :])
+ r = r.astype(result_t, copy=False)
+ return wrap(r)
if mode == 'raw':
- return a, tau
+ q = transpose(a)
+ q = q.astype(result_t, copy=False)
+ tau = tau.astype(result_t, copy=False)
+ return wrap(q), tau
if mode == 'economic':
- if t != result_t :
- a = a.astype(result_t, copy=False)
- return wrap(a.T)
+ a = a.astype(result_t, copy=False)
+ return wrap(a)
- # generate q from a
+ # mc is the number of columns in the resulting q
+ # matrix. If the mode is complete then it is
+ # same as number of rows, and if the mode is reduced,
+ # then it is the minimum of number of rows and columns.
if mode == 'complete' and m > n:
mc = m
- q = empty((m, m), t)
+ gufunc = _umath_linalg.qr_complete
else:
mc = mn
- q = empty((n, m), t)
- q[:n] = a
-
- if isComplexType(t):
- lapack_routine = lapack_lite.zungqr
- routine_name = 'zungqr'
- else:
- lapack_routine = lapack_lite.dorgqr
- routine_name = 'dorgqr'
+ gufunc = _umath_linalg.qr_reduced
- # determine optimal lwork
- lwork = 1
- work = zeros((lwork,), t)
- results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, -1, 0)
- if results['info'] != 0:
- raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
- # compute q
- lwork = max(1, n, int(abs(work[0])))
- work = zeros((lwork,), t)
- results = lapack_routine(m, mc, mn, q, max(1, m), tau, work, lwork, 0)
- if results['info'] != 0:
- raise LinAlgError('%s returns %d' % (routine_name, results['info']))
-
- q = _fastCopyAndTranspose(result_t, q[:mc])
- r = _fastCopyAndTranspose(result_t, a[:, :mc])
+ signature = 'DD->D' if isComplexType(t) else 'dd->d'
+ extobj = get_linalg_error_extobj(_raise_linalgerror_qr)
+ q = gufunc(a, tau, signature=signature, extobj=extobj)
+ r = triu(a[..., :mc, :])
- return wrap(q), wrap(triu(r))
+ q = q.astype(result_t, copy=False)
+ r = r.astype(result_t, copy=False)
+ return wrap(q), wrap(r)
# Eigenvalues
@@ -2173,7 +2165,7 @@ def lstsq(a, b, rcond="warn"):
equal to, or greater than its number of linearly independent columns).
If `a` is square and of full rank, then `x` (but for round-off error)
is the "exact" solution of the equation. Else, `x` minimizes the
- Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing
+ Euclidean 2-norm :math:`||b - ax||`. If there are multiple minimizing
solutions, the one with the smallest 2-norm :math:`||x||` is returned.
Parameters
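
A short usage sketch of the stacked behaviour the rewritten qr wrapper enables (assuming a build with the new _umath_linalg.qr_* gufuncs): the wrapper picks qr_r_raw_m or qr_r_raw_n from the trailing two axes and then qr_reduced/qr_complete for the Q factor, broadcasting over any leading dimensions:

import numpy as np

a = np.random.normal(size=(4, 3, 5))     # stack of four 3 x 5 matrices
q, r = np.linalg.qr(a)                   # default mode='reduced'
assert q.shape == (4, 3, 3) and r.shape == (4, 3, 5)
assert np.allclose(q @ r, a)

r_only = np.linalg.qr(a, mode='r')       # stack of upper-triangular factors
assert np.allclose(r_only, r)
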
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index c6e8cdd03..dd059fb63 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1,6 +1,7 @@
""" Test functions for linalg module
"""
+from numpy.core.fromnumeric import shape
import os
import sys
import itertools
@@ -11,6 +12,7 @@ import pytest
import numpy as np
from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy.core import swapaxes
from numpy import multiply, atleast_2d, inf, asarray
from numpy import linalg
from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
@@ -1710,6 +1712,66 @@ class TestQR:
self.check_qr(m2)
self.check_qr(m2.T)
+ def check_qr_stacked(self, a):
+ # This test expects the argument `a` to be an ndarray or
+ # a subclass of an ndarray of inexact type.
+ a_type = type(a)
+ a_dtype = a.dtype
+ m, n = a.shape[-2:]
+ k = min(m, n)
+
+ # mode == 'complete'
+ q, r = linalg.qr(a, mode='complete')
+ assert_(q.dtype == a_dtype)
+ assert_(r.dtype == a_dtype)
+ assert_(isinstance(q, a_type))
+ assert_(isinstance(r, a_type))
+ assert_(q.shape[-2:] == (m, m))
+ assert_(r.shape[-2:] == (m, n))
+ assert_almost_equal(matmul(q, r), a)
+ I_mat = np.identity(q.shape[-1])
+ stack_I_mat = np.broadcast_to(I_mat,
+ q.shape[:-2] + (q.shape[-1],)*2)
+ assert_almost_equal(matmul(swapaxes(q, -1, -2).conj(), q), stack_I_mat)
+ assert_almost_equal(np.triu(r[..., :, :]), r)
+
+ # mode == 'reduced'
+ q1, r1 = linalg.qr(a, mode='reduced')
+ assert_(q1.dtype == a_dtype)
+ assert_(r1.dtype == a_dtype)
+ assert_(isinstance(q1, a_type))
+ assert_(isinstance(r1, a_type))
+ assert_(q1.shape[-2:] == (m, k))
+ assert_(r1.shape[-2:] == (k, n))
+ assert_almost_equal(matmul(q1, r1), a)
+ I_mat = np.identity(q1.shape[-1])
+ stack_I_mat = np.broadcast_to(I_mat,
+ q1.shape[:-2] + (q1.shape[-1],)*2)
+ assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1),
+ stack_I_mat)
+ assert_almost_equal(np.triu(r1[..., :, :]), r1)
+
+ # mode == 'r'
+ r2 = linalg.qr(a, mode='r')
+ assert_(r2.dtype == a_dtype)
+ assert_(isinstance(r2, a_type))
+ assert_almost_equal(r2, r1)
+
+ @pytest.mark.parametrize("size", [
+ (3, 4), (4, 3), (4, 4),
+ (3, 0), (0, 3)])
+ @pytest.mark.parametrize("outer_size", [
+ (2, 2), (2,), (2, 3, 4)])
+ @pytest.mark.parametrize("dt", [
+ np.single, np.double,
+ np.csingle, np.cdouble])
+ def test_stacked_inputs(self, outer_size, size, dt):
+
+ A = np.random.normal(size=outer_size + size).astype(dt)
+ B = np.random.normal(size=outer_size + size).astype(dt)
+ self.check_qr_stacked(A)
+ self.check_qr_stacked(A + 1.j*B)
+
class TestCholesky:
# TODO: are there no other tests for cholesky?
@@ -2051,10 +2113,11 @@ def test_unsupported_commontype():
linalg.cholesky(arr)
-@pytest.mark.slow
-@pytest.mark.xfail(not HAS_LAPACK64, run=False,
- reason="Numpy not compiled with 64-bit BLAS/LAPACK")
-@requires_memory(free_bytes=16e9)
+#@pytest.mark.slow
+#@pytest.mark.xfail(not HAS_LAPACK64, run=False,
+# reason="Numpy not compiled with 64-bit BLAS/LAPACK")
+#@requires_memory(free_bytes=16e9)
+@pytest.mark.skip(reason="Bad memory reports lead to OOM in ci testing")
def test_blas64_dot():
n = 2**32
a = np.zeros([1, n], dtype=np.float32)
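
One dtype detail these tests rely on, shown as a sketch (assumption: a build with this patch): the new kernels are double/complex-double only, so single-precision input is computed in double precision and cast back, preserving the input dtype:

import numpy as np

a = np.random.normal(size=(2, 4, 3)).astype(np.single)
q, r = np.linalg.qr(a)
assert q.dtype == np.single and r.dtype == np.single
assert np.allclose(q @ r, a, atol=1e-5)
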
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 1807aadcf..a486e9e5b 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -163,6 +163,24 @@ FNAME(zgelsd)(fortran_int *m, fortran_int *n, fortran_int *nrhs,
fortran_int *info);
extern fortran_int
+FNAME(dgeqrf)(fortran_int *m, fortran_int *n, double a[], fortran_int *lda,
+ double tau[], double work[],
+ fortran_int *lwork, fortran_int *info);
+extern fortran_int
+FNAME(zgeqrf)(fortran_int *m, fortran_int *n, f2c_doublecomplex a[], fortran_int *lda,
+ f2c_doublecomplex tau[], f2c_doublecomplex work[],
+ fortran_int *lwork, fortran_int *info);
+
+extern fortran_int
+FNAME(dorgqr)(fortran_int *m, fortran_int *n, fortran_int *k, double a[], fortran_int *lda,
+ double tau[], double work[],
+ fortran_int *lwork, fortran_int *info);
+extern fortran_int
+FNAME(zungqr)(fortran_int *m, fortran_int *n, fortran_int *k, f2c_doublecomplex a[],
+ fortran_int *lda, f2c_doublecomplex tau[],
+ f2c_doublecomplex work[], fortran_int *lwork, fortran_int *info);
+
+extern fortran_int
FNAME(sgesv)(fortran_int *n, fortran_int *nrhs,
float a[], fortran_int *lda,
fortran_int ipiv[],
@@ -2834,6 +2852,670 @@ static void
/**end repeat**/
+/* -------------------------------------------------------------------------- */
+ /* qr (modes - r, raw) */
+
+typedef struct geqfr_params_struct
+{
+ fortran_int M;
+ fortran_int N;
+ void *A;
+ fortran_int LDA;
+ void* TAU;
+ void *WORK;
+ fortran_int LWORK;
+} GEQRF_PARAMS_t;
+
+
+static inline void
+dump_geqrf_params(const char *name,
+ GEQRF_PARAMS_t *params)
+{
+ TRACE_TXT("\n%s:\n"\
+
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n",
+
+ name,
+
+ "A", params->A,
+ "TAU", params->TAU,
+ "WORK", params->WORK,
+
+ "M", (int)params->M,
+ "N", (int)params->N,
+ "LDA", (int)params->LDA,
+ "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+ #lapack_func=dgeqrf,zgeqrf#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GEQRF_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->M, &params->N,
+ params->A, &params->LDA,
+ params->TAU,
+ params->WORK, &params->LWORK,
+ &rv);
+ return rv;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=DOUBLE#
+ #lapack_func=dgeqrf#
+ #ftyp=fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@(GEQRF_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *tau, *work;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+ fortran_int work_count;
+ size_t work_size;
+ fortran_int lda = fortran_int_max(1, m);
+
+ mem_buff = malloc(a_size + tau_size);
+
+ if (!mem_buff)
+ goto error;
+
+ a = mem_buff;
+ tau = a + a_size;
+ memset(tau, 0, tau_size);
+
+
+ params->M = m;
+ params->N = n;
+ params->A = a;
+ params->TAU = tau;
+ params->LDA = lda;
+
+ {
+ /* compute optimal work size */
+
+ @ftyp@ work_size_query;
+
+ params->WORK = &work_size_query;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int) *(@ftyp@*) params->WORK;
+
+ }
+
+ params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
+
+ work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+
+ params->WORK = work;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=CDOUBLE#
+ #lapack_func=zgeqrf#
+ #ftyp=fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GEQRF_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *tau, *work;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+ fortran_int work_count;
+ size_t work_size;
+ fortran_int lda = fortran_int_max(1, m);
+
+ mem_buff = malloc(a_size + tau_size);
+
+ if (!mem_buff)
+ goto error;
+
+ a = mem_buff;
+ tau = a + a_size;
+ memset(tau, 0, tau_size);
+
+
+ params->M = m;
+ params->N = n;
+ params->A = a;
+ params->TAU = tau;
+ params->LDA = lda;
+
+ {
+ /* compute optimal work size */
+
+ @ftyp@ work_size_query;
+
+ params->WORK = &work_size_query;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+
+ }
+
+ params->LWORK = fortran_int_max(fortran_int_max(1, n),
+ work_count);
+
+ work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+
+ params->WORK = work;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #lapack_func=dgeqrf,zgeqrf#
+ */
+static inline void
+release_@lapack_func@(GEQRF_PARAMS_t* params)
+{
+ /* A and WORK contain allocated blocks */
+ free(params->A);
+ free(params->WORK);
+ memset(params, 0, sizeof(*params));
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=DOUBLE,CDOUBLE#
+ #REALTYPE=DOUBLE,DOUBLE#
+ #lapack_func=dgeqrf,zgeqrf#
+ #typ = npy_double,npy_cdouble#
+ #basetyp = npy_double,npy_double#
+ #ftyp = fortran_doublereal,fortran_doublecomplex#
+ #cmplx = 0, 1#
+ */
+static void
+@TYPE@_qr_r_raw(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ GEQRF_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n, m;
+
+ INIT_OUTER_LOOP_2
+
+ m = (fortran_int)dimensions[0];
+ n = (fortran_int)dimensions[1];
+
+ if (init_@lapack_func@(&params, m, n)) {
+ LINEARIZE_DATA_t a_in, tau_out;
+
+ init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+ init_linearize_data(&tau_out, 1, fortran_int_min(m, n), 1, steps[2]);
+
+ BEGIN_OUTER_LOOP_2
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[0], params.A, &a_in);
+ delinearize_@TYPE@_matrix(args[1], params.TAU, &tau_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[1], &tau_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
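
A Python-level sketch of what this kernel feeds back to linalg.qr in mode='raw' (hedged: the public behaviour also depends on the linalg.py changes above, not this loop alone): h holds the geqrf-factored matrix in LAPACK's transposed convention and tau the Householder scaling factors, with R readable off the upper triangle:

import numpy as np

a = np.random.normal(size=(5, 3))
h, tau = np.linalg.qr(a, mode='raw')
assert h.shape == (3, 5) and tau.shape == (3,)

r = np.triu(h.T[:3, :])                     # upper triangle of the factored matrix
assert np.allclose(r, np.linalg.qr(a, mode='r'))
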
+
+/* -------------------------------------------------------------------------- */
+ /* qr common code (modes - reduced and complete) */
+
+typedef struct gqr_params_struct
+{
+ fortran_int M;
+ fortran_int MC;
+ fortran_int MN;
+ void* A;
+ void *Q;
+ fortran_int LDA;
+ void* TAU;
+ void *WORK;
+ fortran_int LWORK;
+} GQR_PARAMS_t;
+
+/**begin repeat
+ #lapack_func=dorgqr,zungqr#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GQR_PARAMS_t *params)
+{
+ fortran_int rv;
+ LAPACK(@lapack_func@)(&params->M, &params->MC, &params->MN,
+ params->Q, &params->LDA,
+ params->TAU,
+ params->WORK, &params->LWORK,
+ &rv);
+ return rv;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #lapack_func=dorgqr#
+ #ftyp=fortran_doublereal#
+ */
+static inline int
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n,
+ fortran_int mc)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *q, *tau, *work;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_mc = mc;
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
+ size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+ fortran_int work_count;
+ size_t work_size;
+ fortran_int lda = fortran_int_max(1, m);
+
+ mem_buff = malloc(q_size + tau_size + a_size);
+
+ if (!mem_buff)
+ goto error;
+
+ q = mem_buff;
+ tau = q + q_size;
+ a = tau + tau_size;
+
+
+ params->M = m;
+ params->MC = mc;
+ params->MN = min_m_n;
+ params->A = a;
+ params->Q = q;
+ params->TAU = tau;
+ params->LDA = lda;
+
+ {
+ /* compute optimal work size */
+ @ftyp@ work_size_query;
+
+ params->WORK = &work_size_query;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int) *(@ftyp@*) params->WORK;
+
+ }
+
+ params->LWORK = fortran_int_max(fortran_int_max(1, n), work_count);
+
+ work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+
+ params->WORK = work;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #lapack_func=zungqr#
+ #ftyp=fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@_common(GQR_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n,
+ fortran_int mc)
+{
+ npy_uint8 *mem_buff = NULL;
+ npy_uint8 *mem_buff2 = NULL;
+ npy_uint8 *a, *q, *tau, *work;
+ fortran_int min_m_n = fortran_int_min(m, n);
+ size_t safe_mc = mc;
+ size_t safe_min_m_n = min_m_n;
+ size_t safe_m = m;
+ size_t safe_n = n;
+
+ size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+ size_t q_size = safe_m * safe_mc * sizeof(@ftyp@);
+ size_t tau_size = safe_min_m_n * sizeof(@ftyp@);
+
+ fortran_int work_count;
+ size_t work_size;
+ fortran_int lda = fortran_int_max(1, m);
+
+ mem_buff = malloc(q_size + tau_size + a_size);
+
+ if (!mem_buff)
+ goto error;
+
+ q = mem_buff;
+ tau = q + q_size;
+ a = tau + tau_size;
+
+
+ params->M = m;
+ params->MC = mc;
+ params->MN = min_m_n;
+ params->A = a;
+ params->Q = q;
+ params->TAU = tau;
+ params->LDA = lda;
+
+ {
+ /* compute optimal work size */
+ @ftyp@ work_size_query;
+
+ params->WORK = &work_size_query;
+ params->LWORK = -1;
+
+ if (call_@lapack_func@(params) != 0)
+ goto error;
+
+ work_count = (fortran_int) ((@ftyp@*)params->WORK)->r;
+
+ }
+
+ params->LWORK = fortran_int_max(fortran_int_max(1, n),
+ work_count);
+
+ work_size = (size_t) params->LWORK * sizeof(@ftyp@);
+
+ mem_buff2 = malloc(work_size);
+ if (!mem_buff2)
+ goto error;
+
+ work = mem_buff2;
+
+ params->WORK = work;
+ params->LWORK = work_count;
+
+ return 1;
+ error:
+ TRACE_TXT("%s failed init\n", __FUNCTION__);
+ free(mem_buff);
+ free(mem_buff2);
+ memset(params, 0, sizeof(*params));
+
+ return 0;
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+ /* qr (modes - reduced) */
+
+
+static inline void
+dump_gqr_params(const char *name,
+ GQR_PARAMS_t *params)
+{
+ TRACE_TXT("\n%s:\n"\
+
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18p\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n"\
+ "%14s: %18d\n",
+
+ name,
+
+ "Q", params->Q,
+ "TAU", params->TAU,
+ "WORK", params->WORK,
+
+ "M", (int)params->M,
+ "MC", (int)params->MC,
+ "MN", (int)params->MN,
+ "LDA", (int)params->LDA,
+ "LWORK", (int)params->LWORK);
+}
+
+/**begin repeat
+ #lapack_func=dorgqr,zungqr#
+ #ftyp=fortran_doublereal,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@(GQR_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n)
+{
+ return init_@lapack_func@_common(
+ params, m, n,
+ fortran_int_min(m, n));
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #lapack_func=dorgqr,zungqr#
+ */
+static inline void
+release_@lapack_func@(GQR_PARAMS_t* params)
+{
+ /* A and WORK contain allocated blocks */
+ free(params->Q);
+ free(params->WORK);
+ memset(params, 0, sizeof(*params));
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=DOUBLE,CDOUBLE#
+ #REALTYPE=DOUBLE,DOUBLE#
+ #lapack_func=dorgqr,zungqr#
+ #typ = npy_double, npy_cdouble#
+ #basetyp = npy_double, npy_double#
+ #ftyp = fortran_doublereal,fortran_doublecomplex#
+ #cmplx = 0, 1#
+ */
+static void
+@TYPE@_qr_reduced(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ GQR_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n, m;
+
+ INIT_OUTER_LOOP_3
+
+ m = (fortran_int)dimensions[0];
+ n = (fortran_int)dimensions[1];
+
+ if (init_@lapack_func@(&params, m, n)) {
+ LINEARIZE_DATA_t a_in, tau_in, q_out;
+
+ init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+ init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+ init_linearize_data(&q_out, fortran_int_min(m, n), m, steps[4], steps[3]);
+
+ BEGIN_OUTER_LOOP_3
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[2], &q_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
+/* -------------------------------------------------------------------------- */
+ /* qr (modes - complete) */
+
+/**begin repeat
+ #lapack_func=dorgqr,zungqr#
+ #ftyp=fortran_doublereal,fortran_doublecomplex#
+ */
+static inline int
+init_@lapack_func@_complete(GQR_PARAMS_t *params,
+ fortran_int m,
+ fortran_int n)
+{
+ return init_@lapack_func@_common(params, m, n, m);
+}
+
+/**end repeat**/
+
+/**begin repeat
+ #TYPE=DOUBLE,CDOUBLE#
+ #REALTYPE=DOUBLE,DOUBLE#
+ #lapack_func=dorgqr,zungqr#
+ #typ = npy_double,npy_cdouble#
+ #basetyp = npy_double,npy_double#
+ #ftyp = fortran_doublereal,fortran_doublecomplex#
+ #cmplx = 0, 1#
+ */
+static void
+@TYPE@_qr_complete(char **args, npy_intp const *dimensions, npy_intp const *steps,
+ void *NPY_UNUSED(func))
+{
+ GQR_PARAMS_t params;
+ int error_occurred = get_fp_invalid_and_clear();
+ fortran_int n, m;
+
+ INIT_OUTER_LOOP_3
+
+ m = (fortran_int)dimensions[0];
+ n = (fortran_int)dimensions[1];
+
+
+ if (init_@lapack_func@_complete(&params, m, n)) {
+ LINEARIZE_DATA_t a_in, tau_in, q_out;
+
+ init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+ init_linearize_data(&tau_in, 1, fortran_int_min(m, n), 1, steps[2]);
+ init_linearize_data(&q_out, m, m, steps[4], steps[3]);
+
+ BEGIN_OUTER_LOOP_3
+ int not_ok;
+ linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.Q, args[0], &a_in);
+ linearize_@TYPE@_matrix(params.TAU, args[1], &tau_in);
+ not_ok = call_@lapack_func@(&params);
+ if (!not_ok) {
+ delinearize_@TYPE@_matrix(args[2], params.Q, &q_out);
+ } else {
+ error_occurred = 1;
+ nan_@TYPE@_matrix(args[2], &q_out);
+ }
+ END_OUTER_LOOP
+
+ release_@lapack_func@(&params);
+ }
+
+ set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
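
To relate the reduced and complete kernels, a minimal sketch against the public API (assuming this patch): for a tall m x n input the complete mode generates all m columns of Q, the reduced mode only the first min(m, n):

import numpy as np

a = np.random.normal(size=(5, 3))
q_r, r_r = np.linalg.qr(a, mode='reduced')   # q_r: (5, 3), r_r: (3, 3)
q_c, r_c = np.linalg.qr(a, mode='complete')  # q_c: (5, 5), r_c: (5, 3)
assert q_r.shape == (5, 3) and q_c.shape == (5, 5)
assert np.allclose(q_c.T @ q_c, np.identity(5))   # full orthogonal basis
assert np.allclose(q_c @ r_c, a)
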
/* -------------------------------------------------------------------------- */
/* least squares */
@@ -3291,6 +3973,17 @@ static void *array_of_nulls[] = {
CDOUBLE_ ## NAME \
}
+/* The single precision functions are not used at all,
+ * due to input data being promoted to double precision
+ * in Python, so they are not implemented here.
+ */
+#define GUFUNC_FUNC_ARRAY_QR(NAME) \
+ static PyUFuncGenericFunction \
+ FUNC_ARRAY_NAME(NAME)[] = { \
+ DOUBLE_ ## NAME, \
+ CDOUBLE_ ## NAME \
+ }
+
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
@@ -3305,6 +3998,9 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
+GUFUNC_FUNC_ARRAY_QR(qr_r_raw);
+GUFUNC_FUNC_ARRAY_QR(qr_reduced);
+GUFUNC_FUNC_ARRAY_QR(qr_complete);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
GUFUNC_FUNC_ARRAY_EIG(eig);
GUFUNC_FUNC_ARRAY_EIG(eigvals);
@@ -3371,6 +4067,24 @@ static char svd_1_3_types[] = {
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
};
+/* A, tau */
+static char qr_r_raw_types[] = {
+ NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
+/* A, tau, q */
+static char qr_reduced_types[] = {
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
+/* A, tau, q */
+static char qr_complete_types[] = {
+ NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
+ NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
+};
+
/* A, b, rcond, x, resid, rank, s, */
static char lstsq_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT,
@@ -3571,6 +4285,42 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
eigvals_types
},
{
+ "qr_r_raw_m",
+ "(m,n)->(m)",
+ "Compute TAU vector for the last two dimensions \n"\
+ "and broadcast to the rest. For m <= n. \n",
+ 2, 1, 1,
+ FUNC_ARRAY_NAME(qr_r_raw),
+ qr_r_raw_types
+ },
+ {
+ "qr_r_raw_n",
+ "(m,n)->(n)",
+ "Compute TAU vector for the last two dimensions \n"\
+ "and broadcast to the rest. For m > n. \n",
+ 2, 1, 1,
+ FUNC_ARRAY_NAME(qr_r_raw),
+ qr_r_raw_types
+ },
+ {
+ "qr_reduced",
+ "(m,n),(k)->(m,k)",
+ "Compute Q matrix for the last two dimensions \n"\
+ "and broadcast to the rest. \n",
+ 2, 2, 1,
+ FUNC_ARRAY_NAME(qr_reduced),
+ qr_reduced_types
+ },
+ {
+ "qr_complete",
+ "(m,n),(n)->(m,m)",
+ "Compute Q matrix for the last two dimensions \n"\
+ "and broadcast to the rest. For m > n. \n",
+ 2, 2, 1,
+ FUNC_ARRAY_NAME(qr_complete),
+ qr_complete_types
+ },
+ {
"lstsq_m",
"(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
"least squares on the last two dimensions and broadcast to the rest. \n"\