2 files changed, 102 insertions, 29 deletions
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 4905690ad..5ee230f92 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -2036,17 +2036,9 @@ def lstsq(a, b, rcond="warn"):
     else:
         gufunc = _umath_linalg.lstsq_n
 
-    signature = 'DDd->Did' if isComplexType(t) else 'ddd->did'
+    signature = 'DDd->Ddid' if isComplexType(t) else 'ddd->ddid'
     extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq)
-    b_out, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)
-
-    # b_out contains both the solution and the components of the residuals
-    x = b_out[...,:n,:]
-    r_parts = b_out[...,n:,:]
-    if isComplexType(t):
-        resids = sum(abs(r_parts)**2, axis=-2)
-    else:
-        resids = sum(r_parts**2, axis=-2)
+    x, resids, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)
 
     # remove the axis we added
     if is_1d:
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index c43a4a48e..03fdd387a 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -765,6 +765,10 @@ fortran_int_max(fortran_int x, fortran_int y) {
     INIT_OUTER_LOOP_5\
     npy_intp s5 = *steps++;
 
+#define INIT_OUTER_LOOP_7  \
+    INIT_OUTER_LOOP_6\
+    npy_intp s6 = *steps++;
+
 #define BEGIN_OUTER_LOOP_2 \
     for (N_ = 0;\
          N_ < dN;\
@@ -805,6 +809,17 @@ fortran_int_max(fortran_int x, fortran_int y) {
              args[4] += s4,\
              args[5] += s5) {
 
+#define BEGIN_OUTER_LOOP_7 \
+    for (N_ = 0;\
+         N_ < dN;\
+         N_++, args[0] += s0,\
+             args[1] += s1,\
+             args[2] += s2,\
+             args[3] += s3,\
+             args[4] += s4,\
+             args[5] += s5,\
+             args[6] += s6) {
+
 #define END_OUTER_LOOP  }
 
 static NPY_INLINE void
@@ -836,6 +851,7 @@ update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
     #typ = float, double, COMPLEX_t, DOUBLECOMPLEX_t#
     #copy = scopy, dcopy, ccopy, zcopy#
     #nan = s_nan, d_nan, c_nan, z_nan#
+    #zero = s_zero, d_zero, c_zero, z_zero#
  */
 static NPY_INLINE void *
 linearize_@TYPE@_matrix(void *dst_in,
@@ -949,6 +965,23 @@ nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
     }
 }
 
+static NPY_INLINE void
+zero_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
+{
+    @typ@ *dst = (@typ@ *) dst_in;
+
+    int i, j;
+    for (i = 0; i < data->rows; i++) {
+        @typ@ *cp = dst;
+        ptrdiff_t cs = data->column_strides/sizeof(@typ@);
+        for (j = 0; j < data->columns; ++j) {
+            *cp = @zero@;
+            cp += cs;
+        }
+        dst += data->row_strides/sizeof(@typ@);
+    }
+}
+
 /**end repeat**/
 
                /* identity square matrix generation */
@@ -3196,6 +3229,12 @@ init_@lapack_func@(GELSD_PARAMS_t *params,
    #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
    #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
    #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+   #dot_func=sdot,ddot,cdotc,zdotc#
+   #typ     = npy_float, npy_double, npy_cfloat, npy_cdouble#
+   #basetyp = npy_float, npy_double, npy_float,  npy_double#
+   #ftyp = fortran_real, fortran_doublereal,
+           fortran_complex, fortran_doublecomplex#
+   #cmplx = 0, 0, 1, 1#
  */
 static inline void
 release_@lapack_func@(GELSD_PARAMS_t* params)
@@ -3206,6 +3245,22 @@ release_@lapack_func@(GELSD_PARAMS_t* params)
     memset(params, 0, sizeof(*params));
 }
 
+/** Compute the squared l2 norm of a contiguous vector */
+static @basetyp@
+@TYPE@_abs2(@typ@ *p, npy_intp n) {
+    npy_intp i;
+    @basetyp@ res = 0;
+    for (i = 0; i < n; i++) {
+        @typ@ el = p[i];
+#if @cmplx@
+        res += el.real*el.real + el.imag*el.imag;
+#else
+        res += el*el;
+#endif
+    }
+    return res;
+}
+
 static void
 @TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps,
              void *NPY_UNUSED(func))
@@ -3213,21 +3268,25 @@ static void
     GELSD_PARAMS_t params;
     int error_occurred = get_fp_invalid_and_clear();
     fortran_int n, m, nrhs;
-    INIT_OUTER_LOOP_6
+    fortran_int excess;
+
+    INIT_OUTER_LOOP_7
 
     m = (fortran_int)dimensions[0];
     n = (fortran_int)dimensions[1];
     nrhs = (fortran_int)dimensions[2];
+    excess = m - n;
 
     if (init_@lapack_func@(&params, m, n, nrhs)) {
-        LINEARIZE_DATA_t a_in, b_in, x_out, s_out;
+        LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
 
         init_linearize_data(&a_in, n, m, steps[1], steps[0]);
         init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
-        init_linearize_data(&x_out, nrhs, fortran_int_max(n, m), steps[5], steps[4]);
-        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[6]);
+        init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
+        init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
+        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
 
-        BEGIN_OUTER_LOOP_6
+        BEGIN_OUTER_LOOP_7
             int not_ok;
             linearize_@TYPE@_matrix(params.A, args[0], &a_in);
             linearize_@TYPE@_matrix(params.B, args[1], &b_in);
@@ -3235,13 +3294,35 @@ static void
             not_ok = call_@lapack_func@(&params);
             if (!not_ok) {
                 delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
-                *(npy_int*) args[4] = params.RANK;
-                delinearize_@REALTYPE@_matrix(args[5], params.S, &s_out);
+                *(npy_int*) args[5] = params.RANK;
+                delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
+
+                /* Note that linalg.lstsq discards this when excess == 0 */
+                if (excess >= 0 && params.RANK == n) {
+                    /* Compute the residuals as the square sum of each column */
+                    int i;
+                    char *resid = args[4];
+                    @ftyp@ *components = (@ftyp@ *)params.B + n;
+                    for (i = 0; i < nrhs; i++) {
+                        @ftyp@ *vector = components + i*m;
+                        /* Numpy and fortran floating types are the same size,
+                         * so this case is safe */
+                        @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
+                        memcpy(
+                            resid + i*r_out.column_strides,
+                            &abs2, sizeof(abs2));
+                    }
+                }
+                else {
+                    /* Note that this is always discarded by linalg.lstsq */
+                    nan_@REALTYPE@_matrix(args[4], &r_out);
+                }
             } else {
                 error_occurred = 1;
                 nan_@TYPE@_matrix(args[3], &x_out);
-                *(npy_int*) args[4] = -1;
-                nan_@REALTYPE@_matrix(args[5], &s_out);
+                nan_@REALTYPE@_matrix(args[4], &r_out);
+                *(npy_int*) args[5] = -1;
+                nan_@REALTYPE@_matrix(args[6], &s_out);
             }
         END_OUTER_LOOP
 
@@ -3389,12 +3470,12 @@ static char svd_1_3_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
 };
 
-/*  A,           b,           rcond,      x,           rank,    s        */
+/*  A,           b,           rcond,      x,           resid,      rank,    s,        */
 static char lstsq_types[] = {
-    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_INT, NPY_FLOAT,
-    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE, NPY_DOUBLE,  NPY_INT, NPY_DOUBLE,
-    NPY_CFLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_CFLOAT,  NPY_INT, NPY_FLOAT,
-    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_INT, NPY_DOUBLE
+    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_FLOAT,  NPY_INT, NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE, NPY_DOUBLE,  NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_INT, NPY_FLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE,
 };
 
 typedef struct gufunc_descriptor_struct {
@@ -3590,19 +3671,19 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
     },
     {
         "lstsq_m",
-        "(m,n),(m,nrhs),()->(n,nrhs),(),(m)",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(m)",
         "least squares on the last two dimensions and broadcast to the rest. \n"\
         "For m <= n. \n",
-        4, 3, 3,
+        4, 3, 4,
         FUNC_ARRAY_NAME(lstsq),
         lstsq_types
     },
     {
         "lstsq_n",
-        "(m,n),(m,nrhs),()->(m,nrhs),(),(n)",
+        "(m,n),(m,nrhs),()->(n,nrhs),(nrhs),(),(n)",
         "least squares on the last two dimensions and broadcast to the rest. \n"\
-        "For m >= n. \n",
-        4, 3, 3,
+        "For m >= n, meaning that residuals are produced. \n",
+        4, 3, 4,
         FUNC_ARRAY_NAME(lstsq),
         lstsq_types
     }