-rw-r--r--  numpy/linalg/linalg.py             63
-rw-r--r--  numpy/linalg/tests/test_linalg.py  17
2 files changed, 60 insertions, 20 deletions
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index f25452064..a7f1c074d 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1420,8 +1420,8 @@ def matrix_rank(M, tol=None):
"""
Return matrix rank of array using SVD method
- Rank of the array is the number of SVD singular values of the
- array that are greater than `tol`.
+ Rank of the array is the number of singular values of the array that are
+ greater than `tol`.
Parameters
----------
@@ -1431,27 +1431,53 @@ def matrix_rank(M, tol=None):
threshold below which SVD values are considered zero. If `tol` is
None, and ``S`` is an array with singular values for `M`, and
``eps`` is the epsilon value for datatype of ``S``, then `tol` is
- set to ``S.max() * eps``.
+ set to ``S.max() * max(M.shape) * eps``.
Notes
-----
- Golub and van Loan [1]_ define "numerical rank deficiency" as using
- tol=eps*S[0] (where S[0] is the maximum singular value and thus the
- 2-norm of the matrix). This is one definition of rank deficiency,
- and the one we use here. When floating point roundoff is the main
- concern, then "numerical rank deficiency" is a reasonable choice. In
- some cases you may prefer other definitions. The most useful measure
- of the tolerance depends on the operations you intend to use on your
- matrix. For example, if your data come from uncertain measurements
- with uncertainties greater than floating point epsilon, choosing a
- tolerance near that uncertainty may be preferable. The tolerance
- may be absolute if the uncertainties are absolute rather than
- relative.
+ The default threshold to detect rank deficiency is a test on the magnitude
+ of the singular values of `M`. By default, we identify singular values less
+ than ``S.max() * max(M.shape) * eps`` as indicating rank deficiency (with
+ the symbols defined above). This is the algorithm MATLAB uses [1]_. It also
+ appears in *Numerical Recipes* in the discussion of SVD solutions for linear
+ least squares [2]_.
+
+ This default threshold is designed to detect rank deficiency accounting for
+ the numerical errors of the SVD computation. Imagine that there is a column
+ in `M` that is an exact (in floating point) linear combination of other
+ columns in `M`. Computing the SVD on `M` will not produce a singular value
+ exactly equal to 0 in general: any difference of the smallest singular
+ value from 0 will be caused by numerical imprecision in the calculation
+ of the SVD. Our threshold for small singular values takes this numerical
+ imprecision into account, and the default threshold will detect such
+ numerical rank deficiency. The threshold may also declare `M` rank
+ deficient when a linear combination of some columns of `M` is not
+ exactly equal to another column of `M`, but only numerically very
+ close to it.
+
+ We chose our default threshold because it is in wide use. Other thresholds
+ are possible. For example, elsewhere in the 2007 edition of *Numerical
+ Recipes* there is an alternative threshold of ``S.max() *
+ np.finfo(M.dtype).eps / 2. * np.sqrt(m + n + 1.)``. The authors describe
+ this threshold as being based on "expected roundoff error" (p 71).
+
+ The thresholds above deal with floating point roundoff error in the
+ calculation of the SVD. However, you may have more information about the
+ sources of error in `M` that would make you consider other tolerance values
+ to detect *effective* rank deficiency. The most useful measure of the
+ tolerance depends on the operations you intend to use on your matrix. For
+ example, if your data come from uncertain measurements with uncertainties
+ greater than floating point epsilon, choosing a tolerance near that
+ uncertainty may be preferable. The tolerance may be absolute if the
+ uncertainties are absolute rather than relative.
References
----------
- .. [1] G. H. Golub and C. F. Van Loan, *Matrix Computations*.
- Baltimore: Johns Hopkins University Press, 1996.
+ .. [1] MATLAB reference documentation, "Rank"
+ http://www.mathworks.com/help/techdoc/ref/rank.html
+ .. [2] W. H. Press, S. A. Teukolsky, W. T. Vetterling and B. P. Flannery,
+ "Numerical Recipes (3rd edition)", Cambridge University Press, 2007,
+ page 795.
Examples
--------
@@ -1465,7 +1491,6 @@ def matrix_rank(M, tol=None):
1
>>> matrix_rank(np.zeros((4,)))
0
-
"""
M = asarray(M)
if M.ndim > 2:
@@ -1474,7 +1499,7 @@ def matrix_rank(M, tol=None):
return int(not all(M==0))
S = svd(M, compute_uv=False)
if tol is None:
- tol = S.max() * finfo(S.dtype).eps
+ tol = S.max() * max(M.shape) * finfo(S.dtype).eps
return sum(S > tol)
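
A minimal sketch of the behaviour this hunk implements (not part of the patch; it assumes a NumPy build with this change applied, and the seed and shape below are arbitrary illustrations): with the default ``tol=None``, `matrix_rank` should agree with counting, by hand, the singular values above ``S.max() * max(M.shape) * eps``.

    # Sketch: reproduce the patched default tolerance manually.
    import numpy as np
    from numpy.linalg import matrix_rank, svd

    rng = np.random.RandomState(0)      # arbitrary seed, for illustration only
    X = rng.normal(size=(40, 10))
    X[:, 0] = X[:, 1] + X[:, 2]         # column 0 is an exact linear combination

    S = svd(X, compute_uv=False)
    tol = S.max() * max(X.shape) * np.finfo(S.dtype).eps
    print(np.sum(S > tol))              # expect 9: one short of full column rank
    print(matrix_rank(X))               # should match the manual count above
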
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 2a2ec3da5..623939863 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -3,7 +3,9 @@
import sys
import numpy as np
-from numpy.testing import *
+from numpy.testing import (TestCase, assert_, assert_equal, assert_raises,
+ assert_array_equal, assert_almost_equal,
+ run_module_suite)
from numpy import array, single, double, csingle, cdouble, dot, identity
from numpy import multiply, atleast_2d, inf, asarray, matrix
from numpy import linalg
@@ -451,6 +453,19 @@ class TestMatrixRank(object):
yield assert_equal, matrix_rank(1), 1
+def test_reduced_rank():
+ # Test matrices with reduced rank
+ rng = np.random.RandomState(20120714)
+ for i in range(100):
+ # Make a rank deficient matrix
+ X = rng.normal(size=(40, 10))
+ X[:, 0] = X[:, 1] + X[:, 2]
+ # Assert that matrix_rank detected deficiency
+ assert_equal(matrix_rank(X), 9)
+ X[:, 3] = X[:, 4] + X[:, 5]
+ assert_equal(matrix_rank(X), 8)
+
+
class TestQR(TestCase):
def test_qr_empty(self):
a = np.zeros((0,2))
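
The docstring's note on *effective* rank deficiency can be exercised through the existing ``tol`` argument. A minimal sketch (again not part of the patch; the noise level and tolerance are illustrative assumptions): a column that equals a combination of other columns only up to measurement-scale noise is full rank under the default threshold, but is reported as deficient once ``tol`` reflects that uncertainty.

    # Sketch: an uncertainty-based tolerance versus the default threshold.
    import numpy as np
    from numpy.linalg import matrix_rank

    rng = np.random.RandomState(0)              # arbitrary seed, illustration only
    X = rng.normal(size=(40, 10))
    noise = 1e-6 * rng.normal(size=40)          # pretend measurement uncertainty
    X[:, 0] = X[:, 1] + X[:, 2] + noise         # only approximately dependent

    print(matrix_rank(X))                       # expect 10: default tol sees full rank
    print(matrix_rank(X, tol=1e-4))             # expect 9: tol well above the noise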