diff options
author | tpoole <t.b.poole@gmail.com> | 2014-08-13 13:22:37 +0100 |
---|---|---|
committer | tpoole <t.b.poole@gmail.com> | 2015-05-13 10:00:05 +0100 |
commit | d87d2ca584b888bcc48fd2fd25c07eb0c08c0939 (patch) | |
tree | ef3f02fe2166745004a7b78676c4bdf4d05b99ce /numpy/lib | |
parent | 30e3d41b6d11c18e17eb283a61bbe9bbf4bb4d8f (diff) | |
download | numpy-d87d2ca584b888bcc48fd2fd25c07eb0c08c0939.tar.gz |
ENH: add 'fweights' and 'aweights' arguments to covariance calculations.
'fweights' allows integer frequencies to be specified for observation vectors,
and 'aweights' provides a more general importance or probabalistic weighting.
Diffstat (limited to 'numpy/lib')
-rw-r--r-- | numpy/lib/function_base.py | 135 | ||||
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 77 |
2 files changed, 176 insertions, 36 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index d22e8c047..f74a9019f 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -18,7 +18,7 @@ from numpy.core.umath import ( mod, exp, log10 ) from numpy.core.fromnumeric import ( - ravel, nonzero, sort, partition, mean + ravel, nonzero, sort, partition, mean, any, sum ) from numpy.core.numerictypes import typecodes, number from numpy.lib.twodim_base import diag @@ -1829,9 +1829,9 @@ class vectorize(object): return _res -def cov(m, y=None, rowvar=1, bias=0, ddof=None): +def cov(m, y=None, rowvar=1, bias=0, ddof=None, fweights=None, aweights=None): """ - Estimate a covariance matrix, given data. + Estimate a covariance matrix, given data and weights. Covariance indicates the level to which two variables vary together. If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`, @@ -1839,6 +1839,8 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None): :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance of :math:`x_i`. + See the notes for an outline of the algorithm. + Parameters ---------- m : array_like @@ -1846,23 +1848,35 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None): Each row of `m` represents a variable, and each column a single observation of all those variables. Also see `rowvar` below. y : array_like, optional - An additional set of variables and observations. `y` has the same - form as that of `m`. + An additional set of variables and observations. `y` has the same form + as that of `m`. rowvar : int, optional If `rowvar` is non-zero (default), then each row represents a variable, with observations in the columns. Otherwise, the relationship is transposed: each column represents a variable, while the rows contain observations. bias : int, optional - Default normalization is by ``(N - 1)``, where ``N`` is the number of - observations given (unbiased estimate). If `bias` is 1, then - normalization is by ``N``. These values can be overridden by using - the keyword ``ddof`` in numpy versions >= 1.5. + Default normalization is by ``(N - 1)``, where ``N`` corresponds to the + number of observations given (unbiased estimate). If `bias` is 1, then + normalization is by ``N``. These values can be overridden by using the + keyword ``ddof`` in numpy versions >= 1.5. ddof : int, optional .. versionadded:: 1.5 - If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is - the number of observations; this overrides the value implied by - ``bias``. The default value is ``None``. + If not ``None`` the default value implied by `bias` is overridden. + Note that ``ddof=1`` will return the unbiased estimate, even if both + `fweights` and `aweights` are specified, and ``ddof=0`` will return + the simple average. See the notes for the details. The default value + is ``None``. + fweights : array_like, int, optional + .. versionadded:: 1.10 + 1-D array of integer freguency weights; the number of times each + observation vector should be repeated. + aweights : array_like, optional + .. versionadded:: 1.10 + 1-D array of observation vector weights. These relative weights are + typically large for observations considered "important" and smaller for + observations considered less "important". If ``ddof=0`` the array of + weights can be used to assign probabilities to observation vectors. Returns ------- @@ -1873,6 +1887,22 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None): -------- corrcoef : Normalized covariance matrix + Notes + ----- + Assume that the observations are in the columns of the observation + array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The + steps to compute the weighted covariance are as follows:: + + >>> w = f * a + >>> v1 = np.sum(w) + >>> v2 = np.sum(w * a) + >>> m -= np.sum(m * w, axis=1, keepdims=True) / v1 + >>> cov = np.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2) + + Note that when ``a == 1``, the normalization factor + ``v1 / (v1**2 - ddof * v2)`` goes over to ``1 / (np.sum(f) - ddof)`` + as it should. + Examples -------- Consider two variables, :math:`x_0` and :math:`x_1`, which @@ -1921,36 +1951,79 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None): y = np.asarray(y) dtype = np.result_type(m, y, np.float64) X = array(m, ndmin=2, dtype=dtype) + if rowvar == 0 and X.shape[0] != 1: + X = X.T + if X.shape[0] == 0: + return np.array([]).reshape(0, 0) + if y is not None: + y = array(y, copy=False, ndmin=2, dtype=dtype) + if rowvar == 0 and y.shape[0] != 1: + y = y.T + X = np.vstack((X, y)) - if X.shape[0] == 1: - rowvar = 1 - if rowvar: - N = X.shape[1] - axis = 0 - else: - N = X.shape[0] - axis = 1 - - # check ddof if ddof is None: if bias == 0: ddof = 1 else: ddof = 0 - fact = float(N - ddof) + + # Get the product of frequencies and weights + w = None + if fweights is not None: + fweights = np.asarray(fweights, dtype=np.float) + if not np.all(fweights == np.around(fweights)): + raise TypeError( + "fweights must be integer") + if fweights.ndim > 1: + raise RuntimeError( + "cannot handle multidimensional fweights") + if fweights.shape[0] != X.shape[1]: + raise RuntimeError( + "incompatible numbers of samples and fweights") + if any(fweights < 0): + raise ValueError( + "fweights cannot be negative") + w = fweights + if aweights is not None: + aweights = np.asarray(aweights, dtype=np.float) + if aweights.ndim > 1: + raise RuntimeError( + "cannot handle multidimensional aweights") + if aweights.shape[0] != X.shape[1]: + raise RuntimeError( + "incompatible numbers of samples and aweights") + if any(aweights < 0): + raise ValueError( + "aweights cannot be negative") + if w is None: + w = aweights + else: + w *= aweights + + avg, w_sum = average(X, axis=1, weights=w, returned=True) + w_sum = w_sum[0] + + # Determine the normalization + if w is None: + fact = float(X.shape[1] - ddof) + else: + if ddof == 0: + fact = w_sum + elif aweights is None: + fact = w_sum - ddof + else: + fact = w_sum - ddof*sum(w*aweights)/w_sum + if fact <= 0: warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning) fact = 0.0 - if y is not None: - y = array(y, copy=False, ndmin=2, dtype=dtype) - X = concatenate((X, y), axis) - - X -= X.mean(axis=1-axis, keepdims=True) - if not rowvar: - return (dot(X.T, X.conj()) / fact).squeeze() + X -= avg[:, None] + if w is None: + X_T = X.T else: - return (dot(X, X.T.conj()) / fact).squeeze() + X_T = (X*w).T + return (dot(X, X_T.conj())/fact).squeeze() def corrcoef(x, y=None, rowvar=1, bias=np._NoValue, ddof=np._NoValue): diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 12f9d414b..ad71fd3fa 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1391,9 +1391,20 @@ class TestCorrCoef(TestCase): class TestCov(TestCase): + x1 = np.array([[0, 2], [1, 1], [2, 0]]).T + res1 = np.array([[1., -1.], [-1., 1.]]) + x2 = np.array([0.0, 1.0, 2.0], ndmin=2) + frequencies = np.array([1, 4, 1]) + x2_repeats = np.array([[0.0], [1.0], [1.0], [1.0], [1.0], [2.0]]).T + res2 = np.array([[0.4, -0.4], [-0.4, 0.4]]) + unit_frequencies = np.ones(3, dtype=np.integer) + weights = np.array([1.0, 4.0, 1.0]) + res3 = np.array([[2./3., -2./3.], [-2./3., 2./3.]]) + unit_weights = np.ones(3) + x3 = np.array([0.3942, 0.5969, 0.7730, 0.9918, 0.7964]) + def test_basic(self): - x = np.array([[0, 2], [1, 1], [2, 0]]).T - assert_allclose(cov(x), np.array([[1., -1.], [-1., 1.]])) + assert_allclose(cov(self.x1), self.res1) def test_complex(self): x = np.array([[1, 2, 3], [1j, 2j, 3j]]) @@ -1414,11 +1425,67 @@ class TestCov(TestCase): np.array([[np.nan, np.nan], [np.nan, np.nan]])) def test_wrong_ddof(self): - x = np.array([[0, 2], [1, 1], [2, 0]]).T with warnings.catch_warnings(record=True): warnings.simplefilter('always', RuntimeWarning) - assert_array_equal(cov(x, ddof=5), - np.array([[np.inf, -np.inf], [-np.inf, np.inf]])) + assert_array_equal(cov(self.x1, ddof=5), + np.array([[np.inf, -np.inf], + [-np.inf, np.inf]])) + + def test_1D_rowvar(self): + assert_allclose(cov(self.x3), cov(self.x3, rowvar=0)) + y = np.array([0.0780, 0.3107, 0.2111, 0.0334, 0.8501]) + assert_allclose(cov(self.x3, y), cov(self.x3, y, rowvar=0)) + + def test_1D_variance(self): + assert_allclose(cov(self.x3, ddof=1), np.var(self.x3, ddof=1)) + + def test_fweights(self): + assert_allclose(cov(self.x2, fweights=self.frequencies), + cov(self.x2_repeats)) + assert_allclose(cov(self.x1, fweights=self.frequencies), + self.res2) + assert_allclose(cov(self.x1, fweights=self.unit_frequencies), + self.res1) + nonint = self.frequencies + 0.5 + assert_raises(TypeError, cov, self.x1, fweights=nonint) + f = np.ones((2, 3), dtype=np.integer) + assert_raises(RuntimeError, cov, self.x1, fweights=f) + f = np.ones(2, dtype=np.integer) + assert_raises(RuntimeError, cov, self.x1, fweights=f) + f = -1*np.ones(3, dtype=np.integer) + assert_raises(ValueError, cov, self.x1, fweights=f) + + def test_aweights(self): + assert_allclose(cov(self.x1, aweights=self.weights), self.res3) + assert_allclose(cov(self.x1, aweights=3.0*self.weights), + cov(self.x1, aweights=self.weights)) + assert_allclose(cov(self.x1, aweights=self.unit_weights), self.res1) + w = np.ones((2, 3)) + assert_raises(RuntimeError, cov, self.x1, aweights=w) + w = np.ones(2) + assert_raises(RuntimeError, cov, self.x1, aweights=w) + w = -1.0*np.ones(3) + assert_raises(ValueError, cov, self.x1, aweights=w) + + def test_unit_fweights_and_aweights(self): + assert_allclose(cov(self.x2, fweights=self.frequencies, + aweights=self.unit_weights), + cov(self.x2_repeats)) + assert_allclose(cov(self.x1, fweights=self.frequencies, + aweights=self.unit_weights), + self.res2) + assert_allclose(cov(self.x1, fweights=self.unit_frequencies, + aweights=self.unit_weights), + self.res1) + assert_allclose(cov(self.x1, fweights=self.unit_frequencies, + aweights=self.weights), + self.res3) + assert_allclose(cov(self.x1, fweights=self.unit_frequencies, + aweights=3.0*self.weights), + cov(self.x1, aweights=self.weights)) + assert_allclose(cov(self.x1, fweights=self.unit_frequencies, + aweights=self.unit_weights), + self.res1) class Test_I0(TestCase): |