| field | value | date |
|---|---|---|
| author | tpoole <t.b.poole@gmail.com> | 2014-08-13 13:22:37 +0100 |
| committer | tpoole <t.b.poole@gmail.com> | 2015-05-13 10:00:05 +0100 |
| commit | d87d2ca584b888bcc48fd2fd25c07eb0c08c0939 (patch) | |
| tree | ef3f02fe2166745004a7b78676c4bdf4d05b99ce /numpy/lib/function_base.py | |
| parent | 30e3d41b6d11c18e17eb283a61bbe9bbf4bb4d8f (diff) | |
| download | numpy-d87d2ca584b888bcc48fd2fd25c07eb0c08c0939.tar.gz | |
ENH: add 'fweights' and 'aweights' arguments to covariance calculations.
'fweights' allows integer frequencies to be specified for observation vectors,
and 'aweights' provides a more general importance or probabilistic weighting.
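As a quick illustration of the two new keyword arguments, here is a minimal sketch (not part of the patch; it assumes a NumPy build that includes this change, i.e. >= 1.10, and uses made-up sample data):

```python
import numpy as np

# Two variables, five observations; rows are variables (rowvar=1, the default).
x = np.array([[0.0, 1.1, 1.9, 3.2, 4.0],
              [5.1, 3.9, 3.1, 2.0, 0.8]])

f = np.array([1, 2, 2, 1, 1])            # fweights: integer repeat counts, one per observation
a = np.array([1.0, 0.5, 0.5, 1.0, 1.0])  # aweights: relative importance, one per observation

print(np.cov(x))                          # unweighted estimate
print(np.cov(x, fweights=f))              # frequency-weighted
print(np.cov(x, fweights=f, aweights=a))  # both weightings combined
```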
Diffstat (limited to 'numpy/lib/function_base.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | numpy/lib/function_base.py | 135 |

1 file changed, 104 insertions, 31 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index d22e8c047..f74a9019f 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -18,7 +18,7 @@ from numpy.core.umath import (
     mod, exp, log10
     )
 from numpy.core.fromnumeric import (
-    ravel, nonzero, sort, partition, mean
+    ravel, nonzero, sort, partition, mean, any, sum
     )
 from numpy.core.numerictypes import typecodes, number
 from numpy.lib.twodim_base import diag
@@ -1829,9 +1829,9 @@ class vectorize(object):
         return _res
 
 
-def cov(m, y=None, rowvar=1, bias=0, ddof=None):
+def cov(m, y=None, rowvar=1, bias=0, ddof=None, fweights=None, aweights=None):
     """
-    Estimate a covariance matrix, given data.
+    Estimate a covariance matrix, given data and weights.
 
     Covariance indicates the level to which two variables vary together.
     If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
@@ -1839,6 +1839,8 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
     :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
     of :math:`x_i`.
 
+    See the notes for an outline of the algorithm.
+
     Parameters
     ----------
     m : array_like
@@ -1846,23 +1848,35 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
         Each row of `m` represents a variable, and each column a single
         observation of all those variables. Also see `rowvar` below.
     y : array_like, optional
-        An additional set of variables and observations. `y` has the same
-        form as that of `m`.
+        An additional set of variables and observations. `y` has the same form
+        as that of `m`.
     rowvar : int, optional
         If `rowvar` is non-zero (default), then each row represents a
         variable, with observations in the columns. Otherwise, the relationship
         is transposed: each column represents a variable, while the rows
         contain observations.
     bias : int, optional
-        Default normalization is by ``(N - 1)``, where ``N`` is the number of
-        observations given (unbiased estimate). If `bias` is 1, then
-        normalization is by ``N``. These values can be overridden by using
-        the keyword ``ddof`` in numpy versions >= 1.5.
+        Default normalization is by ``(N - 1)``, where ``N`` corresponds to the
+        number of observations given (unbiased estimate). If `bias` is 1, then
+        normalization is by ``N``. These values can be overridden by using the
+        keyword ``ddof`` in numpy versions >= 1.5.
     ddof : int, optional
         .. versionadded:: 1.5
-        If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is
-        the number of observations; this overrides the value implied by
-        ``bias``. The default value is ``None``.
+        If not ``None`` the default value implied by `bias` is overridden.
+        Note that ``ddof=1`` will return the unbiased estimate, even if both
+        `fweights` and `aweights` are specified, and ``ddof=0`` will return
+        the simple average. See the notes for the details. The default value
+        is ``None``.
+    fweights : array_like, int, optional
+        .. versionadded:: 1.10
+        1-D array of integer frequency weights; the number of times each
+        observation vector should be repeated.
+    aweights : array_like, optional
+        .. versionadded:: 1.10
+        1-D array of observation vector weights. These relative weights are
+        typically large for observations considered "important" and smaller for
+        observations considered less "important". If ``ddof=0`` the array of
+        weights can be used to assign probabilities to observation vectors.
 
     Returns
     -------
@@ -1873,6 +1887,22 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
     --------
     corrcoef : Normalized covariance matrix
 
+    Notes
+    -----
+    Assume that the observations are in the columns of the observation
+    array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The
+    steps to compute the weighted covariance are as follows::
+
+        >>> w = f * a
+        >>> v1 = np.sum(w)
+        >>> v2 = np.sum(w * a)
+        >>> m -= np.sum(m * w, axis=1, keepdims=True) / v1
+        >>> cov = np.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2)
+
+    Note that when ``a == 1``, the normalization factor
+    ``v1 / (v1**2 - ddof * v2)`` goes over to ``1 / (np.sum(f) - ddof)``
+    as it should.
+
     Examples
     --------
     Consider two variables, :math:`x_0` and :math:`x_1`, which
@@ -1921,36 +1951,79 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
         y = np.asarray(y)
         dtype = np.result_type(m, y, np.float64)
     X = array(m, ndmin=2, dtype=dtype)
+    if rowvar == 0 and X.shape[0] != 1:
+        X = X.T
+    if X.shape[0] == 0:
+        return np.array([]).reshape(0, 0)
+    if y is not None:
+        y = array(y, copy=False, ndmin=2, dtype=dtype)
+        if rowvar == 0 and y.shape[0] != 1:
+            y = y.T
+        X = np.vstack((X, y))
 
-    if X.shape[0] == 1:
-        rowvar = 1
-    if rowvar:
-        N = X.shape[1]
-        axis = 0
-    else:
-        N = X.shape[0]
-        axis = 1
-
-    # check ddof
     if ddof is None:
         if bias == 0:
             ddof = 1
         else:
             ddof = 0
-    fact = float(N - ddof)
+
+    # Get the product of frequencies and weights
+    w = None
+    if fweights is not None:
+        fweights = np.asarray(fweights, dtype=np.float)
+        if not np.all(fweights == np.around(fweights)):
+            raise TypeError(
+                "fweights must be integer")
+        if fweights.ndim > 1:
+            raise RuntimeError(
+                "cannot handle multidimensional fweights")
+        if fweights.shape[0] != X.shape[1]:
+            raise RuntimeError(
+                "incompatible numbers of samples and fweights")
+        if any(fweights < 0):
+            raise ValueError(
+                "fweights cannot be negative")
+        w = fweights
+    if aweights is not None:
+        aweights = np.asarray(aweights, dtype=np.float)
+        if aweights.ndim > 1:
+            raise RuntimeError(
+                "cannot handle multidimensional aweights")
+        if aweights.shape[0] != X.shape[1]:
+            raise RuntimeError(
+                "incompatible numbers of samples and aweights")
+        if any(aweights < 0):
+            raise ValueError(
+                "aweights cannot be negative")
+        if w is None:
+            w = aweights
+        else:
+            w *= aweights
+
+    avg, w_sum = average(X, axis=1, weights=w, returned=True)
+    w_sum = w_sum[0]
+
+    # Determine the normalization
+    if w is None:
+        fact = float(X.shape[1] - ddof)
+    else:
+        if ddof == 0:
+            fact = w_sum
+        elif aweights is None:
+            fact = w_sum - ddof
+        else:
+            fact = w_sum - ddof*sum(w*aweights)/w_sum
+
     if fact <= 0:
         warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
         fact = 0.0
 
-    if y is not None:
-        y = array(y, copy=False, ndmin=2, dtype=dtype)
-        X = concatenate((X, y), axis)
-
-    X -= X.mean(axis=1-axis, keepdims=True)
-    if not rowvar:
-        return (dot(X.T, X.conj()) / fact).squeeze()
+    X -= avg[:, None]
+    if w is None:
+        X_T = X.T
     else:
-        return (dot(X, X.T.conj()) / fact).squeeze()
+        X_T = (X*w).T
+    return (dot(X, X_T.conj())/fact).squeeze()
 
 
 def corrcoef(x, y=None, rowvar=1, bias=np._NoValue, ddof=np._NoValue):
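The Notes block added above doubles as a specification of the estimator. The sketch below (made-up data, `ddof=1` passed explicitly, again assuming a NumPy with this change, i.e. >= 1.10) follows those steps by hand and checks the result against `np.cov`:

```python
import numpy as np

data = np.array([[2.0, 1.0, 3.0, 7.0],
                 [4.0, 6.0, 2.0, 1.0]])   # rows are variables, columns are observations
f = np.array([1.0, 2.0, 1.0, 1.0])        # frequency weights (integer-valued)
a = np.array([0.5, 1.0, 1.0, 2.0])        # importance (aweights)
ddof = 1                                  # unbiased normalization

# Reproduce the steps from the docstring Notes.
m = data.copy()
w = f * a
v1 = np.sum(w)
v2 = np.sum(w * a)
m -= np.sum(m * w, axis=1, keepdims=True) / v1
manual = np.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2)

library = np.cov(data, fweights=f, aweights=a, ddof=ddof)
print(np.allclose(manual, library))       # expected: True
```

With `aweights` left at its default (equivalent to ``a == 1``), the normalization factor reduces to ``1 / (np.sum(f) - ddof)``, as the docstring note states.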