ENH: add 'fweights' and 'aweights' arguments to covariance calculations.

'fweights' allows integer frequencies to be specified for observation vectors, and 'aweights' provides a more general importance or probabalistic weighting.
author: tpoole <t.b.poole@gmail.com> 2014-08-13 13:22:37 +0100
committer: tpoole <t.b.poole@gmail.com> 2015-05-13 10:00:05 +0100
commit: d87d2ca584b888bcc48fd2fd25c07eb0c08c0939 (patch)
tree: ef3f02fe2166745004a7b78676c4bdf4d05b99ce /numpy/lib
parent: 30e3d41b6d11c18e17eb283a61bbe9bbf4bb4d8f (diff)
download: numpy-d87d2ca584b888bcc48fd2fd25c07eb0c08c0939.tar.gz
2 files changed, 176 insertions, 36 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index d22e8c047..f74a9019f 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -18,7 +18,7 @@ from numpy.core.umath import (
     mod, exp, log10
     )
 from numpy.core.fromnumeric import (
-    ravel, nonzero, sort, partition, mean
+    ravel, nonzero, sort, partition, mean, any, sum
     )
 from numpy.core.numerictypes import typecodes, number
 from numpy.lib.twodim_base import diag
@@ -1829,9 +1829,9 @@ class vectorize(object):
         return _res
 
 
-def cov(m, y=None, rowvar=1, bias=0, ddof=None):
+def cov(m, y=None, rowvar=1, bias=0, ddof=None, fweights=None, aweights=None):
     """
-    Estimate a covariance matrix, given data.
+    Estimate a covariance matrix, given data and weights.
 
     Covariance indicates the level to which two variables vary together.
     If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
@@ -1839,6 +1839,8 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
     :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
     of :math:`x_i`.
 
+    See the notes for an outline of the algorithm.
+
     Parameters
     ----------
     m : array_like
@@ -1846,23 +1848,35 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
         Each row of `m` represents a variable, and each column a single
         observation of all those variables. Also see `rowvar` below.
     y : array_like, optional
-        An additional set of variables and observations. `y` has the same
-        form as that of `m`.
+        An additional set of variables and observations. `y` has the same form
+        as that of `m`.
     rowvar : int, optional
         If `rowvar` is non-zero (default), then each row represents a
         variable, with observations in the columns. Otherwise, the relationship
         is transposed: each column represents a variable, while the rows
         contain observations.
     bias : int, optional
-        Default normalization is by ``(N - 1)``, where ``N`` is the number of
-        observations given (unbiased estimate). If `bias` is 1, then
-        normalization is by ``N``. These values can be overridden by using
-        the keyword ``ddof`` in numpy versions >= 1.5.
+        Default normalization is by ``(N - 1)``, where ``N`` corresponds to the
+        number of observations given (unbiased estimate). If `bias` is 1, then
+        normalization is by ``N``. These values can be overridden by using the
+        keyword ``ddof`` in numpy versions >= 1.5.
     ddof : int, optional
         .. versionadded:: 1.5
-        If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is
-        the number of observations; this overrides the value implied by
-        ``bias``. The default value is ``None``.
+        If not ``None`` the default value implied by `bias` is overridden.
+        Note that ``ddof=1`` will return the unbiased estimate, even if both
+        `fweights` and `aweights` are specified, and ``ddof=0`` will return
+        the simple average. See the notes for the details. The default value
+        is ``None``.
+    fweights : array_like, int, optional
+        .. versionadded:: 1.10
+        1-D array of integer freguency weights; the number of times each
+        observation vector should be repeated.
+    aweights : array_like, optional
+        .. versionadded:: 1.10
+        1-D array of observation vector weights. These relative weights are
+        typically large for observations considered "important" and smaller for
+        observations considered less "important". If ``ddof=0`` the array of
+        weights can be used to assign probabilities to observation vectors.
 
     Returns
     -------
@@ -1873,6 +1887,22 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
     --------
     corrcoef : Normalized covariance matrix
 
+    Notes
+    -----
+    Assume that the observations are in the columns of the observation
+    array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The
+    steps to compute the weighted covariance are as follows::
+
+        >>> w = f * a
+        >>> v1 = np.sum(w)
+        >>> v2 = np.sum(w * a)
+        >>> m -= np.sum(m * w, axis=1, keepdims=True) / v1
+        >>> cov = np.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2)
+
+    Note that when ``a == 1``, the normalization factor
+    ``v1 / (v1**2 - ddof * v2)`` goes over to ``1 / (np.sum(f) - ddof)``
+    as it should.
+
     Examples
     --------
     Consider two variables, :math:`x_0` and :math:`x_1`, which
@@ -1921,36 +1951,79 @@ def cov(m, y=None, rowvar=1, bias=0, ddof=None):
         y = np.asarray(y)
         dtype = np.result_type(m, y, np.float64)
     X = array(m, ndmin=2, dtype=dtype)
+    if rowvar == 0 and X.shape[0] != 1:
+        X = X.T
+    if X.shape[0] == 0:
+        return np.array([]).reshape(0, 0)
+    if y is not None:
+        y = array(y, copy=False, ndmin=2, dtype=dtype)
+        if rowvar == 0 and y.shape[0] != 1:
+            y = y.T
+        X = np.vstack((X, y))
 
-    if X.shape[0] == 1:
-        rowvar = 1
-    if rowvar:
-        N = X.shape[1]
-        axis = 0
-    else:
-        N = X.shape[0]
-        axis = 1
-
-    # check ddof
     if ddof is None:
         if bias == 0:
             ddof = 1
         else:
             ddof = 0
-    fact = float(N - ddof)
+
+    # Get the product of frequencies and weights
+    w = None
+    if fweights is not None:
+        fweights = np.asarray(fweights, dtype=np.float)
+        if not np.all(fweights == np.around(fweights)):
+            raise TypeError(
+                "fweights must be integer")
+        if fweights.ndim > 1:
+            raise RuntimeError(
+                "cannot handle multidimensional fweights")
+        if fweights.shape[0] != X.shape[1]:
+            raise RuntimeError(
+                "incompatible numbers of samples and fweights")
+        if any(fweights < 0):
+            raise ValueError(
+                "fweights cannot be negative")
+        w = fweights
+    if aweights is not None:
+        aweights = np.asarray(aweights, dtype=np.float)
+        if aweights.ndim > 1:
+            raise RuntimeError(
+                "cannot handle multidimensional aweights")
+        if aweights.shape[0] != X.shape[1]:
+            raise RuntimeError(
+                "incompatible numbers of samples and aweights")
+        if any(aweights < 0):
+            raise ValueError(
+                "aweights cannot be negative")
+        if w is None:
+            w = aweights
+        else:
+            w *= aweights
+
+    avg, w_sum = average(X, axis=1, weights=w, returned=True)
+    w_sum = w_sum[0]
+
+    # Determine the normalization
+    if w is None:
+        fact = float(X.shape[1] - ddof)
+    else:
+        if ddof == 0:
+            fact = w_sum
+        elif aweights is None:
+            fact = w_sum - ddof
+        else:
+            fact = w_sum - ddof*sum(w*aweights)/w_sum
+
     if fact <= 0:
         warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
         fact = 0.0
 
-    if y is not None:
-        y = array(y, copy=False, ndmin=2, dtype=dtype)
-        X = concatenate((X, y), axis)
-
-    X -= X.mean(axis=1-axis, keepdims=True)
-    if not rowvar:
-        return (dot(X.T, X.conj()) / fact).squeeze()
+    X -= avg[:, None]
+    if w is None:
+        X_T = X.T
     else:
-        return (dot(X, X.T.conj()) / fact).squeeze()
+        X_T = (X*w).T
+    return (dot(X, X_T.conj())/fact).squeeze()
 
 
 def corrcoef(x, y=None, rowvar=1, bias=np._NoValue, ddof=np._NoValue):
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 12f9d414b..ad71fd3fa 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1391,9 +1391,20 @@ class TestCorrCoef(TestCase):
 
 
 class TestCov(TestCase):
+    x1 = np.array([[0, 2], [1, 1], [2, 0]]).T
+    res1 = np.array([[1., -1.], [-1., 1.]])
+    x2 = np.array([0.0, 1.0, 2.0], ndmin=2)
+    frequencies = np.array([1, 4, 1])
+    x2_repeats = np.array([[0.0], [1.0], [1.0], [1.0], [1.0], [2.0]]).T
+    res2 = np.array([[0.4, -0.4], [-0.4, 0.4]])
+    unit_frequencies = np.ones(3, dtype=np.integer)
+    weights = np.array([1.0, 4.0, 1.0])
+    res3 = np.array([[2./3., -2./3.], [-2./3., 2./3.]])
+    unit_weights = np.ones(3)
+    x3 = np.array([0.3942, 0.5969, 0.7730, 0.9918, 0.7964])
+
     def test_basic(self):
-        x = np.array([[0, 2], [1, 1], [2, 0]]).T
-        assert_allclose(cov(x), np.array([[1., -1.], [-1., 1.]]))
+        assert_allclose(cov(self.x1), self.res1)
 
     def test_complex(self):
         x = np.array([[1, 2, 3], [1j, 2j, 3j]])
@@ -1414,11 +1425,67 @@ class TestCov(TestCase):
                                np.array([[np.nan, np.nan], [np.nan, np.nan]]))
 
     def test_wrong_ddof(self):
-        x = np.array([[0, 2], [1, 1], [2, 0]]).T
         with warnings.catch_warnings(record=True):
             warnings.simplefilter('always', RuntimeWarning)
-            assert_array_equal(cov(x, ddof=5),
-                               np.array([[np.inf, -np.inf], [-np.inf, np.inf]]))
+            assert_array_equal(cov(self.x1, ddof=5),
+                               np.array([[np.inf, -np.inf],
+                                         [-np.inf, np.inf]]))
+
+    def test_1D_rowvar(self):
+        assert_allclose(cov(self.x3), cov(self.x3, rowvar=0))
+        y = np.array([0.0780, 0.3107, 0.2111, 0.0334, 0.8501])
+        assert_allclose(cov(self.x3, y), cov(self.x3, y, rowvar=0))
+
+    def test_1D_variance(self):
+        assert_allclose(cov(self.x3, ddof=1), np.var(self.x3, ddof=1))
+
+    def test_fweights(self):
+        assert_allclose(cov(self.x2, fweights=self.frequencies),
+                        cov(self.x2_repeats))
+        assert_allclose(cov(self.x1, fweights=self.frequencies),
+                        self.res2)
+        assert_allclose(cov(self.x1, fweights=self.unit_frequencies),
+                        self.res1)
+        nonint = self.frequencies + 0.5
+        assert_raises(TypeError, cov, self.x1, fweights=nonint)
+        f = np.ones((2, 3), dtype=np.integer)
+        assert_raises(RuntimeError, cov, self.x1, fweights=f)
+        f = np.ones(2, dtype=np.integer)
+        assert_raises(RuntimeError, cov, self.x1, fweights=f)
+        f = -1*np.ones(3, dtype=np.integer)
+        assert_raises(ValueError, cov, self.x1, fweights=f)
+
+    def test_aweights(self):
+        assert_allclose(cov(self.x1, aweights=self.weights), self.res3)
+        assert_allclose(cov(self.x1, aweights=3.0*self.weights),
+                        cov(self.x1, aweights=self.weights))
+        assert_allclose(cov(self.x1, aweights=self.unit_weights), self.res1)
+        w = np.ones((2, 3))
+        assert_raises(RuntimeError, cov, self.x1, aweights=w)
+        w = np.ones(2)
+        assert_raises(RuntimeError, cov, self.x1, aweights=w)
+        w = -1.0*np.ones(3)
+        assert_raises(ValueError, cov, self.x1, aweights=w)
+
+    def test_unit_fweights_and_aweights(self):
+        assert_allclose(cov(self.x2, fweights=self.frequencies,
+                            aweights=self.unit_weights),
+                        cov(self.x2_repeats))
+        assert_allclose(cov(self.x1, fweights=self.frequencies,
+                            aweights=self.unit_weights),
+                        self.res2)
+        assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+                            aweights=self.unit_weights),
+                        self.res1)
+        assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+                            aweights=self.weights),
+                        self.res3)
+        assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+                            aweights=3.0*self.weights),
+                        cov(self.x1, aweights=self.weights))
+        assert_allclose(cov(self.x1, fweights=self.unit_frequencies,
+                            aweights=self.unit_weights),
+                        self.res1)
 
 
 class Test_I0(TestCase):
author	tpoole <t.b.poole@gmail.com>	2014-08-13 13:22:37 +0100
committer	tpoole <t.b.poole@gmail.com>	2015-05-13 10:00:05 +0100
commit	d87d2ca584b888bcc48fd2fd25c07eb0c08c0939 (patch)
tree	ef3f02fe2166745004a7b78676c4bdf4d05b99ce /numpy/lib
parent	30e3d41b6d11c18e17eb283a61bbe9bbf4bb4d8f (diff)
download	numpy-d87d2ca584b888bcc48fd2fd25c07eb0c08c0939.tar.gz