Diffstat (limited to 'numpy/ma/mstats.py')
-rw-r--r--  numpy/ma/mstats.py  433
1 file changed, 433 insertions, 0 deletions
diff --git a/numpy/ma/mstats.py b/numpy/ma/mstats.py
new file mode 100644
index 000000000..8daa49c4b
--- /dev/null
+++ b/numpy/ma/mstats.py
@@ -0,0 +1,433 @@
+"""
+Generic statistics functions, with support to MA.
+
+:author: Pierre GF Gerard-Marchant
+:contact: pierregm_at_uga_edu
+:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
+:version: $Id: mstats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
+"""
+__author__ = "Pierre GF Gerard-Marchant ($Author: jarrod.millman $)"
+__version__ = '1.0'
+__revision__ = "$Revision: 3473 $"
+__date__ = '$Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $'
+
+
+import numpy
+from numpy import bool_, float_, int_, \
+ sqrt
+from numpy import array as narray
+import numpy.core.numeric as numeric
+from numpy.core.numeric import concatenate
+
+import numpy.ma
+from numpy.ma.core import masked, nomask, MaskedArray, masked_array
+from numpy.ma.extras import apply_along_axis, dot
+
+__all__ = ['cov','idealfourths','meppf','mmedian','mquantiles','plotting_positions',
+ 'rsh','stde_median','trim_tail','trim_both','trimmed_mean','trimmed_stde',
+ 'winsorize']
+
+#####--------------------------------------------------------------------------
+#---- -- Trimming ---
+#####--------------------------------------------------------------------------
+
+def winsorize(data, alpha=0.2):
+ """Returns a Winsorized version of the input array.
+
+The int(alpha*n) lowest values are set to the (alpha)th percentile, and the
+int(alpha*n) highest values are set to the (1-alpha)th percentile, where n is
+the number of unmasked values. Masked values are skipped.
+
+*Parameters*:
+ data : {ndarray}
+ Input data to Winsorize. The data is first flattened.
+ alpha : {float}, optional
+ Fraction of the unmasked values winsorized on each side (alpha on the left,
+ alpha on the right).
+ """
+ data = masked_array(data, copy=False).ravel()
+ idxsort = data.argsort()
+ (nsize, ncounts) = (data.size, data.count())
+ ntrim = int(alpha * ncounts)
+ # masked entries sort to the end, so the negative index below picks the
+ # (ntrim+1)th largest unmasked value as the upper clipping bound
+ (xmin,xmax) = data[idxsort[[ntrim, ncounts-nsize-ntrim-1]]]
+ return masked_array(numpy.clip(data, xmin, xmax), mask=data._mask)
+
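+# Usage sketch (illustrative only, not part of the original module).  With a 1D
+# masked array, the extreme unmasked values are clipped toward the interior:
+#
+#     x = masked_array(numpy.arange(1., 11.), mask=[0]*9 + [1])
+#     w = winsorize(x, alpha=0.2)
+#     # the int(0.2*9) = 1 lowest and highest unmasked values are clipped;
+#     # the masked entry stays masked in the result
+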
+#..............................................................................
+def trim_both(data, proportiontocut=0.2, axis=None):
+ """Trims the data by masking the int(trim*n) smallest and int(trim*n) largest
+values of data along the given axis, where n is the number of unmasked values.
+
+*Parameters*:
+ data : {ndarray}
+ Data to trim.
+ proportiontocut : {float}
+ Proportion of values to trim from each end. If n is the number of unmasked
+ values before trimming, about (1-2*proportiontocut)*n values remain afterwards.
+ axis : {integer}
+ Axis along which to perform the trimming. If None, the input array is first
+ flattened.
+ """
+ #...................
+ def _trim_1D(data, trim):
+ "Private function: return a trimmed 1D array."
+ nsize = data.size
+ ncounts = data.count()
+ ntrim = int(trim * ncounts)
+ idxsort = data.argsort()
+ data[idxsort[:ntrim]] = masked
+ data[idxsort[ncounts-nsize-ntrim:]] = masked
+ return data
+ #...................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ if (axis is None):
+ return _trim_1D(data.ravel(), proportiontocut)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trim_1D, axis, data, proportiontocut)
+
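+# Usage sketch (illustrative only, not part of the original module).  trim_both
+# masks rather than removes values, so the shape of the input is preserved:
+#
+#     x = masked_array(numpy.arange(10.))
+#     t = trim_both(x, proportiontocut=0.1)
+#     # int(0.1*10) = 1 value is masked at each end, so t.count() == 8
+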
+#..............................................................................
+def trim_tail(data, proportiontocut=0.2, tail='left', axis=None):
+ """Trims the data by masking int(trim*n) values from ONE tail of the data
+along the given axis, where n is the number of unmasked values.
+
+*Parameters*:
+ data : {ndarray}
+ Data to trim.
+ proportiontocut : {float}
+ Proportion of values to trim from the selected tail. If n is the number of
+ unmasked values before trimming, about (1-proportiontocut)*n values remain
+ afterwards.
+ tail : {string}
+ Trimming direction, in ('left', 'right'). If 'left', the int(proportiontocut*n)
+ lowest values are masked. If 'right', the int(proportiontocut*n) highest values
+ are masked instead.
+ axis : {integer}
+ Axis along which to perform the trimming. If None, the input array is first
+ flattened.
+ """
+ #...................
+ def _trim_1D(data, trim, left):
+ "Private function: return a trimmed 1D array."
+ nsize = data.size
+ ncounts = data.count()
+ ntrim = int(trim * ncounts)
+ idxsort = data.argsort()
+ if left:
+ data[idxsort[:ntrim]] = masked
+ else:
+ data[idxsort[ncounts-nsize-ntrim:]] = masked
+ return data
+ #...................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ #
+ if not isinstance(tail, str):
+ raise TypeError("The tail argument should be in ('left','right')")
+ tail = tail.lower()[0]
+ if tail == 'l':
+ left = True
+ elif tail == 'r':
+ left=False
+ else:
+ raise ValueError("The tail argument should be in ('left','right')")
+ #
+ if (axis is None):
+ return _trim_1D(data.ravel(), proportiontocut, left)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trim_1D, axis, data, proportiontocut, left)
+
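+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array(numpy.arange(10.))
+#     t = trim_tail(x, proportiontocut=0.2, tail='left')
+#     # the int(0.2*10) = 2 lowest values are masked; the right tail is untouched
+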
+#..............................................................................
+def trimmed_mean(data, proportiontocut=0.2, axis=None):
+ """Returns the trimmed mean of the data along the given axis. Trimming is
+performed on both ends of the distribution.
+
+*Parameters*:
+ data : {ndarray}
+ Data to trim.
+ proportiontocut : {float}
+ Proportion of the data to cut from each side of the data.
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ axis : {integer}
+ Axis along which to perform the trimming. If None, the input array is first
+ flattened.
+ """
+ return trim_both(data, proportiontocut=proportiontocut, axis=axis).mean(axis=axis)
+
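+# Usage sketch (illustrative only, not part of the original module).  For a
+# symmetric sample the trimmed mean coincides with the ordinary mean:
+#
+#     x = masked_array(numpy.arange(1., 11.))
+#     m = trimmed_mean(x, proportiontocut=0.2)   # 5.5, same as x.mean()
+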
+#..............................................................................
+def trimmed_stde(data, proportiontocut=0.2, axis=None):
+ """Returns the standard error of the trimmed mean for the input data,
+along the given axis. Trimming is performed on both ends of the distribution.
+
+*Parameters*:
+ data : {ndarray}
+ Data to trim.
+ proportiontocut : {float}
+ Proportion of the data to cut from each side of the data.
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ axis : {integer}
+ Axis along which to perform the trimming. If None, the input array is first
+ flattened.
+ """
+ #........................
+ def _trimmed_stde_1D(data, trim=0.2):
+ "Returns the standard error of the trimmed mean for a 1D input data."
+ winsorized = winsorize(data, trim)   # winsorize at the same proportion as the trimming
+ nsize = winsorized.count()
+ winstd = winsorized.std(ddof=1)      # unbiased (ddof=1) std of the winsorized values
+ return winstd / ((1-2*trim) * numpy.sqrt(nsize))
+ #........................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ if (axis is None):
+ return _trimmed_stde_1D(data.ravel(), proportiontocut)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trimmed_stde_1D, axis, data, proportiontocut)
+
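+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array(numpy.arange(1., 11.))
+#     se = trimmed_stde(x, proportiontocut=0.2)
+#     # standard error of the 20% trimmed mean, based on the winsorized std
+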
+#.............................................................................
+def stde_median(data, axis=None):
+ """Returns the McKean-Schrader estimate of the standard error of the sample
+median along the given axis.
+
+*Parameters*:
+ data : {ndarray}
+ Input data.
+ axis : {integer}
+ Axis along which to compute the estimate. If None, the input array is first
+ flattened.
+ """
+ def _stdemed_1D(data):
+ x = numpy.sort(data.compressed())    # avoid shadowing the builtin `sorted`
+ n = len(x)
+ z = 2.5758293035489004               # 99.5% quantile of the standard normal
+ k = int(round((n+1)/2. - z * sqrt(n/4.), 0))
+ return (x[n-k] - x[k-1])/(2.*z)
+ #
+ data = masked_array(data, copy=False, subok=True)
+ if (axis is None):
+ return _stdemed_1D(data)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_stdemed_1D, axis, data)
+
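+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array(numpy.arange(1., 101.))
+#     se = stde_median(x)
+#     # McKean-Schrader estimate of the standard error of the sample median
+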
+
+#####--------------------------------------------------------------------------
+#---- --- Quantiles ---
+#####--------------------------------------------------------------------------
+
+
+def mquantiles(data, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None):
+ """Computes empirical quantiles for a *1xN* data array.
+Samples quantile are defined by:
+*Q(p) = (1-g).x[i] +g.x[i+1]*
+where *x[j]* is the jth order statistic,
+with *i = (floor(n*p+m))*, *m=alpha+p*(1-alpha-beta)* and *g = n*p + m - i)*.
+
+Typical values of (alpha,beta) are:
+
+ - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
+ - (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
+ - (0,0) : *p(k) = k/(n+1)* : (R type 6)
+ - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
+ That's R default (R type 7)
+ - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
+ The resulting quantile estimates are approximately median-unbiased
+ regardless of the distribution of x. (R type 8)
+ - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
+ The resulting quantile estimates are approximately unbiased
+ if x is normally distributed (R type 9)
+ - (.4,.4) : approximately quantile unbiased (Cunnane)
+ - (.35,.35): APL, used with PWM
+
+*Parameters*:
+ data : {sequence}
+ Input data, as a sequence or array of dimension at most 2.
+ prob : {sequence}
+ List of quantiles to compute.
+ alphap : {float}
+ Plotting positions parameter.
+ betap : {float}
+ Plotting positions parameter.
+ axis : {integer}
+ Axis along which to compute the quantiles. If None, the input array is first
+ flattened.
+ """
+ def _quantiles1D(data,m,p):
+ x = numpy.sort(data.compressed())
+ n = len(x)
+ if n == 0:
+ return masked_array(numpy.empty(len(p), dtype=float_), mask=True)
+ elif n == 1:
+ return masked_array(numpy.resize(x, p.shape), mask=nomask)
+ aleph = (n*p + m)
+ k = numpy.floor(aleph.clip(1, n-1)).astype(int_)
+ gamma = (aleph-k).clip(0,1)
+ return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
+
+ # Initialization & checks ---------
+ data = masked_array(data, copy=False)
+ p = narray(prob, copy=False, ndmin=1)
+ m = alphap + p*(1.-alphap-betap)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ return _quantiles1D(data, m, p)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_quantiles1D, axis, data, m, p)
+
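+# Usage sketch (illustrative only, not part of the original module).  Masked
+# values are simply ignored when the order statistics are formed:
+#
+#     x = masked_array(numpy.arange(1., 101.), mask=(numpy.arange(100) % 2))
+#     q = mquantiles(x, prob=[0.25, 0.5, 0.75])
+#     # quartiles of the 50 unmasked values, using the default (0.4, 0.4) positions
+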
+
+def plotting_positions(data, alpha=0.4, beta=0.4):
+ """Returns the plotting positions (or empirical percentile points) for the
+ data.
+ Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
+ - i is the rank (1-based position in the sorted data) of each unmasked value
+ - n is the number of unmasked values
+ - alpha and beta are two parameters.
+
+ Typical values for alpha and beta are:
+ - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
+ - (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
+ - (0,0) : *p(k) = k/(n+1)* : Weibull (R type 6)
+ - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
+ That's R default (R type 7)
+ - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
+ The resulting quantile estimates are approximately median-unbiased
+ regardless of the distribution of x. (R type 8)
+ - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
+ The resulting quantile estimates are approximately unbiased
+ if x is normally distributed (R type 9)
+ - (.4,.4) : approximately quantile unbiased (Cunnane)
+ - (.35,.35): APL, used with PWM
+ """
+ data = masked_array(data, copy=False).ravel()
+ n = data.count()
+ plpos = numpy.empty(data.size, dtype=float_)
+ plpos[n:] = 0
+ plpos[data.argsort()[:n]] = (numpy.arange(1,n+1) - alpha)/(n+1-alpha-beta)
+ return masked_array(plpos, mask=data._mask)
+
+meppf = plotting_positions
+
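+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array([1., 5., 2., 4., 3.])
+#     pp = plotting_positions(x, alpha=0.4, beta=0.4)
+#     # pp[i] == (rank of x[i] - 0.4) / (n + 0.2), returned in the original order
+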
+
+def mmedian(data, axis=None):
+ """Returns the median of data along the given axis. Missing data are discarded."""
+ def _median1D(data):
+ x = numpy.sort(data.compressed())
+ if x.size == 0:
+ return masked
+ return numpy.median(x)
+ data = masked_array(data, subok=True, copy=True)
+ if axis is None:
+ return _median1D(data)
+ else:
+ return apply_along_axis(_median1D, axis, data)
+
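+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array([1., 2., 3., 4., 100.], mask=[0, 0, 0, 0, 1])
+#     m = mmedian(x)   # 2.5: the masked value is discarded before taking the median
+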
+
+def cov(x, y=None, rowvar=True, bias=False, strict=False):
+ """Estimates the covariance matrix.
+
+
+Normalization is by (N-1) where N is the number of observations (unbiased
+estimate). If bias is True then normalization is by N.
+
+*Parameters*:
+ x : {ndarray}
+ Input data. If x is a 1D array, returns the variance. If x is a 2D array,
+ returns the covariance matrix.
+ y : {ndarray}, optional
+ Optional set of variables.
+ rowvar : {boolean}
+ If rowvar is True, then each row is a variable, with observations in the columns.
+ If rowvar is False, each column is a variable and the observations are in
+ the rows.
+ bias : {boolean}
+ Whether to use a biased or unbiased estimate of the covariance.
+ If bias is True, then the normalization is by N, the number of observations.
+ Otherwise, the normalization is by (N-1).
+ strict : {boolean}
+ If strict is True, masked values are propagated: if a masked value appears in
+ a row or column, the whole row or column is considered masked.
+ """
+ X = narray(x, ndmin=2, subok=True, dtype=float)
+ if X.shape[0] == 1:
+ rowvar = True
+ if rowvar:
+ axis = 0
+ tup = (slice(None),None)
+ else:
+ axis = 1
+ tup = (None, slice(None))
+ #
+ if y is not None:
+ y = narray(y, copy=False, ndmin=2, subok=True, dtype=float)
+ X = concatenate((X,y),axis)
+ #
+ X -= X.mean(axis=1-axis)[tup]
+ n = X.count(1-axis)
+ #
+ if bias:
+ fact = n*1.0
+ else:
+ fact = n-1.0
+ #
+ if not rowvar:
+ return (dot(X.T, X.conj(), strict=strict) / fact).squeeze()
+ else:
+ return (dot(X, X.T.conj(), strict=strict) / fact).squeeze()
+
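+# Usage sketch (illustrative only, not part of the original module).  With
+# rowvar True (the default), each row is one variable; a MaskedArray input is
+# assumed, since the function relies on the .count() method internally:
+#
+#     x = masked_array([[1., 2., 3., 4.], [2., 4., 6., 8.]])
+#     c = cov(x)       # 2x2 covariance matrix of the two rows
+#     v = cov(x[0])    # (unbiased) variance of a single 1D variable
+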
+
+def idealfourths(data, axis=None):
+ """Returns an estimate of the interquartile range of the data along the given
+axis, as computed with the ideal fourths.
+ """
+ def _idf(data):
+ x = numpy.sort(data.compressed())
+ n = len(x)
+ (j,h) = divmod(n/4. + 5/12., 1)
+ j = int(j)   # integer part, used as a (1-based) rank below
+ qlo = (1-h)*x[j-1] + h*x[j]
+ k = n - j
+ qup = (1-h)*x[k] + h*x[k-1]
+ return qup - qlo
+ data = masked_array(data, copy=False)
+ if (axis is None):
+ return _idf(data)
+ else:
+ return apply_along_axis(_idf, axis, data)
+
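+# Usage sketch (illustrative only, not part of the original module):
+#
+#     x = masked_array(numpy.arange(1., 101.))
+#     iqr = idealfourths(x)
+#     # robust spread estimate, close to the ordinary interquartile range (~50 here)
+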
+
+def rsh(data, points=None):
+ """Evalutates Rosenblatt's shifted histogram estimators for each point
+on the dataset 'data'.
+
+*Parameters* :
+ data : {sequence}
+ Input data. Masked values are ignored.
+ points : {sequence}
+ Sequence of points where to evaluate Rosenblatt shifted histogram.
+ If None, use the data.
+ """
+ data = masked_array(data, copy=False)
+ if points is None:
+ points = data
+ else:
+ points = numpy.array(points, copy=False, ndmin=1)
+ if data.ndim != 1:
+ raise ValueError("The input array should be 1D only!")
+ n = data.count()
+ h = 1.2 * idealfourths(data) / n**(1./5)
+ nhi = (data[:,None] <= points[None,:] + h).sum(0)
+ nlo = (data[:,None] < points[None,:] - h).sum(0)
+ return (nhi-nlo) / (2.*n*h)
+
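+# Usage sketch (illustrative only, not part of the original module).  rsh gives
+# a simple kernel-like density estimate at the chosen evaluation points:
+#
+#     x = masked_array(numpy.random.normal(size=200))
+#     grid = numpy.linspace(-3., 3., 61)
+#     dens = rsh(x, points=grid)   # density estimate evaluated on the grid
+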
+################################################################################
+if __name__ == '__main__':
+ from numpy.ma.testutils import assert_almost_equal
+ if 1:
+ a = numpy.ma.arange(1,101)
+ a[1::2] = masked
+ b = numpy.ma.resize(a, (100,100))
+ assert_almost_equal(mquantiles(b), [25., 50., 75.])
+ assert_almost_equal(mquantiles(b, axis=0), numpy.ma.resize(a,(3,100)))
+ assert_almost_equal(mquantiles(b, axis=1),
+ numpy.ma.resize([24.9, 50., 75.1], (100,3)))