Diffstat (limited to 'numpy/ma/mstats.py')
-rw-r--r-- | numpy/ma/mstats.py | 433
1 file changed, 433 insertions, 0 deletions
diff --git a/numpy/ma/mstats.py b/numpy/ma/mstats.py
new file mode 100644
index 000000000..8daa49c4b
--- /dev/null
+++ b/numpy/ma/mstats.py
@@ -0,0 +1,433 @@
"""
Generic statistics functions, with support for MA.

:author: Pierre GF Gerard-Marchant
:contact: pierregm_at_uga_edu
:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
:version: $Id: mstats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
"""
__author__ = "Pierre GF Gerard-Marchant ($Author: jarrod.millman $)"
__version__ = '1.0'
__revision__ = "$Revision: 3473 $"
__date__ = '$Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $'


import numpy
from numpy import bool_, float_, int_, sqrt
from numpy import array as narray
import numpy.core.numeric as numeric
from numpy.core.numeric import concatenate

import numpy.ma
from numpy.ma.core import masked, nomask, MaskedArray, masked_array
from numpy.ma.extras import apply_along_axis, dot

__all__ = ['cov', 'idealfourths', 'meppf', 'mmedian', 'mquantiles',
           'plotting_positions', 'rsh', 'stde_median', 'trim_tail',
           'trim_both', 'trimmed_mean', 'trimmed_stde', 'winsorize']

#####--------------------------------------------------------------------------
#---- --- Trimming ---
#####--------------------------------------------------------------------------

def winsorize(data, alpha=0.2):
    """Returns a Winsorized version of the input array.

The int(alpha*n) lowest values are set to the alpha-th percentile, and the
int(alpha*n) highest values are set to the (1-alpha)-th percentile, where n is
the number of unmasked values. Masked values are skipped.

*Parameters*:
    data : {ndarray}
        Input data to Winsorize. The data is first flattened.
    alpha : {float}, optional
        Proportion of Winsorization on each tail: the int(alpha*n) lowest and
        the int(alpha*n) highest unmasked values are clipped.
    """
    data = masked_array(data, copy=False).ravel()
    idxsort = data.argsort()
    (nsize, ncounts) = (data.size, data.count())
    ntrim = int(alpha * ncounts)
    (xmin, xmax) = data[idxsort[[ntrim, ncounts-nsize-ntrim-1]]]
    return masked_array(numpy.clip(data, xmin, xmax), mask=data._mask)

#..............................................................................
def trim_both(data, proportiontocut=0.2, axis=None):
    """Trims the data by masking the int(proportiontocut*n) smallest and the
int(proportiontocut*n) largest values of data along the given axis, where n is
the number of unmasked values.

*Parameters*:
    data : {ndarray}
        Data to trim.
    proportiontocut : {float}
        Proportion of trimming on each tail. If n is the number of unmasked
        values before trimming, the number of values after trimming is
        (1-2*proportiontocut)*n.
    axis : {integer}
        Axis along which to perform the trimming. If None, the input array is
        first flattened.
    """
    #...................
    def _trim_1D(data, trim):
        "Private function: return a trimmed 1D array."
        nsize = data.size
        ncounts = data.count()
        ntrim = int(trim * ncounts)
        idxsort = data.argsort()
        data[idxsort[:ntrim]] = masked
        data[idxsort[ncounts-nsize-ntrim:]] = masked
        return data
    #...................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    if (axis is None):
        return _trim_1D(data.ravel(), proportiontocut)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trim_1D, axis, data, proportiontocut)
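
As a quick illustration of the two trimming helpers above, here is a small
usage sketch (illustrative only; it assumes the module is importable as
numpy.ma.mstats and uses made-up data):

    import numpy.ma as ma
    from numpy.ma.mstats import winsorize, trim_both

    # Ten values, one of them masked; with alpha=0.2 the int(0.2*n) lowest and
    # int(0.2*n) highest unmasked values are affected (n is the unmasked count).
    x = ma.masked_values([1., 2., 3., 4., 5., 6., 7., 8., 9., -999.], -999.)
    xw = winsorize(x, alpha=0.2)             # extremes clipped, mask preserved
    xt = trim_both(x, proportiontocut=0.2)   # extremes masked instead
    print(xw)
    print(xt.count())                        # fewer unmasked values than x.count()
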
#..............................................................................
def trim_tail(data, proportiontocut=0.2, tail='left', axis=None):
    """Trims the data by masking int(proportiontocut*n) values from ONE tail of
the data along the given axis, where n is the number of unmasked values.

*Parameters*:
    data : {ndarray}
        Data to trim.
    proportiontocut : {float}
        Proportion of trimming. If n is the number of unmasked values before
        trimming, the number of values after trimming is
        (1-proportiontocut)*n.
    tail : {string}
        Trimming direction, in ('left', 'right'). If 'left', the
        int(proportiontocut*n) lowest values are masked. If 'right', the
        int(proportiontocut*n) highest values are masked instead.
    axis : {integer}
        Axis along which to perform the trimming. If None, the input array is
        first flattened.
    """
    #...................
    def _trim_1D(data, trim, left):
        "Private function: return a trimmed 1D array."
        nsize = data.size
        ncounts = data.count()
        ntrim = int(trim * ncounts)
        idxsort = data.argsort()
        if left:
            data[idxsort[:ntrim]] = masked
        else:
            data[idxsort[ncounts-nsize-ntrim:]] = masked
        return data
    #...................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    #
    if not isinstance(tail, str):
        raise TypeError("The tail argument should be in ('left','right')")
    tail = tail.lower()[0]
    if tail == 'l':
        left = True
    elif tail == 'r':
        left = False
    else:
        raise ValueError("The tail argument should be in ('left','right')")
    #
    if (axis is None):
        return _trim_1D(data.ravel(), proportiontocut, left)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trim_1D, axis, data, proportiontocut, left)

#..............................................................................
def trimmed_mean(data, proportiontocut=0.2, axis=None):
    """Returns the trimmed mean of the data along the given axis. Trimming is
performed on both ends of the distribution.

*Parameters*:
    data : {ndarray}
        Data to trim.
    proportiontocut : {float}
        Proportion of the data to cut from each side of the data.
        As a result, (2*proportiontocut*n) values are actually trimmed.
    axis : {integer}
        Axis along which to perform the trimming. If None, the input array is
        first flattened.
    """
    return trim_both(data, proportiontocut=proportiontocut,
                     axis=axis).mean(axis=axis)

#..............................................................................
def trimmed_stde(data, proportiontocut=0.2, axis=None):
    """Returns the standard error of the trimmed mean for the input data,
along the given axis. Trimming is performed on both ends of the distribution.

*Parameters*:
    data : {ndarray}
        Data to trim.
    proportiontocut : {float}
        Proportion of the data to cut from each side of the data.
        As a result, (2*proportiontocut*n) values are actually trimmed.
    axis : {integer}
        Axis along which to perform the trimming. If None, the input array is
        first flattened.
    """
    #........................
    def _trimmed_stde_1D(data, trim=0.2):
        "Returns the standard error of the trimmed mean for a 1D input data."
        # Winsorize with the same proportion as the trimming, then use the
        # unbiased winsorized standard deviation.
        winsorized = winsorize(data, trim)
        nsize = winsorized.count()
        winstd = winsorized.std(ddof=1)
        return winstd / ((1-2*trim) * numpy.sqrt(nsize))
    #........................
    data = masked_array(data, copy=False, subok=True)
    data.unshare_mask()
    if (axis is None):
        return _trimmed_stde_1D(data.ravel(), proportiontocut)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_trimmed_stde_1D, axis, data, proportiontocut)
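
A similar sketch for the trimmed estimators just defined (again illustrative,
with made-up data):

    import numpy.ma as ma
    from numpy.ma.mstats import trimmed_mean, trimmed_stde

    y = ma.array([1., 2., 3., 4., 5., 100.])    # one gross outlier
    m = trimmed_mean(y, proportiontocut=0.2)    # both tails masked before averaging
    se = trimmed_stde(y, proportiontocut=0.2)   # SE from the winsorized standard deviation
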

#..............................................................................
def stde_median(data, axis=None):
    """Returns the McKean-Schrader estimate of the standard error of the sample
median along the given axis.

*Parameters*:
    data : {ndarray}
        Input data.
    axis : {integer}
        Axis along which to compute the estimate. If None, the input array is
        first flattened.
    """
    def _stdemed_1D(data):
        sdata = numpy.sort(data.compressed())
        n = len(sdata)
        z = 2.5758293035489004
        k = int(round((n+1)/2. - z * sqrt(n/4.), 0))
        return ((sdata[n-k] - sdata[k-1])/(2.*z))
    #
    data = masked_array(data, copy=False, subok=True)
    if (axis is None):
        return _stdemed_1D(data)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_stdemed_1D, axis, data)


#####--------------------------------------------------------------------------
#---- --- Quantiles ---
#####--------------------------------------------------------------------------


def mquantiles(data, prob=[.25,.5,.75], alphap=.4, betap=.4, axis=None):
    """Computes empirical quantiles for a data array.

Sample quantiles are defined as
*Q(p) = (1-g).x[i] + g.x[i+1]*,
where *x[j]* is the jth order statistic, with
*i = floor(n*p + m)*, *m = alphap + p*(1-alphap-betap)* and *g = n*p + m - i*.

Typical values of (alphap,betap) are:

    - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
    - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
    - (0,0)    : *p(k) = k/(n+1)* : Weibull (R type 6)
    - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
      That's R default (R type 7)
    - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
      The resulting quantile estimates are approximately median-unbiased
      regardless of the distribution of x. (R type 8)
    - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
      The resulting quantile estimates are approximately unbiased
      if x is normally distributed (R type 9)
    - (.4,.4)  : approximately quantile unbiased (Cunnane)
    - (.35,.35): APL, used with PWM

*Parameters*:
    data : {sequence}
        Input data, as a sequence or array of dimension at most 2.
    prob : {sequence}
        List of quantiles to compute.
    alphap : {float}
        Plotting positions parameter.
    betap : {float}
        Plotting positions parameter.
    axis : {integer}
        Axis along which to compute the quantiles. If None, the input array is
        first flattened.
    """
    def _quantiles1D(data, m, p):
        x = numpy.sort(data.compressed())
        n = len(x)
        if n == 0:
            return masked_array(numpy.empty(len(p), dtype=float_), mask=True)
        elif n == 1:
            return masked_array(numpy.resize(x, p.shape), mask=nomask)
        aleph = (n*p + m)
        k = numpy.floor(aleph.clip(1, n-1)).astype(int_)
        gamma = (aleph-k).clip(0, 1)
        return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]

    # Initialization & checks ---------
    data = masked_array(data, copy=False)
    p = narray(prob, copy=False, ndmin=1)
    m = alphap + p*(1.-alphap-betap)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _quantiles1D(data, m, p)
    else:
        assert data.ndim <= 2, "Array should be 2D at most !"
        return apply_along_axis(_quantiles1D, axis, data, m, p)
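
A short sketch of mquantiles in use; the probabilities and the (alphap, betap)
choices below are arbitrary examples:

    import numpy.ma as ma
    from numpy.ma.mstats import mquantiles

    z = ma.masked_values([7., 1., 2., -999., 3., 4., 5., 6.], -999.)
    q = mquantiles(z, prob=[.25, .5, .75])                      # Cunnane (default alphap=betap=.4)
    q7 = mquantiles(z, prob=[.25, .5, .75], alphap=1, betap=1)  # R's default, type 7
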

def plotting_positions(data, alpha=0.4, beta=0.4):
    """Returns the plotting positions (or empirical percentile points) for the
    data.
    Plotting positions are defined as (i-alpha)/(n+1-alpha-beta), where:
        - i is the rank of the order statistic
        - n is the number of unmasked values along the given axis
        - alpha and beta are two parameters.

    Typical values for alpha and beta are:
        - (0,1)    : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
        - (.5,.5)  : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
        - (0,0)    : *p(k) = k/(n+1)* : Weibull (R type 6)
        - (1,1)    : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
          That's R default (R type 7)
        - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (R type 8)
        - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
          The resulting quantile estimates are approximately unbiased
          if x is normally distributed (R type 9)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM
    """
    data = masked_array(data, copy=False).ravel()
    n = data.count()
    plpos = numpy.empty(data.size, dtype=float_)
    plpos[n:] = 0
    plpos[data.argsort()[:n]] = (numpy.arange(1, n+1) - alpha)/(n+1-alpha-beta)
    return masked_array(plpos, mask=data._mask)

meppf = plotting_positions


def mmedian(data, axis=None):
    """Returns the median of data along the given axis. Missing data are discarded."""
    def _median1D(data):
        x = numpy.sort(data.compressed())
        if x.size == 0:
            return masked
        return numpy.median(x)
    data = masked_array(data, subok=True, copy=True)
    if axis is None:
        return _median1D(data)
    else:
        return apply_along_axis(_median1D, axis, data)


def cov(x, y=None, rowvar=True, bias=False, strict=False):
    """Estimates the covariance matrix.

Normalization is by (N-1) where N is the number of observations (unbiased
estimate). If bias is True then normalization is by N.

*Parameters*:
    x : {ndarray}
        Input data. If x is a 1D array, returns the variance. If x is a 2D
        array, returns the covariance matrix.
    y : {ndarray}, optional
        Optional set of variables.
    rowvar : {boolean}
        If rowvar is True, then each row is a variable with observations in
        columns. If rowvar is False, each column is a variable and the
        observations are in the rows.
    bias : {boolean}
        Whether to use a biased or unbiased estimate of the covariance.
        If bias is True, then the normalization is by N, the number of
        observations. Otherwise, the normalization is by (N-1).
    strict : {boolean}
        If strict is True, masked values are propagated: if a masked value
        appears in a row or column, the whole row or column is considered
        masked.
    """
    X = narray(x, ndmin=2, subok=True, dtype=float)
    if X.shape[0] == 1:
        rowvar = True
    if rowvar:
        axis = 0
        tup = (slice(None), None)
    else:
        axis = 1
        tup = (None, slice(None))
    #
    if y is not None:
        y = narray(y, copy=False, ndmin=2, subok=True, dtype=float)
        X = concatenate((X, y), axis)
    #
    X -= X.mean(axis=1-axis)[tup]
    n = X.count(1-axis)
    #
    if bias:
        fact = n*1.0
    else:
        fact = n-1.0
    #
    if not rowvar:
        return (dot(X.T, X.conj(), strict=strict) / fact).squeeze()
    else:
        return (dot(X, X.T.conj(), strict=strict) / fact).squeeze()
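
A sketch of cov on a small two-variable data set with one missing observation
(the arrays are made up):

    import numpy.ma as ma
    from numpy.ma.mstats import cov

    xy = ma.array([[1., 2., 3., 4., 5.],
                   [2., 4., 6., 8., 10.]])
    xy[1, 2] = ma.masked
    c = cov(xy, rowvar=True)              # 2x2 matrix, (N-1) normalization
    cb = cov(xy, rowvar=True, bias=True)  # normalized by N instead
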

def idealfourths(data, axis=None):
    """Returns an estimate of the interquartile range of the data along the
given axis, as computed with the ideal fourths.
    """
    def _idf(data):
        x = numpy.sort(data.compressed())
        n = len(x)
        (j, h) = divmod(n/4. + 5/12., 1)
        j = int(j)
        qlo = (1.-h)*x[j-1] + h*x[j]
        k = n - j
        qup = (1.-h)*x[k] + h*x[k-1]
        return qup - qlo
    data = masked_array(data, copy=False)
    if (axis is None):
        return _idf(data)
    else:
        return apply_along_axis(_idf, axis, data)


def rsh(data, points=None):
    """Evaluates Rosenblatt's shifted histogram estimators for each point
on the dataset 'data'.

*Parameters*:
    data : {sequence}
        Input data. Masked values are ignored.
    points : {sequence}
        Sequence of points where to evaluate Rosenblatt's shifted histogram.
        If None, use the data.
    """
    data = masked_array(data, copy=False)
    if points is None:
        points = data
    else:
        points = numpy.array(points, copy=False, ndmin=1)
    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")
    n = data.count()
    h = 1.2 * idealfourths(data) / n**(1./5)
    nhi = (data[:,None] <= points[None,:] + h).sum(0)
    nlo = (data[:,None] < points[None,:] - h).sum(0)
    return (nhi-nlo) / (2.*n*h)

################################################################################
if __name__ == '__main__':
    from numpy.ma.testutils import assert_almost_equal
    if 1:
        a = numpy.ma.arange(1, 101)
        a[1::2] = masked
        b = numpy.ma.resize(a, (100,100))
        assert_almost_equal(mquantiles(b), [25., 50., 75.])
        assert_almost_equal(mquantiles(b, axis=0), numpy.ma.resize(a, (3,100)))
        assert_almost_equal(mquantiles(b, axis=1),
                            numpy.ma.resize([24.9, 50., 75.1], (100,3)))
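
Finally, a sketch of idealfourths and rsh on random data (the evaluation grid
is an arbitrary choice):

    import numpy
    import numpy.ma as ma
    from numpy.ma.mstats import idealfourths, rsh

    d = ma.array(numpy.random.normal(size=100))
    iqr = idealfourths(d)                 # robust interquartile-range estimate
    grid = numpy.linspace(-3., 3., 21)
    dens = rsh(d, points=grid)            # density estimate at the grid points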