Diffstat (limited to 'numpy/ma/morestats.py')
-rw-r--r-- | numpy/ma/morestats.py | 406
1 file changed, 406 insertions, 0 deletions
diff --git a/numpy/ma/morestats.py b/numpy/ma/morestats.py
new file mode 100644
index 000000000..e7085c240
--- /dev/null
+++ b/numpy/ma/morestats.py
@@ -0,0 +1,406 @@
+"""
+Generic statistics functions, with support for masked arrays (MA).
+
+:author: Pierre GF Gerard-Marchant
+:contact: pierregm_at_uga_edu
+:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
+:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
+"""
+__author__ = "Pierre GF Gerard-Marchant ($Author: jarrod.millman $)"
+__version__ = '1.0'
+__revision__ = "$Revision: 3473 $"
+__date__ = '$Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $'
+
+
+import numpy
+from numpy import bool_, float_, int_, ndarray, \
+    sqrt, \
+    arange, empty, \
+    r_
+from numpy import array as narray
+import numpy.core.numeric as numeric
+from numpy.core.numeric import concatenate
+
+import numpy.ma as MA
+from numpy.ma.core import masked, nomask, MaskedArray, masked_array
+from numpy.ma.extras import apply_along_axis, dot
+from numpy.ma.mstats import trim_both, trimmed_stde, mquantiles, mmedian, stde_median
+
+from scipy.stats.distributions import norm, beta, t, binom
+from scipy.stats.morestats import find_repeats
+
+__all__ = ['hdquantiles', 'hdmedian', 'hdquantiles_sd',
+           'trimmed_mean_ci', 'mjci', 'mquantiles_cimj',
+           'median_cihs', 'compare_medians_ms', 'rank_data']
+
+
+#####--------------------------------------------------------------------------
+#---- --- Quantiles ---
+#####--------------------------------------------------------------------------
+def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False):
+    """Computes quantile estimates with the Harrell-Davis method, where the
+estimates are calculated as a weighted linear combination of order statistics.
+
+*Parameters* :
+    data : {ndarray}
+        Data array.
+    prob : {sequence}
+        Sequence of quantiles to compute.
+    axis : {integer}
+        Axis along which to compute the quantiles. If None, use a flattened array.
+    var : {boolean}
+        Whether to return the variance of the estimate.
+
+*Returns* :
+    A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of quantiles
+    and variances (if ``var`` is True), where ``p`` is the number of quantiles.
+
+:Note:
+    The function is restricted to 2D arrays.
+    """
+    def _hd_1D(data, prob, var):
+        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
+        xsorted = numpy.squeeze(numpy.sort(data.compressed().view(ndarray)))
+        # Don't use length here, in case we have a numpy scalar
+        n = xsorted.size
+        #.........
+        hd = empty((2, len(prob)), float_)
+        if n < 2:
+            hd.flat = numpy.nan
+            if var:
+                return hd
+            return hd[0]
+        #.........
+        # The weights are increments of the Beta((n+1)p, (n+1)(1-p)) cdf
+        # evaluated on the regular grid i/n, i = 0..n.
+        v = arange(n+1) / float(n)
+        betacdf = beta.cdf
+        for (i, p) in enumerate(prob):
+            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
+            w = _w[1:] - _w[:-1]
+            hd_mean = dot(w, xsorted)
+            hd[0, i] = hd_mean
+            #
+            hd[1, i] = dot(w, (xsorted - hd_mean)**2)
+            #
+        hd[0, prob == 0] = xsorted[0]
+        hd[0, prob == 1] = xsorted[-1]
+        if var:
+            hd[1, prob == 0] = hd[1, prob == 1] = numpy.nan
+            return hd
+        return hd[0]
+    # Initialization & checks ---------
+    data = masked_array(data, copy=False, dtype=float_)
+    p = numpy.array(prob, copy=False, ndmin=1)
+    # Computes quantiles along axis (or globally)
+    if (axis is None) or (data.ndim == 1):
+        result = _hd_1D(data, p, var)
+    else:
+        assert data.ndim <= 2, "Array should be 2D at most!"
+        result = apply_along_axis(_hd_1D, axis, data, p, var)
+    #
+    return masked_array(result, mask=numpy.isnan(result))
+
+#..............................................................................
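A quick usage sketch (an editor's illustration, not part of the patch): it assumes the module is importable as numpy.ma.morestats once this file is added and that SciPy is installed; the sample data is made up.

    import numpy.ma as MA
    from numpy.ma.morestats import hdquantiles

    x = MA.masked_values([1., 2., 3., 4., 5., 6., 7., 8., 9., -999.], -999.)
    # Quartile estimates of the unmasked values; the masked entry is ignored.
    q = hdquantiles(x, prob=[0.25, 0.5, 0.75])
    # With var=True the result is a (2, p) array: estimates and their variances.
    q_var = hdquantiles(x, prob=[0.25, 0.5, 0.75], var=True)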
+def hdmedian(data, axis=-1, var=False):
+    """Returns the Harrell-Davis estimate of the median along the given axis.
+
+*Parameters* :
+    data : {ndarray}
+        Data array.
+    axis : {integer}
+        Axis along which to compute the median. If None, use a flattened array.
+    var : {boolean}
+        Whether to return the variance of the estimate.
+    """
+    result = hdquantiles(data, [0.5], axis=axis, var=var)
+    return result.squeeze()
+
+
+#..............................................................................
+def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
+    """Computes the standard error of the Harrell-Davis quantile estimates by jackknife.
+
+*Parameters* :
+    data : {ndarray}
+        Data array.
+    prob : {sequence}
+        Sequence of quantiles to compute.
+    axis : {integer}
+        Axis along which to compute the quantiles. If None, use a flattened array.
+
+*Note* :
+    The function is restricted to 2D arrays.
+    """
+    def _hdsd_1D(data, prob):
+        "Computes the std error for 1D arrays."
+        xsorted = numpy.sort(data.compressed())
+        n = len(xsorted)
+        #.........
+        hdsd = empty(len(prob), float_)
+        if n < 2:
+            hdsd.flat = numpy.nan
+            return hdsd
+        #.........
+        vv = arange(n) / float(n-1)
+        betacdf = beta.cdf
+        #
+        for (i, p) in enumerate(prob):
+            _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
+            w = _w[1:] - _w[:-1]
+            # Leave-one-out Harrell-Davis estimates, for the jackknife variance.
+            mx_ = numpy.fromiter([dot(w, xsorted[r_[range(0,k),
+                                                    range(k+1,n)].astype(int_)])
+                                  for k in range(n)], dtype=float_)
+            mx_var = numpy.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
+            hdsd[i] = float(n-1) * sqrt(numpy.diag(mx_var).diagonal() / float(n))
+        return hdsd
+    # Initialization & checks ---------
+    data = masked_array(data, copy=False, dtype=float_)
+    p = numpy.array(prob, copy=False, ndmin=1)
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        result = _hdsd_1D(data, p)
+    else:
+        assert data.ndim <= 2, "Array should be 2D at most!"
+        result = apply_along_axis(_hdsd_1D, axis, data, p)
+    #
+    return masked_array(result, mask=numpy.isnan(result)).ravel()
+
+
+#####--------------------------------------------------------------------------
+#---- --- Confidence intervals ---
+#####--------------------------------------------------------------------------
+
+def trimmed_mean_ci(data, proportiontocut=0.2, alpha=0.05, axis=None):
+    """Returns the selected confidence interval of the trimmed mean along the
+given axis.
+
+*Parameters* :
+    data : {sequence}
+        Input data. The data is transformed to a masked array.
+    proportiontocut : {float}
+        Proportion of the data to cut from each side of the data.
+        As a result, (2*proportiontocut*n) values are actually trimmed.
+    alpha : {float}
+        Significance level of the intervals (the confidence level is 1-alpha).
+    axis : {integer}
+        Axis along which to cut. If None, uses a flattened version of the input.
+    """
+    data = masked_array(data, copy=False)
+    trimmed = trim_both(data, proportiontocut=proportiontocut, axis=axis)
+    tmean = trimmed.mean(axis)
+    tstde = trimmed_stde(data, proportiontocut=proportiontocut, axis=axis)
+    df = trimmed.count(axis) - 1
+    tppf = t.ppf(1 - alpha/2., df)
+    return numpy.array((tmean - tppf*tstde, tmean + tppf*tstde))
+
+#..............................................................................
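A sketch of the confidence-interval helpers above (editor's illustration, not part of the patch; the random sample and variable names are made up, and SciPy is assumed to be installed):

    import numpy
    from numpy.ma.morestats import trimmed_mean_ci, hdquantiles_sd

    sample = numpy.random.normal(loc=10., scale=2., size=50)
    # 95% confidence interval for the 20%-trimmed mean.
    lo, hi = trimmed_mean_ci(sample, proportiontocut=0.2, alpha=0.05)
    # Jackknife standard errors of the Harrell-Davis quartile estimates.
    se = hdquantiles_sd(sample, prob=[0.25, 0.5, 0.75])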
+def mjci(data, prob=[0.25,0.5,0.75], axis=None):
+    """Returns the Maritz-Jarrett estimators of the standard error of selected
+experimental quantiles of the data.
+
+*Parameters* :
+    data : {ndarray}
+        Data array.
+    prob : {sequence}
+        Sequence of quantiles to compute.
+    axis : {integer}
+        Axis along which to compute the quantiles. If None, use a flattened array.
+    """
+    def _mjci_1D(data, p):
+        data = data.compressed()
+        data_sorted = numpy.sort(data)
+        n = data.size
+        prob = (numpy.array(p) * n + 0.5).astype(int_)
+        betacdf = beta.cdf
+        #
+        mj = empty(len(prob), float_)
+        x = arange(1, n+1, dtype=float_) / n
+        y = x - 1./n
+        for (i, m) in enumerate(prob):
+            (m1, m2) = (m-1, n-m)
+            # Maritz-Jarrett weights: increments of the Beta(m-1, n-m) cdf.
+            W = betacdf(x, m1, m2) - betacdf(y, m1, m2)
+            C1 = numpy.dot(W, data_sorted)
+            C2 = numpy.dot(W, data_sorted**2)
+            mj[i] = sqrt(C2 - C1**2)
+        return mj
+    #
+    data = masked_array(data, copy=False)
+    assert data.ndim <= 2, "Array should be 2D at most!"
+    p = numpy.array(prob, copy=False, ndmin=1)
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        return _mjci_1D(data, p)
+    else:
+        return apply_along_axis(_mjci_1D, axis, data, p)
+
+#..............................................................................
+def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
+    """Computes the (1-alpha) confidence interval for the selected quantiles of the
+data, with Maritz-Jarrett estimators of the standard error.
+
+*Parameters* :
+    data : {ndarray}
+        Data array.
+    prob : {sequence}
+        Sequence of quantiles to compute.
+    alpha : {float}
+        Significance level of the intervals (the confidence level is 1-alpha).
+    axis : {integer}
+        Axis along which to compute the quantiles. If None, use a flattened array.
+    """
+    alpha = min(alpha, 1-alpha)
+    z = norm.ppf(1 - alpha/2.)
+    xq = mquantiles(data, prob, alphap=0, betap=0, axis=axis)
+    smj = mjci(data, prob, axis=axis)
+    return (xq - z * smj, xq + z * smj)
+
+
+#.............................................................................
+def median_cihs(data, alpha=0.05, axis=None):
+    """Computes the (1-alpha) confidence interval for the median of the data,
+following the Hettmansperger-Sheather method.
+
+*Parameters* :
+    data : {sequence}
+        Input data. Masked values are discarded. The input should be 1D only, or
+        axis should be set to None.
+    alpha : {float}
+        Significance level of the intervals (the confidence level is 1-alpha).
+    axis : {integer}
+        Axis along which to compute the quantiles. If None, use a flattened array.
+    """
+    def _cihs_1D(data, alpha):
+        data = numpy.sort(data.compressed())
+        n = len(data)
+        alpha = min(alpha, 1-alpha)
+        # k is chosen from the binomial(n, 1/2) distribution so that the order
+        # statistics bracket the median with coverage close to 1-alpha; the
+        # bounds are then interpolated between adjacent order statistics.
+        k = int(binom._ppf(alpha/2., n, 0.5))
+        gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
+        if gk < 1-alpha:
+            k -= 1
+            gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
+        gkk = binom.cdf(n-k-1, n, 0.5) - binom.cdf(k, n, 0.5)
+        I = (gk - 1 + alpha) / (gk - gkk)
+        lambd = (n-k) * I / float(k + (n-2*k)*I)
+        lims = (lambd*data[k] + (1-lambd)*data[k-1],
+                lambd*data[n-k-1] + (1-lambd)*data[n-k])
+        return lims
+    data = masked_array(data, copy=False)
+    # Computes quantiles along axis (or globally)
+    if (axis is None):
+        result = _cihs_1D(data, alpha)
+    else:
+        assert data.ndim <= 2, "Array should be 2D at most!"
+        result = apply_along_axis(_cihs_1D, axis, data, alpha)
+    #
+    return result
+
+#..............................................................................
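Another small sketch (editor's illustration, not part of the patch; the data and variable names are made up):

    import numpy
    from numpy.ma.morestats import mquantiles_cimj, median_cihs

    x = numpy.random.uniform(size=100)
    # Maritz-Jarrett confidence bands around the three quartiles.
    low, high = mquantiles_cimj(x, prob=[0.25, 0.5, 0.75], alpha=0.05)
    # Hettmansperger-Sheather interval for the median.
    med_lo, med_hi = median_cihs(x, alpha=0.05)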
+ + """ + (med_1, med_2) = (mmedian(group_1, axis=axis), mmedian(group_2, axis=axis)) + (std_1, std_2) = (stde_median(group_1, axis=axis), + stde_median(group_2, axis=axis)) + W = abs(med_1 - med_2) / sqrt(std_1**2 + std_2**2) + return 1 - norm.cdf(W) + + +#####-------------------------------------------------------------------------- +#---- --- Ranking --- +#####-------------------------------------------------------------------------- + +#.............................................................................. +def rank_data(data, axis=None, use_missing=False): + """Returns the rank (also known as order statistics) of each data point +along the given axis. + +If some values are tied, their rank is averaged. +If some values are masked, their rank is set to 0 if use_missing is False, or +set to the average rank of the unmasked values if use_missing is True. + +*Parameters* : + data : {sequence} + Input data. The data is transformed to a masked array + axis : {integer} + Axis along which to perform the ranking. If None, the array is first + flattened. An exception is raised if the axis is specified for arrays + with a dimension larger than 2 + use_missing : {boolean} + Whether the masked values have a rank of 0 (False) or equal to the + average rank of the unmasked values (True). + """ + # + def _rank1d(data, use_missing=False): + n = data.count() + rk = numpy.empty(data.size, dtype=float_) + idx = data.argsort() + rk[idx[:n]] = numpy.arange(1,n+1) + # + if use_missing: + rk[idx[n:]] = (n+1)/2. + else: + rk[idx[n:]] = 0 + # + repeats = find_repeats(data) + for r in repeats[0]: + condition = (data==r).filled(False) + rk[condition] = rk[condition].mean() + return rk + # + data = masked_array(data, copy=False) + if axis is None: + if data.ndim > 1: + return _rank1d(data.ravel(), use_missing).reshape(data.shape) + else: + return _rank1d(data, use_missing) + else: + return apply_along_axis(_rank1d, axis, data, use_missing) + +############################################################################### +if __name__ == '__main__': + + if 0: + from numpy.ma.testutils import assert_almost_equal + data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014, + 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972, + 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547, + 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496, + 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248, + 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111, + 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491, + 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942, + 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349, + 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616, + 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221, + 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460, + 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621, + 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803, + 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705, + 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859, + 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286, + 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276, + 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386, + 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772] + # + assert_almost_equal(hdquantiles(data,[0., 1.]), + [0.006514031, 0.995309248]) + hdq = hdquantiles(data,[0.25, 0.5, 0.75]) + assert_almost_equal(hdq, 
+###############################################################################
+if __name__ == '__main__':
+
+    if 0:
+        from numpy.ma.testutils import assert_almost_equal
+        data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
+                0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
+                0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
+                0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
+                0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
+                0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
+                0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
+                0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
+                0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
+                0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
+                0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
+                0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
+                0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
+                0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
+                0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
+                0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
+                0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
+                0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
+                0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
+                0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
+        #
+        assert_almost_equal(hdquantiles(data, [0., 1.]),
+                            [0.006514031, 0.995309248])
+        hdq = hdquantiles(data, [0.25, 0.5, 0.75])
+        assert_almost_equal(hdq,
+                            [0.253210762, 0.512847491, 0.762232442,])
+        hdq = hdquantiles_sd(data, [0.25, 0.5, 0.75])
+        assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
+        #
+        data = numpy.array(data).reshape(10, 10)
+        hdq = hdquantiles(data, [0.25, 0.5, 0.75], axis=0)
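Finally, a brief sketch of the axis argument on a 2D array, echoing the check at the end of the __main__ block above (editor's illustration, not part of the patch; the random data is made up):

    import numpy
    from numpy.ma.morestats import hdquantiles, hdmedian

    data = numpy.random.uniform(size=(10, 10))
    # A (3, 10) array of column-wise quartile estimates ...
    col_quartiles = hdquantiles(data, [0.25, 0.5, 0.75], axis=0)
    # ... and the Harrell-Davis median of each row.
    row_medians = hdmedian(data, axis=-1)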