diff options
author | Varun Nayyar <nayyarv@gmail.com> | 2015-08-15 03:14:24 +1000 |
---|---|---|
committer | Varun Nayyar <nayyarv@gmail.com> | 2015-08-15 03:34:09 +1000 |
commit | 388ee595330e2375a1e4c8187c17de7ea9fb2f6f (patch) | |
tree | c6146ac3bb049793429014942173ae09694fcfdb /numpy/lib/function_base.py | |
parent | ed9a0f347a4cb273edf0b8b5dcc98de57b8158cf (diff) | |
download | numpy-388ee595330e2375a1e4c8187c17de7ea9fb2f6f.tar.gz |
ENH: Adding in automatic number of bins estimation for np.histogram. Users can now pass in bins='auto' (or 'scott','fd','rice','sturges') and have the corresponding rule-of-thumb estimator provide a decent estimate of the optimal number of bins for the given data.
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r-- | numpy/lib/function_base.py | 171 |
1 files changed, 170 insertions, 1 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 19040807b..820168663 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -27,6 +27,7 @@ from numpy.core.multiarray import _insert, add_docstring from numpy.core.multiarray import digitize, bincount, interp as compiled_interp from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc from numpy.compat import long +from numpy.compat.py3k import basestring # Force range to be a generator, for np.delete's usage. if sys.version_info[0] < 3: @@ -75,6 +76,91 @@ def iterable(y): return 1 +def _hist_optim_numbins_estimator(a, estimator): + """ + A helper function to be called from histogram to deal with estimating optimal number of bins + + estimator: str + If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'] this function + will choose the appropriate estimator and return it's estimate for the optimal + number of bins. + """ + assert isinstance(estimator, basestring) + # private function should not be called otherwise + + if a.size == 0: + return 1 + + def sturges(x): + """ + Sturges Estimator + A very simplistic estimator based on the assumption of normality of the data + Poor performance for non-normal data, especially obvious for large X. + Depends only on size of the data. + """ + return np.ceil(np.log2(x.size)) + 1 + + def rice(x): + """ + Rice Estimator + Another simple estimator, with no normality assumption. + It has better performance for large data, but tends to overestimate number of bins. 
+ The number of bins is proportional to the cube root of data size (asymptotically optimal) + Depends only on size of the data + """ + return np.ceil(2 * x.size ** (1.0 / 3)) + + def scott(x): + """ + Scott Estimator + The binwidth is proportional to the standard deviation of the data and + inversely proportional to the cube root of data size (asymptotically optimal) + + """ + h = 3.5 * x.std() * x.size ** (-1.0 / 3) + if h > 0: + return np.ceil(x.ptp() / h) + return 1 + + def fd(x): + """ + Freedman Diaconis rule using Inter Quartile Range (IQR) for binwidth + Considered a variation of the Scott rule with more robustness as the IQR + is less affected by outliers than the standard deviation. However the IQR depends on + fewer points than the sd so it is less accurate, especially for long tailed distributions. + + If the IQR is 0, we return 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size (asymptotically optimal) + """ + iqr = np.subtract(*np.percentile(x, [75, 25])) + + if iqr > 0: + h = (2 * iqr * x.size ** (-1.0 / 3)) + return np.ceil(x.ptp() / h) + + # If iqr is 0, default number of bins is 1 + return 1 + + def auto(x): + """ + The FD estimator is usually the most robust method, but it tends to be too small + for small X. The Sturges estimator is quite good for small (<1000) datasets and is + the default in R. + This method gives good off the shelf behaviour. 
+ """ + return max(fd(x), sturges(x)) + + optimal_numbins_methods = {'sturges': sturges, 'rice': rice, 'scott': scott, + 'fd': fd, 'auto': auto} + try: + estimator_func = optimal_numbins_methods[estimator.lower()] + except KeyError: + raise ValueError("{0} not a valid method for `bins`".format(estimator)) + else: + # these methods return floats, np.histogram requires an int + return int(estimator_func(a)) + + def histogram(a, bins=10, range=None, normed=False, weights=None, density=None): """ @@ -84,11 +170,37 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, ---------- a : array_like Input data. The histogram is computed over the flattened array. - bins : int or sequence of scalars, optional + bins : int or sequence of scalars or str, optional If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` is a sequence, it defines the bin edges, including the rightmost edge, allowing for non-uniform bin widths. + + .. versionadded:: 1.11.0 + + If `bins` is a string from the list below, `histogram` will use the method + chosen to calculate the optimal number of bins (see Notes for more detail + on the estimators). For visualisation, we suggest using the 'auto' option. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good all round performance + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into account data + variability and data size . + + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'rice' + Estimator does not take variability into account, only data size. + Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only optimal for + gaussian data and underestimates number of bins for large non-gaussian datasets. + range : (float, float), optional The lower and upper range of the bins. 
If not provided, range is simply ``(a.min(), a.max())``. Values outside the range are @@ -140,6 +252,48 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes* 4. + .. versionadded:: 1.11.0 + + The methods to estimate the optimal number of bins are well found in literature, + and are inspired by the choices R provides for histogram visualisation. + Note that having the number of bins proportional to :math:`n^{1/3}` is asymptotically optimal, + which is why it appears in most estimators. + These are simply plug-in methods that give good starting points for number of bins. + In the equations below, :math:`h` is the binwidth and :math:`n_h` is the number of bins + + 'Auto' (maximum of the 'Sturges' and 'FD' estimators) + A compromise to get a good value. For small datasets the sturges + value will usually be chosen, while larger datasets will usually default to FD. + Avoids the overly conservative behaviour of FD and Sturges for small and + large datasets respectively. Switchover point is usually x.size~1000. + + 'FD' (Freedman Diaconis Estimator) + .. math:: h = 2 \\frac{IQR}{n^{-1/3}} + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good + for large datasets. The IQR is very robust to outliers. + + 'Scott' + .. math:: h = \\frac{3.5\\sigma}{n^{-1/3}} + The binwidth is proportional to the standard deviation (sd) of the data + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good + for large datasets. The sd is not very robust to outliers. Values + are very similar to the Freedman Diaconis Estimator in the absence of outliers. + + 'Rice' + .. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil + The number of bins is only proportional to cube root of a.size. 
+ It tends to overestimate the number of bins + and it does not take into account data variability. + + 'Sturges' + .. math:: n_h = \\left\\lceil \\log _{2}n+1 \\right\\rceil + The number of bins is the base2 log of a.size. + This estimator assumes normality of data and is too conservative for larger, + non-normal datasets. This is the default method in R's `hist` method. + Examples -------- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) @@ -158,6 +312,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, >>> np.sum(hist*np.diff(bin_edges)) 1.0 + .. versionadded:: 1.11.0 + + Automated Bin Selection Methods example, using 2 peak random data with 2000 points + + >>> import matplotlib.pyplot as plt + >>> rng = np.random.RandomState(10) # deterministic random data + >>> a = np.hstack((rng.normal(size = 1000), rng.normal(loc = 5, scale = 2, size = 1000))) + >>> plt.hist(a, bins = 'auto') # plt.hist passes it's arguments to np.histogram + >>> plt.title("Histogram with 'auto' bins") + >>> plt.show() """ a = asarray(a) @@ -175,6 +339,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, raise AttributeError( 'max must be larger than min in range parameter.') + if isinstance(bins, basestring): + bins = _hist_optim_numbins_estimator(a, bins) + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + # Histogram is an integer or a float array depending on the weights. if weights is None: ntype = np.dtype(np.intp) |