author     seberg <sebastian@sipsolutions.net>  2015-08-15 15:53:06 +0200
committer  seberg <sebastian@sipsolutions.net>  2015-08-15 15:53:06 +0200
commit     6e8b869d52ec5a1242df69bcd9323a4b0947933b (patch)
tree       a4296d6f98c220f38cf02b2ad376cf9ab270ce4d /numpy/lib
parent     c573b7170f4467d75e194ccca8a032a32fa1b5d0 (diff)
parent     388ee595330e2375a1e4c8187c17de7ea9fb2f6f (diff)
Merge pull request #6029 from nayyarv/master
ENH: Automatic number of bins for np.histogram
Diffstat (limited to 'numpy/lib')
-rw-r--r--  numpy/lib/function_base.py             171
-rw-r--r--  numpy/lib/tests/test_function_base.py   90
2 files changed, 260 insertions(+), 1 deletion(-)
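Before the diff itself, a minimal usage sketch of what this merge enables (illustrative only; the exact API is documented in the docstring changes below):

    import numpy as np

    data = [1, 2, 1, 3, 2, 2, 4]
    # previously: bins had to be an int or a sequence of edges
    hist, edges = np.histogram(data, bins=10)
    # new: a string selects an automatic bin-count estimator
    hist, edges = np.histogram(data, bins='auto')
    assert len(hist) == len(edges) - 1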
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 19040807b..820168663 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -27,6 +27,7 @@ from numpy.core.multiarray import _insert, add_docstring
from numpy.core.multiarray import digitize, bincount, interp as compiled_interp
from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
from numpy.compat import long
+from numpy.compat.py3k import basestring
# Force range to be a generator, for np.delete's usage.
if sys.version_info[0] < 3:
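A note on the new import: Python 3 has no built-in ``basestring``, so the string check added below relies on ``numpy.compat.py3k`` providing one. A rough sketch of the idea (an illustrative approximation of the shim, not its actual source):

    import sys

    if sys.version_info[0] >= 3:
        # Python 3: all text strings are str, so alias basestring to it
        basestring = str
    # on Python 2 the built-in basestring (the parent of str and
    # unicode) is used as-is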
@@ -75,6 +76,91 @@ def iterable(y):
return 1
+def _hist_optim_numbins_estimator(a, estimator):
+    """
+    A helper function called from ``histogram`` to estimate the optimal
+    number of bins.
+
+    estimator : str
+        If `estimator` is one of ['auto', 'fd', 'scott', 'rice',
+        'sturges'], this function will choose the appropriate estimator
+        and return its estimate for the optimal number of bins.
+    """
+    # private function, should not be called otherwise
+    assert isinstance(estimator, basestring)
+
+    if a.size == 0:
+        return 1
+
+    def sturges(x):
+        """
+        Sturges Estimator
+
+        A very simplistic estimator based on the assumption of
+        normality of the data. Poor performance for non-normal data,
+        which becomes especially obvious for large datasets. Depends
+        only on the size of the data.
+        """
+        return np.ceil(np.log2(x.size)) + 1
+
+    def rice(x):
+        """
+        Rice Estimator
+
+        Another simple estimator, with no normality assumption. It has
+        better performance for large data, but tends to overestimate
+        the number of bins. The number of bins is proportional to the
+        cube root of the data size (asymptotically optimal). Depends
+        only on the size of the data.
+        """
+        return np.ceil(2 * x.size ** (1.0 / 3))
+
+    def scott(x):
+        """
+        Scott Estimator
+
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of the data
+        size (asymptotically optimal).
+        """
+        h = 3.5 * x.std() * x.size ** (-1.0 / 3)
+        if h > 0:
+            return np.ceil(x.ptp() / h)
+        return 1
+
+    def fd(x):
+        """
+        Freedman-Diaconis rule, using the interquartile range (IQR)
+        for the binwidth.
+
+        Considered a variation of the Scott rule that is more robust,
+        as the IQR is less affected by outliers than the standard
+        deviation. However, the IQR depends on fewer points than the
+        standard deviation, so it is less accurate, especially for
+        long-tailed distributions.
+
+        The binwidth is inversely proportional to the cube root of the
+        data size (asymptotically optimal). If the IQR is 0, we return
+        1 for the number of bins.
+        """
+        iqr = np.subtract(*np.percentile(x, [75, 25]))
+
+        if iqr > 0:
+            h = 2 * iqr * x.size ** (-1.0 / 3)
+            return np.ceil(x.ptp() / h)
+
+        # if the IQR is 0, default to 1 bin
+        return 1
+
+    def auto(x):
+        """
+        The FD estimator is usually the most robust method, but the
+        number of bins it suggests tends to be too small for small
+        datasets. The Sturges estimator is quite good for small
+        (< 1000) datasets and is the default in R. This method gives
+        good off-the-shelf behaviour.
+        """
+        return max(fd(x), sturges(x))
+
+    optimal_numbins_methods = {'sturges': sturges, 'rice': rice,
+                               'scott': scott, 'fd': fd, 'auto': auto}
+    try:
+        estimator_func = optimal_numbins_methods[estimator.lower()]
+    except KeyError:
+        raise ValueError(
+            "{0} is not a valid method for `bins`".format(estimator))
+    else:
+        # these methods return floats, np.histogram requires an int
+        return int(estimator_func(a))
+
+
def histogram(a, bins=10, range=None, normed=False, weights=None,
density=None):
"""
@@ -84,11 +170,37 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
----------
a : array_like
Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars, optional
+ bins : int or sequence of scalars or str, optional
If `bins` is an int, it defines the number of equal-width
bins in the given range (10, by default). If `bins` is a sequence,
it defines the bin edges, including the rightmost edge, allowing
for non-uniform bin widths.
+
+ .. versionadded:: 1.11.0
+
+ If `bins` is a string from the list below, `histogram` will use the method
+ chosen to calculate the optimal number of bins (see Notes for more detail
+ on the estimators). For visualisation, we suggest using the 'auto' option.
+
+        'auto'
+            Maximum of the 'sturges' and 'fd' estimators. Provides good
+            all-around performance.
+
+        'fd' (Freedman Diaconis Estimator)
+            Robust (resilient to outliers) estimator that takes into
+            account data variability and data size.
+
+        'scott'
+            Less robust estimator that takes into account data
+            variability and data size.
+
+        'rice'
+            Estimator does not take variability into account, only data
+            size. Commonly overestimates the number of bins required.
+
+        'sturges'
+            R's default method, which only accounts for data size. Only
+            optimal for Gaussian data, and underestimates the number of
+            bins for large non-Gaussian datasets.
+
range : (float, float), optional
The lower and upper range of the bins. If not provided, range
is simply ``(a.min(), a.max())``. Values outside the range are
@@ -140,6 +252,48 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
4.
+ .. versionadded:: 1.11.0
+
+    The methods to estimate the optimal number of bins are well founded
+    in the literature, and are inspired by the choices R provides for
+    histogram visualisation. Note that having the number of bins
+    proportional to :math:`n^{1/3}` is asymptotically optimal, which is
+    why it appears in most estimators. These are simply plug-in methods
+    that give good starting points for the number of bins. In the
+    equations below, :math:`h` is the binwidth and :math:`n_h` is the
+    number of bins.
+
+    'Auto' (maximum of the 'Sturges' and 'FD' estimators)
+        A compromise to get a good value. For small datasets the
+        Sturges value will usually be chosen, while larger datasets
+        will usually default to FD. Avoids the overly conservative
+        behaviour of FD and Sturges for small and large datasets
+        respectively. The switchover point is usually ``x.size`` of
+        around 1000.
+
+    'FD' (Freedman Diaconis Estimator)
+        .. math:: h = 2 \\frac{IQR}{n^{1/3}}
+
+        The binwidth is proportional to the interquartile range (IQR)
+        and inversely proportional to the cube root of ``a.size``. Can
+        be too conservative for small datasets, but is quite good for
+        large datasets. The IQR is very robust to outliers.
+
+    'Scott'
+        .. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
+
+        The binwidth is proportional to the standard deviation of the
+        data and inversely proportional to the cube root of ``a.size``.
+        Can be too conservative for small datasets, but is quite good
+        for large datasets. The standard deviation is not very robust
+        to outliers. Values are very similar to the Freedman Diaconis
+        Estimator in the absence of outliers.
+
+    'Rice'
+        .. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
+
+        The number of bins is only proportional to the cube root of
+        ``a.size``. It tends to overestimate the number of bins and it
+        does not take into account data variability.
+
+    'Sturges'
+        .. math:: n_h = \\left\\lceil \\log_{2}n \\right\\rceil + 1
+
+        The number of bins is the base-2 log of ``a.size``, plus one.
+        This estimator assumes normality of the data and is too
+        conservative for larger, non-normal datasets. This is the
+        default method in R's ``hist``.
+
Examples
--------
>>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
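The Notes above claim the IQR is robust to outliers while the standard deviation is not. A short sketch (not part of the patch) makes the difference visible, using the same dataset as the new ``test_outlier`` test below, which pins the results at 21 and 5 bins respectively:

    import numpy as np

    # 50 points of interest plus 5 far-away outliers
    xcenter = np.linspace(-10, 10, 50)
    data = np.hstack((np.linspace(-110, -100, 5), xcenter))

    hist_fd, edges_fd = np.histogram(data, bins='fd')           # 21 bins
    hist_sc, edges_sc = np.histogram(data, bins='scott')        # 5 bins
    # the IQR barely moves with the outliers, so 'fd' keeps a small
    # binwidth; the sd is inflated by them, so 'scott' merges the
    # central data into a few wide bins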
@@ -158,6 +312,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
>>> np.sum(hist*np.diff(bin_edges))
1.0
+ .. versionadded:: 1.11.0
+
+    Automated bin selection methods example, using two-peak random
+    data with 2000 points:
+
+    >>> import matplotlib.pyplot as plt
+    >>> rng = np.random.RandomState(10)  # deterministic random data
+    >>> a = np.hstack((rng.normal(size=1000),
+    ...                rng.normal(loc=5, scale=2, size=1000)))
+    >>> plt.hist(a, bins='auto')  # plt.hist passes its arguments to np.histogram
+    >>> plt.title("Histogram with 'auto' bins")
+    >>> plt.show()
"""
a = asarray(a)
@@ -175,6 +339,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
raise AttributeError(
'max must be larger than min in range parameter.')
+    if isinstance(bins, basestring):
+        # if `bins` is a string naming an automatic method, replace it
+        # with the calculated number of bins
+        bins = _hist_optim_numbins_estimator(a, bins)
+
# Histogram is an integer or a float array depending on the weights.
if weights is None:
ntype = np.dtype(np.intp)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index b29012bcb..b127f65f6 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1234,6 +1234,96 @@ class TestHistogram(TestCase):
assert_array_equal(b, np.array([0, 1]))
+class TestHistogramOptimBinNums(TestCase):
+    """
+    Provide test coverage for the estimators for the optimal number
+    of bins.
+    """
+
+ def test_empty(self):
+ estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+ # check it can deal with empty data
+ for estimator in estimator_list:
+ a, b = histogram([], bins=estimator)
+ assert_array_equal(a, np.array([0]))
+ assert_array_equal(b, np.array([0, 1]))
+
+    def test_simple(self):
+        """
+        Straightforward testing with a mixture of linspace data (for
+        consistency). All test values have been precomputed and the
+        values shouldn't change.
+        """
+        # some basic sanity checking, with some fixed data;
+        # checking for the correct number of bins
+        basic_test = {50:   {'fd': 4,  'scott': 4,  'rice': 8,
+                             'sturges': 7,  'auto': 7},
+                      500:  {'fd': 8,  'scott': 8,  'rice': 16,
+                             'sturges': 10, 'auto': 10},
+                      5000: {'fd': 17, 'scott': 17, 'rice': 35,
+                             'sturges': 14, 'auto': 17}}
+
+        for testlen, expectedResults in basic_test.items():
+            # create some sort of non-uniform data to test with
+            # (2 peak uniform mixture); integer division keeps the
+            # linspace sample counts ints on Python 3
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x = np.hstack((x1, x2))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator)
+                assert_equal(len(a), numbins,
+                             err_msg="For the {0} estimator with "
+                                     "datasize of {1}".format(estimator, testlen))
+
+    def test_small(self):
+        """
+        Smaller datasets have the potential to cause issues with the
+        data-adaptive methods, especially the FD method. All bin
+        numbers have been precalculated.
+        """
+        small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1},
+                     2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2},
+                     3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3}}
+
+        for testlen, expectedResults in small_dat.items():
+            testdat = np.arange(testlen)
+            for estimator, expbins in expectedResults.items():
+                a, b = np.histogram(testdat, estimator)
+                assert_equal(len(a), expbins,
+                             err_msg="For the {0} estimator with "
+                                     "datasize of {1}".format(estimator, testlen))
+
+    def test_incorrect_methods(self):
+        """
+        Check that a ValueError is raised when an unknown string is
+        passed in.
+        """
+        check_list = ['mad', 'freeman', 'histograms', 'IQR']
+        for estimator in check_list:
+            assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+    def test_novariance(self):
+        """
+        Check that methods handle data with no variance. Primarily for
+        Scott and FD, as the standard deviation and IQR are both 0 in
+        this case.
+        """
+        novar_dataset = np.ones(100)
+        novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10,
+                            'sturges': 8, 'auto': 8}
+
+        for estimator, numbins in novar_resultdict.items():
+            a, b = np.histogram(novar_dataset, estimator)
+            assert_equal(len(a), numbins,
+                         err_msg="{0} estimator, no-variance test".format(estimator))
+
+    def test_outlier(self):
+        """
+        Check the FD and Scott estimators with outliers.
+
+        The FD estimator determines a smaller binwidth, since it is
+        less affected by outliers. Because the range is so
+        (artificially) large, this means more bins, most of which will
+        be empty, but the data of interest is usually unaffected. The
+        Scott estimator is more affected and returns fewer bins,
+        despite most of the variance being in one area of the data.
+        """
+        xcenter = np.linspace(-10, 10, 50)
+        outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+        outlier_resultdict = {'fd': 21, 'scott': 5}
+
+        for estimator, numbins in outlier_resultdict.items():
+            a, b = np.histogram(outlier_dataset, estimator)
+            assert_equal(len(a), numbins)
+
+
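As a hand check of the precalculated values above, the ``n == 3`` row of ``small_dat`` can be reproduced directly from the formulas in the Notes (a sketch, not part of the test suite):

    import numpy as np

    n = 3
    x = np.arange(n)                     # testdat for the n == 3 row
    sturges = np.ceil(np.log2(n)) + 1    # ceil(1.585) + 1 = 3
    rice = np.ceil(2 * n ** (1.0 / 3))   # ceil(2.884) = 3
    # scott: h = 3.5 * 0.816 * 3**(-1/3) ~= 1.98, ceil(2 / 1.98) = 2
    # fd:    IQR = 1, h = 2 * 1 * 3**(-1/3) ~= 1.39, ceil(2 / 1.39) = 2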
class TestHistogramdd(TestCase):
def test_simple(self):