Diffstat (limited to 'numpy')
 numpy/lib/histograms.py            | 76
 numpy/lib/tests/test_histograms.py | 44
 2 files changed, 97 insertions(+), 23 deletions(-)
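In short, the patch below changes every private bin-width selector in numpy/lib/histograms.py to accept the histogram range and adds a new 'stone' selector chosen by cross-validation. Assuming a NumPy release that includes this patch, the new estimator is picked by name through the public API; a minimal usage sketch with made-up data:

import numpy as np

# Any 1-D sample works; the data and seed here are illustrative only.
rng = np.random.RandomState(0)
x = rng.normal(loc=0, scale=2, size=1000)

# 'stone' joins the existing string selectors such as 'fd' and 'auto'.
counts, edges = np.histogram(x, bins='stone')
print(len(counts), 'bins spanning', edges[0], 'to', edges[-1])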
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 06b30f978..482eabe14 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -21,7 +21,7 @@ array_function_dispatch = functools.partial(
 _range = range
 
 
-def _hist_bin_sqrt(x):
+def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
 
@@ -38,10 +38,11 @@ def _hist_bin_sqrt(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / np.sqrt(x.size)
 
 
-def _hist_bin_sturges(x):
+def _hist_bin_sturges(x, range):
     """
     Sturges histogram bin estimator.
 
@@ -60,10 +61,11 @@ def _hist_bin_sturges(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (np.log2(x.size) + 1.0)
 
 
-def _hist_bin_rice(x):
+def _hist_bin_rice(x, range):
     """
     Rice histogram bin estimator.
 
@@ -83,10 +85,11 @@ def _hist_bin_rice(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (2.0 * x.size ** (1.0 / 3))
 
 
-def _hist_bin_scott(x):
+def _hist_bin_scott(x, range):
     """
     Scott histogram bin estimator.
 
@@ -104,10 +107,52 @@ def _hist_bin_scott(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
 
 
-def _hist_bin_doane(x):
+def _hist_bin_stone(x, range):
+    """
+    Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
+
+    The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
+    The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
+    https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
+
+    This paper by Stone appears to be the origination of this rule.
+    http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
+    range : (float, float)
+        The lower and upper range of the bins.
+
+    Returns
+    -------
+    h : An estimate of the optimal bin width for the given data.
+    """
+
+    n = x.size
+    ptp_x = np.ptp(x)
+    if n <= 1 or ptp_x == 0:
+        return 0
+
+    def jhat(nbins):
+        hh = ptp_x / nbins
+        p_k = np.histogram(x, bins=nbins, range=range)[0] / n
+        return (2 - (n + 1) * p_k.dot(p_k)) / hh
+
+    nbins_upper_bound = max(100, int(np.sqrt(n)))
+    nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
+    if nbins == nbins_upper_bound:
+        warnings.warn("The number of bins estimated may be suboptimal.", RuntimeWarning, stacklevel=2)
+    return ptp_x / nbins
+
+
+def _hist_bin_doane(x, range):
     """
     Doane's histogram bin estimator.
 
@@ -125,6 +170,7 @@ def _hist_bin_doane(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     if x.size > 2:
         sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
         sigma = np.std(x)
@@ -141,7 +187,7 @@ def _hist_bin_doane(x):
     return 0.0
 
 
-def _hist_bin_fd(x):
+def _hist_bin_fd(x, range):
     """
     The Freedman-Diaconis histogram bin estimator.
 
@@ -166,11 +212,12 @@ def _hist_bin_fd(x):
     -------
     h : An estimate of the optimal bin width for the given data.
""" + del range # unused iqr = np.subtract(*np.percentile(x, [75, 25])) return 2.0 * iqr * x.size ** (-1.0 / 3.0) -def _hist_bin_auto(x): +def _hist_bin_auto(x, range): """ Histogram bin estimator that uses the minimum width of the Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero @@ -204,8 +251,9 @@ def _hist_bin_auto(x): -------- _hist_bin_fd, _hist_bin_sturges """ - fd_bw = _hist_bin_fd(x) - sturges_bw = _hist_bin_sturges(x) + fd_bw = _hist_bin_fd(x, range) + sturges_bw = _hist_bin_sturges(x, range) + del range # unused if fd_bw: return min(fd_bw, sturges_bw) else: @@ -213,7 +261,8 @@ def _hist_bin_auto(x): return sturges_bw # Private dict initialized at module load time -_hist_bin_selectors = {'auto': _hist_bin_auto, +_hist_bin_selectors = {'stone': _hist_bin_stone, + 'auto': _hist_bin_auto, 'doane': _hist_bin_doane, 'fd': _hist_bin_fd, 'rice': _hist_bin_rice, @@ -348,7 +397,7 @@ def _get_bin_edges(a, bins, range, weights): n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = _hist_bin_selectors[bin_name](a) + width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge)) if width: n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width)) else: @@ -450,6 +499,11 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None): Less robust estimator that that takes into account data variability and data size. + 'stone' + Estimator based on leave-one-out cross-validation estimate of + the integrated squared error. Can be regarded as a generalization + of Scott's rule. + 'rice' Estimator does not take variability into account, only data size. Commonly overestimates number of bins required. diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py index 5b51763b2..49c0d9720 100644 --- a/numpy/lib/tests/test_histograms.py +++ b/numpy/lib/tests/test_histograms.py @@ -431,7 +431,7 @@ class TestHistogramOptimBinNums(object): def test_empty(self): estimator_list = ['fd', 'scott', 'rice', 'sturges', - 'doane', 'sqrt', 'auto'] + 'doane', 'sqrt', 'auto', 'stone'] # check it can deal with empty data for estimator in estimator_list: a, b = histogram([], bins=estimator) @@ -447,11 +447,11 @@ class TestHistogramOptimBinNums(object): # Some basic sanity checking, with some fixed data. # Checking for the correct number of bins basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, - 'doane': 8, 'sqrt': 8, 'auto': 7}, + 'doane': 8, 'sqrt': 8, 'auto': 7, 'stone': 2}, 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, - 'doane': 12, 'sqrt': 23, 'auto': 10}, + 'doane': 12, 'sqrt': 23, 'auto': 10, 'stone': 9}, 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, - 'doane': 17, 'sqrt': 71, 'auto': 17}} + 'doane': 17, 'sqrt': 71, 'auto': 17, 'stone': 20}} for testlen, expectedResults in basic_test.items(): # Create some sort of non uniform data to test with @@ -471,11 +471,11 @@ class TestHistogramOptimBinNums(object): precalculated. 
""" small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1}, + 'doane': 1, 'sqrt': 1, 'stone': 1}, 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, - 'doane': 1, 'sqrt': 2}, + 'doane': 1, 'sqrt': 2, 'stone': 1}, 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3, - 'doane': 3, 'sqrt': 2}} + 'doane': 3, 'sqrt': 2, 'stone': 1}} for testlen, expectedResults in small_dat.items(): testdat = np.arange(testlen) @@ -499,7 +499,7 @@ class TestHistogramOptimBinNums(object): """ novar_dataset = np.ones(100) novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1, 'auto': 1} + 'doane': 1, 'sqrt': 1, 'auto': 1, 'stone': 1} for estimator, numbins in novar_resultdict.items(): a, b = np.histogram(novar_dataset, estimator) @@ -538,12 +538,32 @@ class TestHistogramOptimBinNums(object): xcenter = np.linspace(-10, 10, 50) outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter)) - outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11} + outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11, 'stone': 6} for estimator, numbins in outlier_resultdict.items(): a, b = np.histogram(outlier_dataset, estimator) assert_equal(len(a), numbins) + def test_scott_vs_stone(self): + """Verify that Scott's rule and Stone's rule converges for normally distributed data""" + + def nbins_ratio(seed, size): + rng = np.random.RandomState(seed) + x = rng.normal(loc=0, scale=2, size=size) + a, b = len(np.histogram(x, 'stone')[0]), len(np.histogram(x, 'scott')[0]) + return a / (a + b) + + ll = [[nbins_ratio(seed, size) for size in np.geomspace(start=10, stop=100, num=4).round().astype(int)] + for seed in range(256)] + + # the average difference between the two methods decreases as the dataset size increases. + assert_almost_equal(abs(np.mean(ll, axis=0) - 0.5), + [0.1065248, + 0.0968844, + 0.0331818, + 0.0178057], + decimal=3) + def test_simple_range(self): """ Straightforward testing with a mixture of linspace data (for @@ -555,11 +575,11 @@ class TestHistogramOptimBinNums(object): # Checking for the correct number of bins basic_test = { 50: {'fd': 8, 'scott': 8, 'rice': 15, - 'sturges': 14, 'auto': 14}, + 'sturges': 14, 'auto': 14, 'stone': 8}, 500: {'fd': 15, 'scott': 16, 'rice': 32, - 'sturges': 20, 'auto': 20}, + 'sturges': 20, 'auto': 20, 'stone': 80}, 5000: {'fd': 33, 'scott': 33, 'rice': 69, - 'sturges': 27, 'auto': 33} + 'sturges': 27, 'auto': 33, 'stone': 80} } for testlen, expectedResults in basic_test.items(): |