1 files changed, 75 insertions, 16 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 1ff25b81f..7b229cc89 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -3,21 +3,25 @@ Histogram-related functions
 """
 from __future__ import division, absolute_import, print_function
 
+import functools
 import operator
 import warnings
 
 import numpy as np
 from numpy.compat.py3k import basestring
-from numpy.core.overrides import array_function_dispatch
+from numpy.core import overrides
 
 __all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
 
+array_function_dispatch = functools.partial(
+    overrides.array_function_dispatch, module='numpy')
+
 # range is a keyword argument to many functions, so save the builtin so they can
 # use it.
 _range = range
 
 
-def _hist_bin_sqrt(x):
+def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
 
@@ -34,10 +38,11 @@ def _hist_bin_sqrt(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / np.sqrt(x.size)
 
 
-def _hist_bin_sturges(x):
+def _hist_bin_sturges(x, range):
     """
     Sturges histogram bin estimator.
 
@@ -56,10 +61,11 @@ def _hist_bin_sturges(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (np.log2(x.size) + 1.0)
 
 
-def _hist_bin_rice(x):
+def _hist_bin_rice(x, range):
     """
     Rice histogram bin estimator.
 
@@ -79,10 +85,11 @@ def _hist_bin_rice(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return x.ptp() / (2.0 * x.size ** (1.0 / 3))
 
 
-def _hist_bin_scott(x):
+def _hist_bin_scott(x, range):
     """
     Scott histogram bin estimator.
 
@@ -100,10 +107,52 @@ def _hist_bin_scott(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
 
 
-def _hist_bin_doane(x):
+def _hist_bin_stone(x, range):
+    """
+    Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
+
+    The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
+    The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
+    https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
+
+    This paper by Stone appears to be the origination of this rule.
+    http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
+
+    Parameters
+    ----------
+    x : array_like
+        Input data that is to be histogrammed, trimmed to range. May not
+        be empty.
+    range : (float, float)
+        The lower and upper range of the bins.
+
+    Returns
+    -------
+    h : An estimate of the optimal bin width for the given data.
+    """
+
+    n = x.size
+    ptp_x = np.ptp(x)
+    if n <= 1 or ptp_x == 0:
+        return 0
+
+    def jhat(nbins):
+        hh = ptp_x / nbins
+        p_k = np.histogram(x, bins=nbins, range=range)[0] / n
+        return (2 - (n + 1) * p_k.dot(p_k)) / hh
+
+    nbins_upper_bound = max(100, int(np.sqrt(n)))
+    nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
+    if nbins == nbins_upper_bound:
+        warnings.warn("The number of bins estimated may be suboptimal.", RuntimeWarning, stacklevel=2)
+    return ptp_x / nbins
+
+
+def _hist_bin_doane(x, range):
     """
     Doane's histogram bin estimator.
 
@@ -121,6 +170,7 @@ def _hist_bin_doane(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     if x.size > 2:
         sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
         sigma = np.std(x)
@@ -137,7 +187,7 @@ def _hist_bin_doane(x):
     return 0.0
 
 
-def _hist_bin_fd(x):
+def _hist_bin_fd(x, range):
     """
     The Freedman-Diaconis histogram bin estimator.
 
@@ -162,11 +212,12 @@ def _hist_bin_fd(x):
     -------
     h : An estimate of the optimal bin width for the given data.
     """
+    del range  # unused
     iqr = np.subtract(*np.percentile(x, [75, 25]))
     return 2.0 * iqr * x.size ** (-1.0 / 3.0)
 
 
-def _hist_bin_auto(x):
+def _hist_bin_auto(x, range):
     """
     Histogram bin estimator that uses the minimum width of the
     Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
@@ -200,8 +251,9 @@ def _hist_bin_auto(x):
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    fd_bw = _hist_bin_fd(x)
-    sturges_bw = _hist_bin_sturges(x)
+    fd_bw = _hist_bin_fd(x, range)
+    sturges_bw = _hist_bin_sturges(x, range)
+    del range  # unused
     if fd_bw:
         return min(fd_bw, sturges_bw)
     else:
@@ -209,7 +261,8 @@ def _hist_bin_auto(x):
         return sturges_bw
 
 # Private dict initialized at module load time
-_hist_bin_selectors = {'auto': _hist_bin_auto,
+_hist_bin_selectors = {'stone': _hist_bin_stone,
+                       'auto': _hist_bin_auto,
                        'doane': _hist_bin_doane,
                        'fd': _hist_bin_fd,
                        'rice': _hist_bin_rice,
@@ -344,7 +397,7 @@ def _get_bin_edges(a, bins, range, weights):
             n_equal_bins = 1
         else:
             # Do not call selectors on empty arrays
-            width = _hist_bin_selectors[bin_name](a)
+            width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
             if width:
                 n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
             else:
@@ -446,6 +499,11 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
             Less robust estimator that that takes into account data
             variability and data size.
 
+        'stone'
+            Estimator based on leave-one-out cross-validation estimate of
+            the integrated squared error. Can be regarded as a generalization
+            of Scott's rule.
+
         'rice'
             Estimator does not take variability into account, only data
             size. Commonly overestimates number of bins required.
@@ -587,7 +645,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
 
     >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
     >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
-    >>> hist_0; hist1
+    >>> hist_0; hist_1
     array([1, 1, 1])
     array([2, 1, 1, 2])
     >>> bins_0; bins_1
@@ -690,14 +748,14 @@ def histogram(a, bins=10, range=None, normed=None, weights=None,
     >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
     (array([0, 2, 1]), array([0, 1, 2, 3]))
     >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
-    (array([ 0.25,  0.25,  0.25,  0.25]), array([0, 1, 2, 3, 4]))
+    (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
     >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
     (array([1, 4, 1]), array([0, 1, 2, 3]))
 
     >>> a = np.arange(5)
     >>> hist, bin_edges = np.histogram(a, density=True)
     >>> hist
-    array([ 0.5,  0. ,  0.5,  0. ,  0. ,  0.5,  0. ,  0.5,  0. ,  0.5])
+    array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
     >>> hist.sum()
     2.4999999999999996
     >>> np.sum(hist * np.diff(bin_edges))
@@ -712,8 +770,9 @@ def histogram(a, bins=10, range=None, normed=None, weights=None,
     >>> rng = np.random.RandomState(10)  # deterministic random data
     >>> a = np.hstack((rng.normal(size=1000),
     ...                rng.normal(loc=5, scale=2, size=1000)))
-    >>> plt.hist(a, bins='auto')  # arguments are passed to np.histogram
+    >>> _ = plt.hist(a, bins='auto')  # arguments are passed to np.histogram
     >>> plt.title("Histogram with 'auto' bins")
+    Text(0.5, 1.0, "Histogram with 'auto' bins")
     >>> plt.show()
 
     """