ENH: Improve histogram bins="auto" for data with little variance (#10739)

Now falls back on sturges estimator when the IQR is zero
author: Varun Nayyar <nayyarv@users.noreply.github.com> 2018-04-10 12:55:59 +0800
committer: Eric Wieser <wieser.eric@gmail.com> 2018-04-09 21:55:59 -0700
commit: 918a167c30a7e19f0c0061f5e09e7637b1e04591 (patch)
tree: 0b1c76b31578cb10b9777e750cdbb0767823544f
parent: ab4e4c93beb671b634bac34d21f30452ef194bf6 (diff)
download: numpy-918a167c30a7e19f0c0061f5e09e7637b1e04591.tar.gz
3 files changed, 45 insertions, 9 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index fc2af11a2..34c2a6a61 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -146,6 +146,11 @@ as usual with `errstate`.
 Dates, times, and timedeltas can now be histogrammed. The bin edges must be
 passed explicitly, and are not yet computed automatically.
 
+``histogram`` "auto" estimator handles limited variance better
+------------------------------------------------------------------------
+No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins
+chosen is related to the data size in this situation
+
 ``histogramdd`` allows explicit ranges to be given in a subset of axes
 ----------------------------------------------------------------------
 The ``range`` argument of `histogramdd` can now contain ``None`` values to
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index a0346f6c5..d2a398a0a 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -167,12 +167,22 @@ def _hist_bin_fd(x):
 def _hist_bin_auto(x):
     """
     Histogram bin estimator that uses the minimum width of the
-    Freedman-Diaconis and Sturges estimators.
+    Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
+    and the Sturges estimator if the FD bandwidth is 0.
 
     The FD estimator is usually the most robust method, but its width
-    estimate tends to be too large for small `x`. The Sturges estimator
-    is quite good for small (<1000) datasets and is the default in the R
-    language. This method gives good off the shelf behaviour.
+    estimate tends to be too large for small `x` and bad for data with limited
+    variance. The Sturges estimator is quite good for small (<1000) datasets
+    and is the default in the R language. This method gives good off the shelf
+    behaviour.
+
+    .. versionchanged:: 1.15.0
+    If there is limited variance the IQR can be 0, which results in the
+    FD bin width being 0 too. This is not a valid bin width, so
+    ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
+    If the IQR is 0, it's unlikely any variance based estimators will be of
+    use, so we revert to the sturges estimator, which only uses the size of the
+    dataset in its calculation.
 
     Parameters
     ----------
@@ -188,10 +198,13 @@ def _hist_bin_auto(x):
     --------
     _hist_bin_fd, _hist_bin_sturges
     """
-    # There is no need to check for zero here. If ptp is, so is IQR and
-    # vice versa. Either both are zero or neither one is.
-    return min(_hist_bin_fd(x), _hist_bin_sturges(x))
-
+    fd_bw = _hist_bin_fd(x)
+    sturges_bw = _hist_bin_sturges(x)
+    if fd_bw:
+        return min(fd_bw, sturges_bw)
+    else:
+        # limited variance, so we return a len dependent bw estimator
+        return sturges_bw
 
 # Private dict initialized at module load time
 _hist_bin_selectors = {'auto': _hist_bin_auto,
@@ -440,7 +453,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
     below, :math:`h` is the binwidth and :math:`n_h` is the number of
     bins. All estimators that compute bin counts are recast to bin width
     using the `ptp` of the data. The final bin count is obtained from
-    ``np.round(np.ceil(range / h))`.
+    ``np.round(np.ceil(range / h))``.
 
     'Auto' (maximum of the 'Sturges' and 'FD' estimators)
         A compromise to get a good value. For small datasets the Sturges
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 6777089ab..06daacbdc 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -443,6 +443,24 @@ class TestHistogramOptimBinNums(object):
             assert_equal(len(a), numbins, err_msg="{0} estimator, "
                          "No Variance test".format(estimator))
 
+    def test_limited_variance(self):
+        """
+        Check when IQR is 0, but variance exists, we return the sturges value
+        and not the fd value.
+        """
+        lim_var_data = np.ones(1000)
+        lim_var_data[:3] = 0
+        lim_var_data[-4:] = 100
+
+        edges_auto = histogram_bin_edges(lim_var_data, 'auto')
+        assert_equal(edges_auto, np.linspace(0, 100, 12))
+
+        edges_fd = histogram_bin_edges(lim_var_data, 'fd')
+        assert_equal(edges_fd, np.array([0, 100]))
+
+        edges_sturges = histogram_bin_edges(lim_var_data, 'sturges')
+        assert_equal(edges_sturges, np.linspace(0, 100, 12))
+
     def test_outlier(self):
         """
         Check the FD, Scott and Doane with outliers.
author	Varun Nayyar <nayyarv@users.noreply.github.com>	2018-04-10 12:55:59 +0800
committer	Eric Wieser <wieser.eric@gmail.com>	2018-04-09 21:55:59 -0700
commit	918a167c30a7e19f0c0061f5e09e7637b1e04591 (patch)
tree	0b1c76b31578cb10b9777e750cdbb0767823544f
parent	ab4e4c93beb671b634bac34d21f30452ef194bf6 (diff)
download	numpy-918a167c30a7e19f0c0061f5e09e7637b1e04591.tar.gz