summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVarun Nayyar <nayyarv@users.noreply.github.com>2018-04-10 12:55:59 +0800
committerEric Wieser <wieser.eric@gmail.com>2018-04-09 21:55:59 -0700
commit918a167c30a7e19f0c0061f5e09e7637b1e04591 (patch)
tree0b1c76b31578cb10b9777e750cdbb0767823544f
parentab4e4c93beb671b634bac34d21f30452ef194bf6 (diff)
downloadnumpy-918a167c30a7e19f0c0061f5e09e7637b1e04591.tar.gz
ENH: Improve histogram bins="auto" for data with little variance (#10739)
Now falls back on sturges estimator when the IQR is zero
-rw-r--r--doc/release/1.15.0-notes.rst5
-rw-r--r--numpy/lib/histograms.py31
-rw-r--r--numpy/lib/tests/test_histograms.py18
3 files changed, 45 insertions, 9 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index fc2af11a2..34c2a6a61 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -146,6 +146,11 @@ as usual with `errstate`.
Dates, times, and timedeltas can now be histogrammed. The bin edges must be
passed explicitly, and are not yet computed automatically.
+``histogram`` "auto" estimator handles limited variance better
+------------------------------------------------------------------------
+No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins
+chosen is related to the data size in this situation
+
``histogramdd`` allows explicit ranges to be given in a subset of axes
----------------------------------------------------------------------
The ``range`` argument of `histogramdd` can now contain ``None`` values to
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index a0346f6c5..d2a398a0a 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -167,12 +167,22 @@ def _hist_bin_fd(x):
def _hist_bin_auto(x):
"""
Histogram bin estimator that uses the minimum width of the
- Freedman-Diaconis and Sturges estimators.
+ Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
+ and the Sturges estimator if the FD bandwidth is 0.
The FD estimator is usually the most robust method, but its width
- estimate tends to be too large for small `x`. The Sturges estimator
- is quite good for small (<1000) datasets and is the default in the R
- language. This method gives good off the shelf behaviour.
+ estimate tends to be too large for small `x` and bad for data with limited
+ variance. The Sturges estimator is quite good for small (<1000) datasets
+ and is the default in the R language. This method gives good off the shelf
+ behaviour.
+
+ .. versionchanged:: 1.15.0
+ If there is limited variance the IQR can be 0, which results in the
+ FD bin width being 0 too. This is not a valid bin width, so
+ ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
+ If the IQR is 0, it's unlikely any variance based estimators will be of
+ use, so we revert to the sturges estimator, which only uses the size of the
+ dataset in its calculation.
Parameters
----------
@@ -188,10 +198,13 @@ def _hist_bin_auto(x):
--------
_hist_bin_fd, _hist_bin_sturges
"""
- # There is no need to check for zero here. If ptp is, so is IQR and
- # vice versa. Either both are zero or neither one is.
- return min(_hist_bin_fd(x), _hist_bin_sturges(x))
-
+ fd_bw = _hist_bin_fd(x)
+ sturges_bw = _hist_bin_sturges(x)
+ if fd_bw:
+ return min(fd_bw, sturges_bw)
+ else:
+ # limited variance, so we return a len dependent bw estimator
+ return sturges_bw
# Private dict initialized at module load time
_hist_bin_selectors = {'auto': _hist_bin_auto,
@@ -440,7 +453,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
below, :math:`h` is the binwidth and :math:`n_h` is the number of
bins. All estimators that compute bin counts are recast to bin width
using the `ptp` of the data. The final bin count is obtained from
- ``np.round(np.ceil(range / h))`.
+ ``np.round(np.ceil(range / h))``.
'Auto' (maximum of the 'Sturges' and 'FD' estimators)
A compromise to get a good value. For small datasets the Sturges
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 6777089ab..06daacbdc 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -443,6 +443,24 @@ class TestHistogramOptimBinNums(object):
assert_equal(len(a), numbins, err_msg="{0} estimator, "
"No Variance test".format(estimator))
+ def test_limited_variance(self):
+ """
+ Check when IQR is 0, but variance exists, we return the sturges value
+ and not the fd value.
+ """
+ lim_var_data = np.ones(1000)
+ lim_var_data[:3] = 0
+ lim_var_data[-4:] = 100
+
+ edges_auto = histogram_bin_edges(lim_var_data, 'auto')
+ assert_equal(edges_auto, np.linspace(0, 100, 12))
+
+ edges_fd = histogram_bin_edges(lim_var_data, 'fd')
+ assert_equal(edges_fd, np.array([0, 100]))
+
+ edges_sturges = histogram_bin_edges(lim_var_data, 'sturges')
+ assert_equal(edges_sturges, np.linspace(0, 100, 12))
+
def test_outlier(self):
"""
Check the FD, Scott and Doane with outliers.