diff options
author | Varun Nayyar <nayyarv@users.noreply.github.com> | 2018-04-10 12:55:59 +0800 |
---|---|---|
committer | Eric Wieser <wieser.eric@gmail.com> | 2018-04-09 21:55:59 -0700 |
commit | 918a167c30a7e19f0c0061f5e09e7637b1e04591 (patch) | |
tree | 0b1c76b31578cb10b9777e750cdbb0767823544f | |
parent | ab4e4c93beb671b634bac34d21f30452ef194bf6 (diff) | |
download | numpy-918a167c30a7e19f0c0061f5e09e7637b1e04591.tar.gz |
ENH: Improve histogram bins="auto" for data with little variance (#10739)
Now falls back on the Sturges estimator when the IQR is zero.
-rw-r--r-- | doc/release/1.15.0-notes.rst | 5 | ||||
-rw-r--r-- | numpy/lib/histograms.py | 31 | ||||
-rw-r--r-- | numpy/lib/tests/test_histograms.py | 18 |
3 files changed, 45 insertions, 9 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst index fc2af11a2..34c2a6a61 100644 --- a/doc/release/1.15.0-notes.rst +++ b/doc/release/1.15.0-notes.rst @@ -146,6 +146,11 @@ as usual with `errstate`. Dates, times, and timedeltas can now be histogrammed. The bin edges must be passed explicitly, and are not yet computed automatically. +``histogram`` "auto" estimator handles limited variance better +------------------------------------------------------------------------ +No longer does an IQR of 0 result in `n_bins=1`, rather the number of bins +chosen is related to the data size in this situation + ``histogramdd`` allows explicit ranges to be given in a subset of axes ---------------------------------------------------------------------- The ``range`` argument of `histogramdd` can now contain ``None`` values to diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index a0346f6c5..d2a398a0a 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -167,12 +167,22 @@ def _hist_bin_fd(x): def _hist_bin_auto(x): """ Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators. + Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero + and the Sturges estimator if the FD bandwidth is 0. The FD estimator is usually the most robust method, but its width - estimate tends to be too large for small `x`. The Sturges estimator - is quite good for small (<1000) datasets and is the default in the R - language. This method gives good off the shelf behaviour. + estimate tends to be too large for small `x` and bad for data with limited + variance. The Sturges estimator is quite good for small (<1000) datasets + and is the default in the R language. This method gives good off the shelf + behaviour. + + .. versionchanged:: 1.15.0 + If there is limited variance the IQR can be 0, which results in the + FD bin width being 0 too. 
This is not a valid bin width, so + ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. + If the IQR is 0, it's unlikely any variance based estimators will be of + use, so we revert to the sturges estimator, which only uses the size of the + dataset in its calculation. Parameters ---------- @@ -188,10 +198,13 @@ def _hist_bin_auto(x): -------- _hist_bin_fd, _hist_bin_sturges """ - # There is no need to check for zero here. If ptp is, so is IQR and - # vice versa. Either both are zero or neither one is. - return min(_hist_bin_fd(x), _hist_bin_sturges(x)) - + fd_bw = _hist_bin_fd(x) + sturges_bw = _hist_bin_sturges(x) + if fd_bw: + return min(fd_bw, sturges_bw) + else: + # limited variance, so we return a len dependent bw estimator + return sturges_bw # Private dict initialized at module load time _hist_bin_selectors = {'auto': _hist_bin_auto, @@ -440,7 +453,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None): below, :math:`h` is the binwidth and :math:`n_h` is the number of bins. All estimators that compute bin counts are recast to bin width using the `ptp` of the data. The final bin count is obtained from - ``np.round(np.ceil(range / h))`. + ``np.round(np.ceil(range / h))``. 'Auto' (maximum of the 'Sturges' and 'FD' estimators) A compromise to get a good value. For small datasets the Sturges diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py index 6777089ab..06daacbdc 100644 --- a/numpy/lib/tests/test_histograms.py +++ b/numpy/lib/tests/test_histograms.py @@ -443,6 +443,24 @@ class TestHistogramOptimBinNums(object): assert_equal(len(a), numbins, err_msg="{0} estimator, " "No Variance test".format(estimator)) + def test_limited_variance(self): + """ + Check when IQR is 0, but variance exists, we return the sturges value + and not the fd value. 
+ """ + lim_var_data = np.ones(1000) + lim_var_data[:3] = 0 + lim_var_data[-4:] = 100 + + edges_auto = histogram_bin_edges(lim_var_data, 'auto') + assert_equal(edges_auto, np.linspace(0, 100, 12)) + + edges_fd = histogram_bin_edges(lim_var_data, 'fd') + assert_equal(edges_fd, np.array([0, 100])) + + edges_sturges = histogram_bin_edges(lim_var_data, 'sturges') + assert_equal(edges_sturges, np.linspace(0, 100, 12)) + def test_outlier(self): """ Check the FD, Scott and Doane with outliers. |