summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r--numpy/lib/histograms.py31
-rw-r--r--numpy/lib/tests/test_histograms.py18
2 files changed, 40 insertions, 9 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index a0346f6c5..d2a398a0a 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -167,12 +167,22 @@ def _hist_bin_fd(x):
def _hist_bin_auto(x):
"""
Histogram bin estimator that uses the minimum width of the
- Freedman-Diaconis and Sturges estimators.
+ Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero
+ and the Sturges estimator if the FD bandwidth is 0.
The FD estimator is usually the most robust method, but its width
- estimate tends to be too large for small `x`. The Sturges estimator
- is quite good for small (<1000) datasets and is the default in the R
- language. This method gives good off the shelf behaviour.
+ estimate tends to be too large for small `x` and bad for data with limited
+ variance. The Sturges estimator is quite good for small (<1000) datasets
+ and is the default in the R language. This method gives good off the shelf
+ behaviour.
+
+ .. versionchanged:: 1.15.0
+ If there is limited variance the IQR can be 0, which results in the
+ FD bin width being 0 too. This is not a valid bin width, so
+ ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
+ If the IQR is 0, it's unlikely any variance based estimators will be of
+ use, so we revert to the sturges estimator, which only uses the size of the
+ dataset in its calculation.
Parameters
----------
@@ -188,10 +198,13 @@ def _hist_bin_auto(x):
--------
_hist_bin_fd, _hist_bin_sturges
"""
- # There is no need to check for zero here. If ptp is, so is IQR and
- # vice versa. Either both are zero or neither one is.
- return min(_hist_bin_fd(x), _hist_bin_sturges(x))
-
+ fd_bw = _hist_bin_fd(x)
+ sturges_bw = _hist_bin_sturges(x)
+ if fd_bw:
+ return min(fd_bw, sturges_bw)
+ else:
+ # limited variance, so we return a len dependent bw estimator
+ return sturges_bw
# Private dict initialized at module load time
_hist_bin_selectors = {'auto': _hist_bin_auto,
@@ -440,7 +453,7 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):
below, :math:`h` is the binwidth and :math:`n_h` is the number of
bins. All estimators that compute bin counts are recast to bin width
using the `ptp` of the data. The final bin count is obtained from
- ``np.round(np.ceil(range / h))`.
+ ``np.round(np.ceil(range / h))``.
'Auto' (maximum of the 'Sturges' and 'FD' estimators)
A compromise to get a good value. For small datasets the Sturges
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 6777089ab..06daacbdc 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -443,6 +443,24 @@ class TestHistogramOptimBinNums(object):
assert_equal(len(a), numbins, err_msg="{0} estimator, "
"No Variance test".format(estimator))
+ def test_limited_variance(self):
+ """
+ Check when IQR is 0, but variance exists, we return the sturges value
+ and not the fd value.
+ """
+ lim_var_data = np.ones(1000)
+ lim_var_data[:3] = 0
+ lim_var_data[-4:] = 100
+
+ edges_auto = histogram_bin_edges(lim_var_data, 'auto')
+ assert_equal(edges_auto, np.linspace(0, 100, 12))
+
+ edges_fd = histogram_bin_edges(lim_var_data, 'fd')
+ assert_equal(edges_fd, np.array([0, 100]))
+
+ edges_sturges = histogram_bin_edges(lim_var_data, 'sturges')
+ assert_equal(edges_sturges, np.linspace(0, 100, 12))
+
def test_outlier(self):
"""
Check the FD, Scott and Doane with outliers.