diff options
Diffstat (limited to 'numpy/lib/histograms.py')
-rw-r--r-- | numpy/lib/histograms.py | 41 |
1 file changed, 24 insertions, 17 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index 8474bd5d3..ede8a26e4 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -1,15 +1,12 @@ """ Histogram-related functions """ -from __future__ import division, absolute_import, print_function - import contextlib import functools import operator import warnings import numpy as np -from numpy.compat.py3k import basestring from numpy.core import overrides __all__ = ['histogram', 'histogramdd', 'histogram_bin_edges'] @@ -22,6 +19,16 @@ array_function_dispatch = functools.partial( _range = range +def _ptp(x): + """Peak-to-peak value of x. + + This implementation avoids the problem of signed integer arrays having a + peak-to-peak value that cannot be represented with the array's data type. + This function returns an unsigned value for signed integer arrays. + """ + return _unsigned_subtract(x.max(), x.min()) + + def _hist_bin_sqrt(x, range): """ Square root histogram bin estimator. @@ -40,7 +47,7 @@ def _hist_bin_sqrt(x, range): h : An estimate of the optimal bin width for the given data. """ del range # unused - return x.ptp() / np.sqrt(x.size) + return _ptp(x) / np.sqrt(x.size) def _hist_bin_sturges(x, range): @@ -63,7 +70,7 @@ def _hist_bin_sturges(x, range): h : An estimate of the optimal bin width for the given data. """ del range # unused - return x.ptp() / (np.log2(x.size) + 1.0) + return _ptp(x) / (np.log2(x.size) + 1.0) def _hist_bin_rice(x, range): @@ -87,7 +94,7 @@ def _hist_bin_rice(x, range): h : An estimate of the optimal bin width for the given data. 
""" del range # unused - return x.ptp() / (2.0 * x.size ** (1.0 / 3)) + return _ptp(x) / (2.0 * x.size ** (1.0 / 3)) def _hist_bin_scott(x, range): @@ -137,7 +144,7 @@ def _hist_bin_stone(x, range): """ n = x.size - ptp_x = np.ptp(x) + ptp_x = _ptp(x) if n <= 1 or ptp_x == 0: return 0 @@ -184,7 +191,7 @@ def _hist_bin_doane(x, range): np.true_divide(temp, sigma, temp) np.power(temp, 3, temp) g1 = np.mean(temp) - return x.ptp() / (1.0 + np.log2(x.size) + + return _ptp(x) / (1.0 + np.log2(x.size) + np.log2(1.0 + np.absolute(g1) / sg1)) return 0.0 @@ -200,7 +207,7 @@ def _hist_bin_fd(x, range): than the standard deviation, so it is less accurate, especially for long tailed distributions. - If the IQR is 0, this function returns 1 for the number of bins. + If the IQR is 0, this function returns 0 for the bin width. Binwidth is inversely proportional to the cube root of data size (asymptotically optimal). @@ -222,21 +229,21 @@ def _hist_bin_fd(x, range): def _hist_bin_auto(x, range): """ Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero - and the Sturges estimator if the FD bandwidth is 0. + Freedman-Diaconis and Sturges estimators if the FD bin width is non-zero. + If the bin width from the FD estimator is 0, the Sturges estimator is used. The FD estimator is usually the most robust method, but its width estimate tends to be too large for small `x` and bad for data with limited variance. The Sturges estimator is quite good for small (<1000) datasets - and is the default in the R language. This method gives good off the shelf + and is the default in the R language. This method gives good off-the-shelf behaviour. .. versionchanged:: 1.15.0 If there is limited variance the IQR can be 0, which results in the FD bin width being 0 too. This is not a valid bin width, so ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. 
- If the IQR is 0, it's unlikely any variance based estimators will be of - use, so we revert to the sturges estimator, which only uses the size of the + If the IQR is 0, it's unlikely any variance-based estimators will be of + use, so we revert to the Sturges estimator, which only uses the size of the dataset in its calculation. Parameters @@ -375,7 +382,7 @@ def _get_bin_edges(a, bins, range, weights): n_equal_bins = None bin_edges = None - if isinstance(bins, basestring): + if isinstance(bins, str): bin_name = bins # if `bins` is a string for an automatic method, # this will replace it with the number of bins calculated @@ -946,9 +953,9 @@ def histogramdd(sample, bins=10, range=None, normed=None, weights=None, Note the unusual interpretation of sample when an array_like: * When an array, each row is a coordinate in a D-dimensional space - - such as ``histogramgramdd(np.array([p1, p2, p3]))``. + such as ``histogramdd(np.array([p1, p2, p3]))``. * When an array_like, each element is the list of values for single - coordinate - such as ``histogramgramdd((X, Y, Z))``. + coordinate - such as ``histogramdd((X, Y, Z))``. The first form should be preferred. |