diff options
author | Eric Wieser <wieser.eric@gmail.com> | 2017-10-18 23:46:39 -0700 |
---|---|---|
committer | Eric Wieser <wieser.eric@gmail.com> | 2017-10-19 23:04:32 -0700 |
commit | 57d225c11bb253981639d52442384eb3e43bb5f6 (patch) | |
tree | 494cdc7eccca1c9476ff5024468838c3354eef93 /numpy/lib/function_base.py | |
parent | e657629bbc2bfb880a1b2fa24a39c5921c1f965e (diff) | |
download | numpy-57d225c11bb253981639d52442384eb3e43bb5f6.tar.gz |
MAINT: Tidy np.histogram, and improve error messages
Split up the overloaded `bins` variable into separate names depending on its meaning (`bin_name`, `n_equal_bins`, `bin_edges`).
Helpful errors are now emitted for:
* non-integer bin counts (fixes gh-8072)
* non-1d bin edges
Removes another use of `np.isscalar`...
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r-- | numpy/lib/function_base.py | 136 |
1 file changed, 77 insertions, 59 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 2745b49d1..1de4e906c 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4,6 +4,7 @@ import collections import re import sys import warnings +import operator import numpy as np import numpy.core.numeric as _nx @@ -646,7 +647,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, a = asarray(a) if weights is not None: weights = asarray(weights) - if np.any(weights.shape != a.shape): + if weights.shape != a.shape: raise ValueError( 'weights should have the same shape as a.') weights = weights.ravel() @@ -671,11 +672,21 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, mn -= 0.5 mx += 0.5 + # density overrides the normed keyword + if density is not None: + normed = False + + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + if isinstance(bins, basestring): + bin_name = bins # if `bins` is a string for an automatic method, # this will replace it with the number of bins calculated - if bins not in _hist_bin_selectors: - raise ValueError("{0} not a valid estimator for bins".format(bins)) + if bin_name not in _hist_bin_selectors: + raise ValueError( + "{!r} is not a valid estimator for `bins`".format(bin_name)) if weights is not None: raise TypeError("Automated estimation of the number of " "bins is not supported for weighted data") @@ -689,16 +700,40 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, b = a[keep] if b.size == 0: - bins = 1 + n_equal_bins = 1 else: # Do not call selectors on empty arrays - width = _hist_bin_selectors[bins](b) + width = _hist_bin_selectors[bin_name](b) if width: - bins = int(np.ceil((mx - mn) / width)) + n_equal_bins = int(np.ceil((mx - mn) / width)) else: # Width can be zero for some estimators, e.g. FD when # the IQR of the data is zero. 
- bins = 1 + n_equal_bins = 1 + + elif np.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: + raise TypeError( + '`bins` must be an integer, a string, or an array') + if n_equal_bins < 1: + raise ValueError('`bins` must be positive, when an integer') + + elif np.ndim(bins) == 1: + bin_edges = np.asarray(bins) + if np.any(bin_edges[:-1] > bin_edges[1:]): + raise ValueError( + '`bins` must increase monotonically, when an array') + + else: + raise ValueError('`bins` must be 1d, when an array') + + del bins + + # compute the bins if only the count was specified + if n_equal_bins is not None: + bin_edges = linspace(mn, mx, n_equal_bins + 1, endpoint=True) # Histogram is an integer or a float array depending on the weights. if weights is None: @@ -710,27 +745,24 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, # computing histograms, to minimize memory usage. BLOCK = 65536 - if not iterable(bins): - if np.isscalar(bins) and bins < 1: - raise ValueError( - '`bins` should be a positive integer.') - # At this point, if the weights are not integer, floating point, or - # complex, we have to use the slow algorithm. - if weights is not None and not (np.can_cast(weights.dtype, np.double) or - np.can_cast(weights.dtype, complex)): - bins = linspace(mn, mx, bins + 1, endpoint=True) - - if not iterable(bins): + # The fast path uses bincount, but that only works for certain types + # of weight + simple_weights = ( + weights is None or + np.can_cast(weights.dtype, np.double) or + np.can_cast(weights.dtype, complex) + ) + + if n_equal_bins is not None and simple_weights: + # Fast algorithm for equal bins # We now convert values of a to bin indices, under the assumption of # equal bin widths (which is valid here). # Initialize empty histogram - n = np.zeros(bins, ntype) - # Pre-compute histogram scaling factor - norm = bins / (mx - mn) + n = np.zeros(n_equal_bins, ntype) - # Compute the bin edges for potential correction. 
- bin_edges = linspace(mn, mx, bins + 1, endpoint=True) + # Pre-compute histogram scaling factor + norm = n_equal_bins / (mx - mn) # We iterate over blocks here for two reasons: the first is that for # large arrays, it is actually faster (for example for a 10^8 array it @@ -757,7 +789,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, # Compute the bin indices, and for values that lie exactly on mx we # need to subtract one indices = tmp_a.astype(np.intp) - indices[indices == bins] -= 1 + indices[indices == n_equal_bins] -= 1 # The index computation is not guaranteed to give exactly # consistent results within ~1 ULP of the bin edges. @@ -765,35 +797,26 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, indices[decrement] -= 1 # The last bin includes the right edge. The other bins do not. increment = ((tmp_a_data >= bin_edges[indices + 1]) - & (indices != bins - 1)) + & (indices != n_equal_bins - 1)) indices[increment] += 1 # We now compute the histogram using bincount if ntype.kind == 'c': n.real += np.bincount(indices, weights=tmp_w.real, - minlength=bins) + minlength=n_equal_bins) n.imag += np.bincount(indices, weights=tmp_w.imag, - minlength=bins) + minlength=n_equal_bins) else: n += np.bincount(indices, weights=tmp_w, - minlength=bins).astype(ntype) - - # Rename the bin edges for return. 
- bins = bin_edges + minlength=n_equal_bins).astype(ntype) else: - bins = asarray(bins) - if np.any(bins[:-1] > bins[1:]): - raise ValueError( - 'bins must increase monotonically.') - - # Initialize empty histogram - n = np.zeros(bins.shape, ntype) - + # Compute via cumulative histogram + cum_n = np.zeros(bin_edges.shape, ntype) if weights is None: for i in arange(0, len(a), BLOCK): sa = sort(a[i:i+BLOCK]) - n += np.r_[sa.searchsorted(bins[:-1], 'left'), - sa.searchsorted(bins[-1], 'right')] + cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'), + sa.searchsorted(bin_edges[-1], 'right')] else: zero = array(0, dtype=ntype) for i in arange(0, len(a), BLOCK): @@ -802,27 +825,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, sorting_index = np.argsort(tmp_a) sa = tmp_a[sorting_index] sw = tmp_w[sorting_index] - cw = np.concatenate(([zero, ], sw.cumsum())) - bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'), - sa.searchsorted(bins[-1], 'right')] - n += cw[bin_index] - + cw = np.concatenate(([zero], sw.cumsum())) + bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'), + sa.searchsorted(bin_edges[-1], 'right')] + cum_n += cw[bin_index] - n = np.diff(n) + n = np.diff(cum_n) - if density is not None: - if density: - db = array(np.diff(bins), float) - return n/db/n.sum(), bins - else: - return n, bins - else: + if density: + db = array(np.diff(bin_edges), float) + return n/db/n.sum(), bin_edges + elif normed: # deprecated, buggy behavior. Remove for NumPy 2.0.0 - if normed: - db = array(np.diff(bins), float) - return n/(n*db).sum(), bins - else: - return n, bins + db = array(np.diff(bin_edges), float) + return n/(n*db).sum(), bin_edges + else: + return n, bin_edges def histogramdd(sample, bins=10, range=None, normed=False, weights=None): |