summaryrefslogtreecommitdiff
path: root/numpy/lib/function_base.py
diff options
context:
space:
mode:
authorCharles Harris <charlesr.harris@gmail.com>2017-10-20 12:17:20 -0600
committerGitHub <noreply@github.com>2017-10-20 12:17:20 -0600
commit07e0cfee16dbdbe14cf5a9284e0c919b16d66aba (patch)
tree72b80fd4cc94e7ef831bb2668066b1ddce32c818 /numpy/lib/function_base.py
parent928671055fde9469e68a5ef35686a244391b404d (diff)
parent5b7b87e4ebd91b1f0e17db05113a94e6776a5701 (diff)
downloadnumpy-07e0cfee16dbdbe14cf5a9284e0c919b16d66aba.tar.gz
Merge pull request #9889 from eric-wieser/tidy-histogram
MAINT: Tidy np.histogram, and improve error messages
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r--numpy/lib/function_base.py167
1 files changed, 93 insertions, 74 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 2745b49d1..3a73409fc 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4,6 +4,7 @@ import collections
import re
import sys
import warnings
+import operator
import numpy as np
import numpy.core.numeric as _nx
@@ -646,7 +647,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
a = asarray(a)
if weights is not None:
weights = asarray(weights)
- if np.any(weights.shape != a.shape):
+ if weights.shape != a.shape:
raise ValueError(
'weights should have the same shape as a.')
weights = weights.ravel()
@@ -656,26 +657,36 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
if range is None:
if a.size == 0:
# handle empty arrays. Can't determine range, so use 0-1.
- mn, mx = 0.0, 1.0
+ first_edge, last_edge = 0.0, 1.0
else:
- mn, mx = a.min() + 0.0, a.max() + 0.0
+ first_edge, last_edge = a.min() + 0.0, a.max() + 0.0
else:
- mn, mx = [mi + 0.0 for mi in range]
- if mn > mx:
+ first_edge, last_edge = [mi + 0.0 for mi in range]
+ if first_edge > last_edge:
raise ValueError(
'max must be larger than min in range parameter.')
- if not np.all(np.isfinite([mn, mx])):
+ if not np.all(np.isfinite([first_edge, last_edge])):
raise ValueError(
'range parameter must be finite.')
- if mn == mx:
- mn -= 0.5
- mx += 0.5
+ if first_edge == last_edge:
+ first_edge -= 0.5
+ last_edge += 0.5
+
+ # density overrides the normed keyword
+ if density is not None:
+ normed = False
+
+ # parse the overloaded bins argument
+ n_equal_bins = None
+ bin_edges = None
if isinstance(bins, basestring):
+ bin_name = bins
# if `bins` is a string for an automatic method,
# this will replace it with the number of bins calculated
- if bins not in _hist_bin_selectors:
- raise ValueError("{0} not a valid estimator for bins".format(bins))
+ if bin_name not in _hist_bin_selectors:
+ raise ValueError(
+ "{!r} is not a valid estimator for `bins`".format(bin_name))
if weights is not None:
raise TypeError("Automated estimation of the number of "
"bins is not supported for weighted data")
@@ -683,22 +694,47 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
b = a
# Update the reference if the range needs truncation
if range is not None:
- keep = (a >= mn)
- keep &= (a <= mx)
+ keep = (a >= first_edge)
+ keep &= (a <= last_edge)
if not np.logical_and.reduce(keep):
b = a[keep]
if b.size == 0:
- bins = 1
+ n_equal_bins = 1
else:
# Do not call selectors on empty arrays
- width = _hist_bin_selectors[bins](b)
+ width = _hist_bin_selectors[bin_name](b)
if width:
- bins = int(np.ceil((mx - mn) / width))
+ n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
else:
# Width can be zero for some estimators, e.g. FD when
# the IQR of the data is zero.
- bins = 1
+ n_equal_bins = 1
+
+ elif np.ndim(bins) == 0:
+ try:
+ n_equal_bins = operator.index(bins)
+ except TypeError:
+ raise TypeError(
+ '`bins` must be an integer, a string, or an array')
+ if n_equal_bins < 1:
+ raise ValueError('`bins` must be positive, when an integer')
+
+ elif np.ndim(bins) == 1:
+ bin_edges = np.asarray(bins)
+ if np.any(bin_edges[:-1] > bin_edges[1:]):
+ raise ValueError(
+ '`bins` must increase monotonically, when an array')
+
+ else:
+ raise ValueError('`bins` must be 1d, when an array')
+
+ del bins
+
+ # compute the bins if only the count was specified
+ if n_equal_bins is not None:
+ bin_edges = linspace(
+ first_edge, last_edge, n_equal_bins + 1, endpoint=True)
# Histogram is an integer or a float array depending on the weights.
if weights is None:
@@ -710,27 +746,24 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
# computing histograms, to minimize memory usage.
BLOCK = 65536
- if not iterable(bins):
- if np.isscalar(bins) and bins < 1:
- raise ValueError(
- '`bins` should be a positive integer.')
- # At this point, if the weights are not integer, floating point, or
- # complex, we have to use the slow algorithm.
- if weights is not None and not (np.can_cast(weights.dtype, np.double) or
- np.can_cast(weights.dtype, complex)):
- bins = linspace(mn, mx, bins + 1, endpoint=True)
-
- if not iterable(bins):
+ # The fast path uses bincount, but that only works for certain types
+ # of weight
+ simple_weights = (
+ weights is None or
+ np.can_cast(weights.dtype, np.double) or
+ np.can_cast(weights.dtype, complex)
+ )
+
+ if n_equal_bins is not None and simple_weights:
+ # Fast algorithm for equal bins
# We now convert values of a to bin indices, under the assumption of
# equal bin widths (which is valid here).
# Initialize empty histogram
- n = np.zeros(bins, ntype)
- # Pre-compute histogram scaling factor
- norm = bins / (mx - mn)
+ n = np.zeros(n_equal_bins, ntype)
- # Compute the bin edges for potential correction.
- bin_edges = linspace(mn, mx, bins + 1, endpoint=True)
+ # Pre-compute histogram scaling factor
+ norm = n_equal_bins / (last_edge - first_edge)
# We iterate over blocks here for two reasons: the first is that for
# large arrays, it is actually faster (for example for a 10^8 array it
@@ -744,20 +777,20 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
tmp_w = weights[i:i + BLOCK]
# Only include values in the right range
- keep = (tmp_a >= mn)
- keep &= (tmp_a <= mx)
+ keep = (tmp_a >= first_edge)
+ keep &= (tmp_a <= last_edge)
if not np.logical_and.reduce(keep):
tmp_a = tmp_a[keep]
if tmp_w is not None:
tmp_w = tmp_w[keep]
tmp_a_data = tmp_a.astype(float)
- tmp_a = tmp_a_data - mn
+ tmp_a = tmp_a_data - first_edge
tmp_a *= norm
- # Compute the bin indices, and for values that lie exactly on mx we
- # need to subtract one
+ # Compute the bin indices, and for values that lie exactly on
+ # last_edge we need to subtract one
indices = tmp_a.astype(np.intp)
- indices[indices == bins] -= 1
+ indices[indices == n_equal_bins] -= 1
# The index computation is not guaranteed to give exactly
# consistent results within ~1 ULP of the bin edges.
@@ -765,35 +798,26 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
indices[decrement] -= 1
# The last bin includes the right edge. The other bins do not.
increment = ((tmp_a_data >= bin_edges[indices + 1])
- & (indices != bins - 1))
+ & (indices != n_equal_bins - 1))
indices[increment] += 1
# We now compute the histogram using bincount
if ntype.kind == 'c':
n.real += np.bincount(indices, weights=tmp_w.real,
- minlength=bins)
+ minlength=n_equal_bins)
n.imag += np.bincount(indices, weights=tmp_w.imag,
- minlength=bins)
+ minlength=n_equal_bins)
else:
n += np.bincount(indices, weights=tmp_w,
- minlength=bins).astype(ntype)
-
- # Rename the bin edges for return.
- bins = bin_edges
+ minlength=n_equal_bins).astype(ntype)
else:
- bins = asarray(bins)
- if np.any(bins[:-1] > bins[1:]):
- raise ValueError(
- 'bins must increase monotonically.')
-
- # Initialize empty histogram
- n = np.zeros(bins.shape, ntype)
-
+ # Compute via cumulative histogram
+ cum_n = np.zeros(bin_edges.shape, ntype)
if weights is None:
for i in arange(0, len(a), BLOCK):
sa = sort(a[i:i+BLOCK])
- n += np.r_[sa.searchsorted(bins[:-1], 'left'),
- sa.searchsorted(bins[-1], 'right')]
+ cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
else:
zero = array(0, dtype=ntype)
for i in arange(0, len(a), BLOCK):
@@ -802,27 +826,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
sorting_index = np.argsort(tmp_a)
sa = tmp_a[sorting_index]
sw = tmp_w[sorting_index]
- cw = np.concatenate(([zero, ], sw.cumsum()))
- bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'),
- sa.searchsorted(bins[-1], 'right')]
- n += cw[bin_index]
-
+ cw = np.concatenate(([zero], sw.cumsum()))
+ bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+ sa.searchsorted(bin_edges[-1], 'right')]
+ cum_n += cw[bin_index]
- n = np.diff(n)
+ n = np.diff(cum_n)
- if density is not None:
- if density:
- db = array(np.diff(bins), float)
- return n/db/n.sum(), bins
- else:
- return n, bins
- else:
+ if density:
+ db = array(np.diff(bin_edges), float)
+ return n/db/n.sum(), bin_edges
+ elif normed:
# deprecated, buggy behavior. Remove for NumPy 2.0.0
- if normed:
- db = array(np.diff(bins), float)
- return n/(n*db).sum(), bins
- else:
- return n, bins
+ db = array(np.diff(bin_edges), float)
+ return n/(n*db).sum(), bin_edges
+ else:
+ return n, bin_edges
def histogramdd(sample, bins=10, range=None, normed=False, weights=None):