Merge pull request #9889 from eric-wieser/tidy-histogram

MAINT: Tidy np.histogram, and improve error messages
author: Charles Harris <charlesr.harris@gmail.com> 2017-10-20 12:17:20 -0600
committer: GitHub <noreply@github.com> 2017-10-20 12:17:20 -0600
commit: 07e0cfee16dbdbe14cf5a9284e0c919b16d66aba (patch)
tree: 72b80fd4cc94e7ef831bb2668066b1ddce32c818 /numpy/lib/function_base.py
parent: 928671055fde9469e68a5ef35686a244391b404d (diff)
parent: 5b7b87e4ebd91b1f0e17db05113a94e6776a5701 (diff)
download: numpy-07e0cfee16dbdbe14cf5a9284e0c919b16d66aba.tar.gz
1 files changed, 93 insertions, 74 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 2745b49d1..3a73409fc 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -4,6 +4,7 @@ import collections
 import re
 import sys
 import warnings
+import operator
 
 import numpy as np
 import numpy.core.numeric as _nx
@@ -646,7 +647,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     a = asarray(a)
     if weights is not None:
         weights = asarray(weights)
-        if np.any(weights.shape != a.shape):
+        if weights.shape != a.shape:
             raise ValueError(
                 'weights should have the same shape as a.')
         weights = weights.ravel()
@@ -656,26 +657,36 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     if range is None:
         if a.size == 0:
             # handle empty arrays. Can't determine range, so use 0-1.
-            mn, mx = 0.0, 1.0
+            first_edge, last_edge = 0.0, 1.0
         else:
-            mn, mx = a.min() + 0.0, a.max() + 0.0
+            first_edge, last_edge = a.min() + 0.0, a.max() + 0.0
     else:
-        mn, mx = [mi + 0.0 for mi in range]
-    if mn > mx:
+        first_edge, last_edge = [mi + 0.0 for mi in range]
+    if first_edge > last_edge:
         raise ValueError(
             'max must be larger than min in range parameter.')
-    if not np.all(np.isfinite([mn, mx])):
+    if not np.all(np.isfinite([first_edge, last_edge])):
         raise ValueError(
             'range parameter must be finite.')
-    if mn == mx:
-        mn -= 0.5
-        mx += 0.5
+    if first_edge == last_edge:
+        first_edge -= 0.5
+        last_edge += 0.5
+
+    # density overrides the normed keyword
+    if density is not None:
+        normed = False
+
+    # parse the overloaded bins argument
+    n_equal_bins = None
+    bin_edges = None
 
     if isinstance(bins, basestring):
+        bin_name = bins
         # if `bins` is a string for an automatic method,
         # this will replace it with the number of bins calculated
-        if bins not in _hist_bin_selectors:
-            raise ValueError("{0} not a valid estimator for bins".format(bins))
+        if bin_name not in _hist_bin_selectors:
+            raise ValueError(
+                "{!r} is not a valid estimator for `bins`".format(bin_name))
         if weights is not None:
             raise TypeError("Automated estimation of the number of "
                             "bins is not supported for weighted data")
@@ -683,22 +694,47 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         b = a
         # Update the reference if the range needs truncation
         if range is not None:
-            keep = (a >= mn)
-            keep &= (a <= mx)
+            keep = (a >= first_edge)
+            keep &= (a <= last_edge)
             if not np.logical_and.reduce(keep):
                 b = a[keep]
 
         if b.size == 0:
-            bins = 1
+            n_equal_bins = 1
         else:
             # Do not call selectors on empty arrays
-            width = _hist_bin_selectors[bins](b)
+            width = _hist_bin_selectors[bin_name](b)
             if width:
-                bins = int(np.ceil((mx - mn) / width))
+                n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
             else:
                 # Width can be zero for some estimators, e.g. FD when
                 # the IQR of the data is zero.
-                bins = 1
+                n_equal_bins = 1
+
+    elif np.ndim(bins) == 0:
+        try:
+            n_equal_bins = operator.index(bins)
+        except TypeError:
+            raise TypeError(
+                '`bins` must be an integer, a string, or an array')
+        if n_equal_bins < 1:
+            raise ValueError('`bins` must be positive, when an integer')
+
+    elif np.ndim(bins) == 1:
+        bin_edges = np.asarray(bins)
+        if np.any(bin_edges[:-1] > bin_edges[1:]):
+            raise ValueError(
+                '`bins` must increase monotonically, when an array')
+
+    else:
+        raise ValueError('`bins` must be 1d, when an array')
+
+    del bins
+
+    # compute the bins if only the count was specified
+    if n_equal_bins is not None:
+        bin_edges = linspace(
+            first_edge, last_edge, n_equal_bins + 1, endpoint=True)
 
     # Histogram is an integer or a float array depending on the weights.
     if weights is None:
@@ -710,27 +746,24 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
     # computing histograms, to minimize memory usage.
     BLOCK = 65536
 
-    if not iterable(bins):
-        if np.isscalar(bins) and bins < 1:
-            raise ValueError(
-                '`bins` should be a positive integer.')
-        # At this point, if the weights are not integer, floating point, or
-        # complex, we have to use the slow algorithm.
-        if weights is not None and not (np.can_cast(weights.dtype, np.double) or
-                                        np.can_cast(weights.dtype, complex)):
-            bins = linspace(mn, mx, bins + 1, endpoint=True)
-
-    if not iterable(bins):
+    # The fast path uses bincount, but that only works for certain types
+    # of weight
+    simple_weights = (
+        weights is None or
+        np.can_cast(weights.dtype, np.double) or
+        np.can_cast(weights.dtype, complex)
+    )
+
+    if n_equal_bins is not None and simple_weights:
+        # Fast algorithm for equal bins
         # We now convert values of a to bin indices, under the assumption of
         # equal bin widths (which is valid here).
 
         # Initialize empty histogram
-        n = np.zeros(bins, ntype)
-        # Pre-compute histogram scaling factor
-        norm = bins / (mx - mn)
+        n = np.zeros(n_equal_bins, ntype)
 
-        # Compute the bin edges for potential correction.
-        bin_edges = linspace(mn, mx, bins + 1, endpoint=True)
+        # Pre-compute histogram scaling factor
+        norm = n_equal_bins / (last_edge - first_edge)
 
         # We iterate over blocks here for two reasons: the first is that for
         # large arrays, it is actually faster (for example for a 10^8 array it
@@ -744,20 +777,20 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
                 tmp_w = weights[i:i + BLOCK]
 
             # Only include values in the right range
-            keep = (tmp_a >= mn)
-            keep &= (tmp_a <= mx)
+            keep = (tmp_a >= first_edge)
+            keep &= (tmp_a <= last_edge)
             if not np.logical_and.reduce(keep):
                 tmp_a = tmp_a[keep]
                 if tmp_w is not None:
                     tmp_w = tmp_w[keep]
             tmp_a_data = tmp_a.astype(float)
-            tmp_a = tmp_a_data - mn
+            tmp_a = tmp_a_data - first_edge
             tmp_a *= norm
 
-            # Compute the bin indices, and for values that lie exactly on mx we
-            # need to subtract one
+            # Compute the bin indices, and for values that lie exactly on
+            # last_edge we need to subtract one
             indices = tmp_a.astype(np.intp)
-            indices[indices == bins] -= 1
+            indices[indices == n_equal_bins] -= 1
 
             # The index computation is not guaranteed to give exactly
             # consistent results within ~1 ULP of the bin edges.
@@ -765,35 +798,26 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
             indices[decrement] -= 1
             # The last bin includes the right edge. The other bins do not.
             increment = ((tmp_a_data >= bin_edges[indices + 1])
-                         & (indices != bins - 1))
+                         & (indices != n_equal_bins - 1))
             indices[increment] += 1
 
             # We now compute the histogram using bincount
             if ntype.kind == 'c':
                 n.real += np.bincount(indices, weights=tmp_w.real,
-                                      minlength=bins)
+                                      minlength=n_equal_bins)
                 n.imag += np.bincount(indices, weights=tmp_w.imag,
-                                      minlength=bins)
+                                      minlength=n_equal_bins)
             else:
                 n += np.bincount(indices, weights=tmp_w,
-                                 minlength=bins).astype(ntype)
-
-        # Rename the bin edges for return.
-        bins = bin_edges
+                                 minlength=n_equal_bins).astype(ntype)
     else:
-        bins = asarray(bins)
-        if np.any(bins[:-1] > bins[1:]):
-            raise ValueError(
-                'bins must increase monotonically.')
-
-        # Initialize empty histogram
-        n = np.zeros(bins.shape, ntype)
-
+        # Compute via cumulative histogram
+        cum_n = np.zeros(bin_edges.shape, ntype)
         if weights is None:
             for i in arange(0, len(a), BLOCK):
                 sa = sort(a[i:i+BLOCK])
-                n += np.r_[sa.searchsorted(bins[:-1], 'left'),
-                           sa.searchsorted(bins[-1], 'right')]
+                cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+                               sa.searchsorted(bin_edges[-1], 'right')]
         else:
             zero = array(0, dtype=ntype)
             for i in arange(0, len(a), BLOCK):
@@ -802,27 +826,22 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
                 sorting_index = np.argsort(tmp_a)
                 sa = tmp_a[sorting_index]
                 sw = tmp_w[sorting_index]
-                cw = np.concatenate(([zero, ], sw.cumsum()))
-                bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'),
-                                  sa.searchsorted(bins[-1], 'right')]
-                n += cw[bin_index]
-
+                cw = np.concatenate(([zero], sw.cumsum()))
+                bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'),
+                                  sa.searchsorted(bin_edges[-1], 'right')]
+                cum_n += cw[bin_index]
 
-        n = np.diff(n)
+        n = np.diff(cum_n)
 
-    if density is not None:
-        if density:
-            db = array(np.diff(bins), float)
-            return n/db/n.sum(), bins
-        else:
-            return n, bins
-    else:
+    if density:
+        db = array(np.diff(bin_edges), float)
+        return n/db/n.sum(), bin_edges
+    elif normed:
         # deprecated, buggy behavior. Remove for NumPy 2.0.0
-        if normed:
-            db = array(np.diff(bins), float)
-            return n/(n*db).sum(), bins
-        else:
-            return n, bins
+        db = array(np.diff(bin_edges), float)
+        return n/(n*db).sum(), bin_edges
+    else:
+        return n, bin_edges
 
 
 def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
author	Charles Harris <charlesr.harris@gmail.com>	2017-10-20 12:17:20 -0600
committer	GitHub <noreply@github.com>	2017-10-20 12:17:20 -0600
commit	07e0cfee16dbdbe14cf5a9284e0c919b16d66aba (patch)
tree	72b80fd4cc94e7ef831bb2668066b1ddce32c818 /numpy/lib/function_base.py
parent	928671055fde9469e68a5ef35686a244391b404d (diff)
parent	5b7b87e4ebd91b1f0e17db05113a94e6776a5701 (diff)
download	numpy-07e0cfee16dbdbe14cf5a9284e0c919b16d66aba.tar.gz