BUG: Avoid signed overflow in histogram

When dividing the range between first and last, the formula `(x - first) / (last - first)` is used, where `last >= first` This falls apart if the types of these variables are signed integers, in which case overflow can occur. The solution is to force the subtraction result to be unsigned. This is a regression in 1.15.0
author: Eric Wieser <wieser.eric@gmail.com> 2018-08-17 22:42:39 -0700
committer: Eric Wieser <wieser.eric@gmail.com> 2018-08-18 10:31:35 -0700
commit: b581aade6891662e37931b04bba29262cd6057de (patch)
tree: 488833f95378d30496b39d8b370099c226671096 /numpy
parent: 3d9a082d146195944071a81ed0eae9d16976961a (diff)
download: numpy-b581aade6891662e37931b04bba29262cd6057de.tar.gz
2 files changed, 43 insertions, 3 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 2922b3a86..1a47acf78 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -259,6 +259,32 @@ def _get_outer_edges(a, range):
     return first_edge, last_edge
 
 
+def _unsigned_subtract(a, b):
+    """
+    Subtract two values where a >= b, and produce an unsigned result
+
+    This is needed when finding the difference between the upper and lower
+    bound of an int16 histogram
+    """
+    # coerce to a single type
+    signed_to_unsigned = {
+        np.byte: np.ubyte,
+        np.short: np.ushort,
+        np.intc: np.uintc,
+        np.int_: np.uint,
+        np.longlong: np.ulonglong
+    }
+    dt = np.result_type(a, b)
+    try:
+        dt = signed_to_unsigned[dt.type]
+    except KeyError:
+        return np.subtract(a, b, dtype=dt)
+    else:
+        # we know the inputs are integers, and we are deliberately casting
+        # signed to unsigned
+        return np.subtract(a, b, casting='unsafe', dtype=dt)
+
+
 def _get_bin_edges(a, bins, range, weights):
     """
     Computes the bins used internally by `histogram`.
@@ -310,7 +336,7 @@ def _get_bin_edges(a, bins, range, weights):
             # Do not call selectors on empty arrays
             width = _hist_bin_selectors[bin_name](a)
             if width:
-                n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
+                n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
             else:
                 # Width can be zero for some estimators, e.g. FD when
                 # the IQR of the data is zero.
@@ -704,7 +730,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
         n = np.zeros(n_equal_bins, ntype)
 
         # Pre-compute histogram scaling factor
-        norm = n_equal_bins / (last_edge - first_edge)
+        norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
 
         # We iterate over blocks here for two reasons: the first is that for
         # large arrays, it is actually faster (for example for a 10^8 array it
@@ -732,7 +758,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
 
             # Compute the bin indices, and for values that lie exactly on
             # last_edge we need to subtract one
-            f_indices = (tmp_a - first_edge) * norm
+            f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
             indices = f_indices.astype(np.intp)
             indices[indices == n_equal_bins] -= 1
 
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index e16ae12c2..294dd539b 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -298,6 +298,20 @@ class TestHistogram(object):
         assert_equal(d_edge.dtype, dates.dtype)
         assert_equal(t_edge.dtype, td)
 
+    def do_signed_overflow_bounds(self, dtype):
+        exponent = 8 * np.dtype(dtype).itemsize - 1
+        arr = np.array([-2**exponent + 4, 2**exponent - 4], dtype=dtype)
+        hist, e = histogram(arr, bins=2)
+        assert_equal(e, [-2**exponent + 4, 0, 2**exponent - 4])
+        assert_equal(hist, [1, 1])
+
+    def test_signed_overflow_bounds(self):
+        self.do_signed_overflow_bounds(np.byte)
+        self.do_signed_overflow_bounds(np.short)
+        self.do_signed_overflow_bounds(np.intc)
+        self.do_signed_overflow_bounds(np.int_)
+        self.do_signed_overflow_bounds(np.longlong)
+
     def do_precision_lower_bound(self, float_small, float_large):
         eps = np.finfo(float_large).eps
author	Eric Wieser <wieser.eric@gmail.com>	2018-08-17 22:42:39 -0700
committer	Eric Wieser <wieser.eric@gmail.com>	2018-08-18 10:31:35 -0700
commit	b581aade6891662e37931b04bba29262cd6057de (patch)
tree	488833f95378d30496b39d8b370099c226671096 /numpy
parent	3d9a082d146195944071a81ed0eae9d16976961a (diff)
download	numpy-b581aade6891662e37931b04bba29262cd6057de.tar.gz