summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
authorEric Wieser <wieser.eric@gmail.com>2018-08-17 22:42:39 -0700
committerEric Wieser <wieser.eric@gmail.com>2018-08-18 10:31:35 -0700
commitb581aade6891662e37931b04bba29262cd6057de (patch)
tree488833f95378d30496b39d8b370099c226671096 /numpy
parent3d9a082d146195944071a81ed0eae9d16976961a (diff)
downloadnumpy-b581aade6891662e37931b04bba29262cd6057de.tar.gz
BUG: Avoid signed overflow in histogram
When dividing the range between first and last, the formula `(x - first) / (last - first)` is used, where `last >= first` This falls apart if the types of these variables are signed integers, in which case overflow can occur. The solution is to force the subtraction result to be unsigned. This is a regression in 1.15.0
Diffstat (limited to 'numpy')
-rw-r--r--numpy/lib/histograms.py32
-rw-r--r--numpy/lib/tests/test_histograms.py14
2 files changed, 43 insertions, 3 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 2922b3a86..1a47acf78 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -259,6 +259,32 @@ def _get_outer_edges(a, range):
return first_edge, last_edge
+def _unsigned_subtract(a, b):
+ """
+ Subtract two values where a >= b, and produce an unsigned result
+
+ This is needed when finding the difference between the upper and lower
+ bound of an int16 histogram
+ """
+ # coerce to a single type
+ signed_to_unsigned = {
+ np.byte: np.ubyte,
+ np.short: np.ushort,
+ np.intc: np.uintc,
+ np.int_: np.uint,
+ np.longlong: np.ulonglong
+ }
+ dt = np.result_type(a, b)
+ try:
+ dt = signed_to_unsigned[dt.type]
+ except KeyError:
+ return np.subtract(a, b, dtype=dt)
+ else:
+ # we know the inputs are integers, and we are deliberately casting
+ # signed to unsigned
+ return np.subtract(a, b, casting='unsafe', dtype=dt)
+
+
def _get_bin_edges(a, bins, range, weights):
"""
Computes the bins used internally by `histogram`.
@@ -310,7 +336,7 @@ def _get_bin_edges(a, bins, range, weights):
# Do not call selectors on empty arrays
width = _hist_bin_selectors[bin_name](a)
if width:
- n_equal_bins = int(np.ceil((last_edge - first_edge) / width))
+ n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
else:
# Width can be zero for some estimators, e.g. FD when
# the IQR of the data is zero.
@@ -704,7 +730,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
n = np.zeros(n_equal_bins, ntype)
# Pre-compute histogram scaling factor
- norm = n_equal_bins / (last_edge - first_edge)
+ norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
# We iterate over blocks here for two reasons: the first is that for
# large arrays, it is actually faster (for example for a 10^8 array it
@@ -732,7 +758,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
# Compute the bin indices, and for values that lie exactly on
# last_edge we need to subtract one
- f_indices = (tmp_a - first_edge) * norm
+ f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
indices = f_indices.astype(np.intp)
indices[indices == n_equal_bins] -= 1
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index e16ae12c2..294dd539b 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -298,6 +298,20 @@ class TestHistogram(object):
assert_equal(d_edge.dtype, dates.dtype)
assert_equal(t_edge.dtype, td)
+ def do_signed_overflow_bounds(self, dtype):
+ exponent = 8 * np.dtype(dtype).itemsize - 1
+ arr = np.array([-2**exponent + 4, 2**exponent - 4], dtype=dtype)
+ hist, e = histogram(arr, bins=2)
+ assert_equal(e, [-2**exponent + 4, 0, 2**exponent - 4])
+ assert_equal(hist, [1, 1])
+
+ def test_signed_overflow_bounds(self):
+ self.do_signed_overflow_bounds(np.byte)
+ self.do_signed_overflow_bounds(np.short)
+ self.do_signed_overflow_bounds(np.intc)
+ self.do_signed_overflow_bounds(np.int_)
+ self.do_signed_overflow_bounds(np.longlong)
+
def do_precision_lower_bound(self, float_small, float_large):
eps = np.finfo(float_large).eps