From b581aade6891662e37931b04bba29262cd6057de Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Fri, 17 Aug 2018 22:42:39 -0700 Subject: BUG: Avoid signed overflow in histogram When dividing the range between first and last, the formula `(x - first) / (last - first)` is used, where `last >= first` This falls apart if the types of these variables are signed integers, in which case overflow can occur. The solution is to force the subtraction result to be unsigned. This is a regression in 1.15.0 --- numpy/lib/histograms.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'numpy/lib/histograms.py') diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py index 2922b3a86..1a47acf78 100644 --- a/numpy/lib/histograms.py +++ b/numpy/lib/histograms.py @@ -259,6 +259,32 @@ def _get_outer_edges(a, range): return first_edge, last_edge +def _unsigned_subtract(a, b): + """ + Subtract two values where a >= b, and produce an unsigned result + + This is needed when finding the difference between the upper and lower + bound of an int16 histogram + """ + # coerce to a single type + signed_to_unsigned = { + np.byte: np.ubyte, + np.short: np.ushort, + np.intc: np.uintc, + np.int_: np.uint, + np.longlong: np.ulonglong + } + dt = np.result_type(a, b) + try: + dt = signed_to_unsigned[dt.type] + except KeyError: + return np.subtract(a, b, dtype=dt) + else: + # we know the inputs are integers, and we are deliberately casting + # signed to unsigned + return np.subtract(a, b, casting='unsafe', dtype=dt) + + def _get_bin_edges(a, bins, range, weights): """ Computes the bins used internally by `histogram`. @@ -310,7 +336,7 @@ def _get_bin_edges(a, bins, range, weights): # Do not call selectors on empty arrays width = _hist_bin_selectors[bin_name](a) if width: - n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) + n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width)) else: # Width can be zero for some estimators, e.g. FD when # the IQR of the data is zero. @@ -704,7 +730,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, n = np.zeros(n_equal_bins, ntype) # Pre-compute histogram scaling factor - norm = n_equal_bins / (last_edge - first_edge) + norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge) # We iterate over blocks here for two reasons: the first is that for # large arrays, it is actually faster (for example for a 10^8 array it @@ -732,7 +758,7 @@ def histogram(a, bins=10, range=None, normed=False, weights=None, # Compute the bin indices, and for values that lie exactly on # last_edge we need to subtract one - f_indices = (tmp_a - first_edge) * norm + f_indices = _unsigned_subtract(tmp_a, first_edge) * norm indices = f_indices.astype(np.intp) indices[indices == n_equal_bins] -= 1 -- cgit v1.2.1