summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Warren Weckesser <warren.weckesser@gmail.com> 2019-08-27 15:15:26 -0400
committer Warren Weckesser <warren.weckesser@gmail.com> 2019-10-15 17:42:58 -0400
commit 3ff4924ead45ef6db81778daae08e3c939ea4629 (patch)
tree 9e8c4c9726d1fbd8be32a75ce85a0314ac4c812d
parent dc20ec8c857bc1f1b717b56f3a5c64dbf31f16ac (diff)
download numpy-3ff4924ead45ef6db81778daae08e3c939ea4629.tar.gz
BUG: lib: Fix histogram problem with signed integer arrays.

An input such as

    np.histogram(np.array([-2, 0, 127], dtype=np.int8), bins="auto")

would raise the exception

    ValueError: Number of samples, -1, must be non-negative.

The problem was that the peak-to-peak value for the input array was computed with the `ptp` method, which returned negative values for signed integer arrays when the actual value was more than the maximum signed value of the array's data type. The fix is to use a peak-to-peak function that returns an unsigned value for signed integer arrays.

Closes gh-14379.
-rw-r--r-- numpy/lib/histograms.py 20
-rw-r--r-- numpy/lib/tests/test_histograms.py 11
2 files changed, 26 insertions, 5 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 8474bd5d3..03c365ab6 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -22,6 +22,16 @@ array_function_dispatch = functools.partial(
_range = range
+def _ptp(x):
+ """Peak-to-peak value of x.
+
+ This implementation avoids the problem of signed integer arrays having a
+ peak-to-peak value that cannot be represented with the array's data type.
+ This function returns an unsigned value for signed integer arrays.
+ """
+ return _unsigned_subtract(x.max(), x.min())
+
+
def _hist_bin_sqrt(x, range):
"""
Square root histogram bin estimator.
@@ -40,7 +50,7 @@ def _hist_bin_sqrt(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
- return x.ptp() / np.sqrt(x.size)
+ return _ptp(x) / np.sqrt(x.size)
def _hist_bin_sturges(x, range):
@@ -63,7 +73,7 @@ def _hist_bin_sturges(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
- return x.ptp() / (np.log2(x.size) + 1.0)
+ return _ptp(x) / (np.log2(x.size) + 1.0)
def _hist_bin_rice(x, range):
@@ -87,7 +97,7 @@ def _hist_bin_rice(x, range):
h : An estimate of the optimal bin width for the given data.
"""
del range # unused
- return x.ptp() / (2.0 * x.size ** (1.0 / 3))
+ return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
def _hist_bin_scott(x, range):
@@ -137,7 +147,7 @@ def _hist_bin_stone(x, range):
"""
n = x.size
- ptp_x = np.ptp(x)
+ ptp_x = _ptp(x)
if n <= 1 or ptp_x == 0:
return 0
@@ -184,7 +194,7 @@ def _hist_bin_doane(x, range):
np.true_divide(temp, sigma, temp)
np.power(temp, 3, temp)
g1 = np.mean(temp)
- return x.ptp() / (1.0 + np.log2(x.size) +
+ return _ptp(x) / (1.0 + np.log2(x.size) +
np.log2(1.0 + np.absolute(g1) / sg1))
return 0.0
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 4895a722c..dbf189f3e 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -8,6 +8,7 @@ from numpy.testing import (
assert_array_almost_equal, assert_raises, assert_allclose,
assert_array_max_ulp, assert_raises_regex, suppress_warnings,
)
+import pytest
class TestHistogram(object):
@@ -591,6 +592,16 @@ class TestHistogramOptimBinNums(object):
msg += " with datasize of {0}".format(testlen)
assert_equal(len(a), numbins, err_msg=msg)
+ @pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
+ 'stone', 'rice', 'sturges'])
+ def test_signed_integer_data(self, bins):
+ # Regression test for gh-14379.
+ a = np.array([-2, 0, 127], dtype=np.int8)
+ hist, edges = np.histogram(a, bins=bins)
+ hist32, edges32 = np.histogram(a.astype(np.int32), bins=bins)
+ assert_array_equal(hist, hist32)
+ assert_array_equal(edges, edges32)
+
def test_simple_weighted(self):
"""
Check that weighted data raises a TypeError