diff options
author | Eric Wieser <wieser.eric@gmail.com> | 2017-12-09 13:58:42 -0800 |
---|---|---|
committer | Eric Wieser <wieser.eric@gmail.com> | 2017-12-10 16:58:42 -0800 |
commit | 34c9950ccdb8a2f64916903b09c2c39233be0adf (patch) | |
tree | 793c0714f9ace7b4722804263d456d02f39a0843 /numpy/lib/tests/test_histograms.py | |
parent | bbe92a4820eb3eba7235a9ffbd44605032981f42 (diff) | |
download | numpy-34c9950ccdb8a2f64916903b09c2c39233be0adf.tar.gz |
MAINT: Move histogram and histogramdd into their own module
800 self-contained lines are easily enough to go in their own file, as are the 500 lines of tests.
For compatibility, the names are still available through `np.lib.function_base.histogram` and `from np.lib.function_base import *`
For simplicity of imports, all of the unqualified `np.` names are now qualified
Diffstat (limited to 'numpy/lib/tests/test_histograms.py')
-rw-r--r-- | numpy/lib/tests/test_histograms.py | 527 |
1 files changed, 527 insertions, 0 deletions
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py new file mode 100644 index 000000000..59baf91fe --- /dev/null +++ b/numpy/lib/tests/test_histograms.py @@ -0,0 +1,527 @@ +from __future__ import division, absolute_import, print_function + +import numpy as np + +from numpy.lib.histograms import histogram, histogramdd +from numpy.testing import ( + run_module_suite, assert_, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, assert_raises, + assert_allclose, assert_array_max_ulp, assert_warns, assert_raises_regex, + dec, suppress_warnings, HAS_REFCOUNT, +) + + +class TestHistogram(object): + + def setup(self): + pass + + def teardown(self): + pass + + def test_simple(self): + n = 100 + v = np.random.rand(n) + (a, b) = histogram(v) + # check if the sum of the bins equals the number of samples + assert_equal(np.sum(a, axis=0), n) + # check that the bin counts are evenly spaced when the data is from + # a linear function + (a, b) = histogram(np.linspace(0, 10, 100)) + assert_array_equal(a, 10) + + def test_one_bin(self): + # Ticket 632 + hist, edges = histogram([1, 2, 3, 4], [1, 2]) + assert_array_equal(hist, [2, ]) + assert_array_equal(edges, [1, 2]) + assert_raises(ValueError, histogram, [1, 2], bins=0) + h, e = histogram([1, 2], bins=1) + assert_equal(h, np.array([2])) + assert_allclose(e, np.array([1., 2.])) + + def test_normed(self): + # Check that the integral of the density equals 1. + n = 100 + v = np.random.rand(n) + a, b = histogram(v, normed=True) + area = np.sum(a * np.diff(b)) + assert_almost_equal(area, 1) + + # Check with non-constant bin widths (buggy but backwards + # compatible) + v = np.arange(10) + bins = [0, 1, 5, 9, 10] + a, b = histogram(v, bins, normed=True) + area = np.sum(a * np.diff(b)) + assert_almost_equal(area, 1) + + def test_density(self): + # Check that the integral of the density equals 1. + n = 100 + v = np.random.rand(n) + a, b = histogram(v, density=True) + area = np.sum(a * np.diff(b)) + assert_almost_equal(area, 1) + + # Check with non-constant bin widths + v = np.arange(10) + bins = [0, 1, 3, 6, 10] + a, b = histogram(v, bins, density=True) + assert_array_equal(a, .1) + assert_equal(np.sum(a * np.diff(b)), 1) + + # Variale bin widths are especially useful to deal with + # infinities. + v = np.arange(10) + bins = [0, 1, 3, 6, np.inf] + a, b = histogram(v, bins, density=True) + assert_array_equal(a, [.1, .1, .1, 0.]) + + # Taken from a bug report from N. Becker on the numpy-discussion + # mailing list Aug. 6, 2010. + counts, dmy = np.histogram( + [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True) + assert_equal(counts, [.25, 0]) + + def test_outliers(self): + # Check that outliers are not tallied + a = np.arange(10) + .5 + + # Lower outliers + h, b = histogram(a, range=[0, 9]) + assert_equal(h.sum(), 9) + + # Upper outliers + h, b = histogram(a, range=[1, 10]) + assert_equal(h.sum(), 9) + + # Normalization + h, b = histogram(a, range=[1, 9], normed=True) + assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15) + + # Weights + w = np.arange(10) + .5 + h, b = histogram(a, range=[1, 9], weights=w, normed=True) + assert_equal((h * np.diff(b)).sum(), 1) + + h, b = histogram(a, bins=8, range=[1, 9], weights=w) + assert_equal(h, w[1:-1]) + + def test_type(self): + # Check the type of the returned histogram + a = np.arange(10) + .5 + h, b = histogram(a) + assert_(np.issubdtype(h.dtype, np.integer)) + + h, b = histogram(a, normed=True) + assert_(np.issubdtype(h.dtype, np.floating)) + + h, b = histogram(a, weights=np.ones(10, int)) + assert_(np.issubdtype(h.dtype, np.integer)) + + h, b = histogram(a, weights=np.ones(10, float)) + assert_(np.issubdtype(h.dtype, np.floating)) + + def test_f32_rounding(self): + # gh-4799, check that the rounding of the edges works with float32 + x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32) + y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32) + counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100) + assert_equal(counts_hist.sum(), 3.) + + def test_weights(self): + v = np.random.rand(100) + w = np.ones(100) * 5 + a, b = histogram(v) + na, nb = histogram(v, normed=True) + wa, wb = histogram(v, weights=w) + nwa, nwb = histogram(v, weights=w, normed=True) + assert_array_almost_equal(a * 5, wa) + assert_array_almost_equal(na, nwa) + + # Check weights are properly applied. + v = np.linspace(0, 10, 10) + w = np.concatenate((np.zeros(5), np.ones(5))) + wa, wb = histogram(v, bins=np.arange(11), weights=w) + assert_array_almost_equal(wa, w) + + # Check with integer weights + wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1]) + assert_array_equal(wa, [4, 5, 0, 1]) + wa, wb = histogram( + [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True) + assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4) + + # Check weights with non-uniform bin widths + a, b = histogram( + np.arange(9), [0, 1, 3, 6, 10], + weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True) + assert_almost_equal(a, [.2, .1, .1, .075]) + + def test_exotic_weights(self): + + # Test the use of weights that are not integer or floats, but e.g. + # complex numbers or object types. + + # Complex weights + values = np.array([1.3, 2.5, 2.3]) + weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2]) + + # Check with custom bins + wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) + assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) + + # Check with even bins + wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) + assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) + + # Decimal weights + from decimal import Decimal + values = np.array([1.3, 2.5, 2.3]) + weights = np.array([Decimal(1), Decimal(2), Decimal(3)]) + + # Check with custom bins + wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) + assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) + + # Check with even bins + wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) + assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) + + def test_no_side_effects(self): + # This is a regression test that ensures that values passed to + # ``histogram`` are unchanged. + values = np.array([1.3, 2.5, 2.3]) + np.histogram(values, range=[-10, 10], bins=100) + assert_array_almost_equal(values, [1.3, 2.5, 2.3]) + + def test_empty(self): + a, b = histogram([], bins=([0, 1])) + assert_array_equal(a, np.array([0])) + assert_array_equal(b, np.array([0, 1])) + + def test_error_binnum_type (self): + # Tests if right Error is raised if bins argument is float + vals = np.linspace(0.0, 1.0, num=100) + histogram(vals, 5) + assert_raises(TypeError, histogram, vals, 2.4) + + def test_finite_range(self): + # Normal ranges should be fine + vals = np.linspace(0.0, 1.0, num=100) + histogram(vals, range=[0.25,0.75]) + assert_raises(ValueError, histogram, vals, range=[np.nan,0.75]) + assert_raises(ValueError, histogram, vals, range=[0.25,np.inf]) + + def test_bin_edge_cases(self): + # Ensure that floating-point computations correctly place edge cases. + arr = np.array([337, 404, 739, 806, 1007, 1811, 2012]) + hist, edges = np.histogram(arr, bins=8296, range=(2, 2280)) + mask = hist > 0 + left_edges = edges[:-1][mask] + right_edges = edges[1:][mask] + for x, left, right in zip(arr, left_edges, right_edges): + assert_(x >= left) + assert_(x < right) + + def test_last_bin_inclusive_range(self): + arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.]) + hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5)) + assert_equal(hist[-1], 1) + + def test_unsigned_monotonicity_check(self): + # Ensures ValueError is raised if bins not increasing monotonically + # when bins contain unsigned values (see #9222) + arr = np.array([2]) + bins = np.array([1, 3, 1], dtype='uint64') + with assert_raises(ValueError): + hist, edges = np.histogram(arr, bins=bins) + + +class TestHistogramOptimBinNums(object): + """ + Provide test coverage when using provided estimators for optimal number of + bins + """ + + def test_empty(self): + estimator_list = ['fd', 'scott', 'rice', 'sturges', + 'doane', 'sqrt', 'auto'] + # check it can deal with empty data + for estimator in estimator_list: + a, b = histogram([], bins=estimator) + assert_array_equal(a, np.array([0])) + assert_array_equal(b, np.array([0, 1])) + + def test_simple(self): + """ + Straightforward testing with a mixture of linspace data (for + consistency). All test values have been precomputed and the values + shouldn't change + """ + # Some basic sanity checking, with some fixed data. + # Checking for the correct number of bins + basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, + 'doane': 8, 'sqrt': 8, 'auto': 7}, + 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, + 'doane': 12, 'sqrt': 23, 'auto': 10}, + 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, + 'doane': 17, 'sqrt': 71, 'auto': 17}} + + for testlen, expectedResults in basic_test.items(): + # Create some sort of non uniform data to test with + # (2 peak uniform mixture) + x1 = np.linspace(-10, -1, testlen // 5 * 2) + x2 = np.linspace(1, 10, testlen // 5 * 3) + x = np.concatenate((x1, x2)) + for estimator, numbins in expectedResults.items(): + a, b = np.histogram(x, estimator) + assert_equal(len(a), numbins, err_msg="For the {0} estimator " + "with datasize of {1}".format(estimator, testlen)) + + def test_small(self): + """ + Smaller datasets have the potential to cause issues with the data + adaptive methods, especially the FD method. All bin numbers have been + precalculated. + """ + small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, + 'doane': 1, 'sqrt': 1}, + 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, + 'doane': 1, 'sqrt': 2}, + 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3, + 'doane': 3, 'sqrt': 2}} + + for testlen, expectedResults in small_dat.items(): + testdat = np.arange(testlen) + for estimator, expbins in expectedResults.items(): + a, b = np.histogram(testdat, estimator) + assert_equal(len(a), expbins, err_msg="For the {0} estimator " + "with datasize of {1}".format(estimator, testlen)) + + def test_incorrect_methods(self): + """ + Check a Value Error is thrown when an unknown string is passed in + """ + check_list = ['mad', 'freeman', 'histograms', 'IQR'] + for estimator in check_list: + assert_raises(ValueError, histogram, [1, 2, 3], estimator) + + def test_novariance(self): + """ + Check that methods handle no variance in data + Primarily for Scott and FD as the SD and IQR are both 0 in this case + """ + novar_dataset = np.ones(100) + novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, + 'doane': 1, 'sqrt': 1, 'auto': 1} + + for estimator, numbins in novar_resultdict.items(): + a, b = np.histogram(novar_dataset, estimator) + assert_equal(len(a), numbins, err_msg="{0} estimator, " + "No Variance test".format(estimator)) + + def test_outlier(self): + """ + Check the FD, Scott and Doane with outliers. + + The FD estimates a smaller binwidth since it's less affected by + outliers. Since the range is so (artificially) large, this means more + bins, most of which will be empty, but the data of interest usually is + unaffected. The Scott estimator is more affected and returns fewer bins, + despite most of the variance being in one area of the data. The Doane + estimator lies somewhere between the other two. + """ + xcenter = np.linspace(-10, 10, 50) + outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter)) + + outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11} + + for estimator, numbins in outlier_resultdict.items(): + a, b = np.histogram(outlier_dataset, estimator) + assert_equal(len(a), numbins) + + def test_simple_range(self): + """ + Straightforward testing with a mixture of linspace data (for + consistency). Adding in a 3rd mixture that will then be + completely ignored. All test values have been precomputed and + the shouldn't change. + """ + # some basic sanity checking, with some fixed data. + # Checking for the correct number of bins + basic_test = { + 50: {'fd': 8, 'scott': 8, 'rice': 15, + 'sturges': 14, 'auto': 14}, + 500: {'fd': 15, 'scott': 16, 'rice': 32, + 'sturges': 20, 'auto': 20}, + 5000: {'fd': 33, 'scott': 33, 'rice': 69, + 'sturges': 27, 'auto': 33} + } + + for testlen, expectedResults in basic_test.items(): + # create some sort of non uniform data to test with + # (3 peak uniform mixture) + x1 = np.linspace(-10, -1, testlen // 5 * 2) + x2 = np.linspace(1, 10, testlen // 5 * 3) + x3 = np.linspace(-100, -50, testlen) + x = np.hstack((x1, x2, x3)) + for estimator, numbins in expectedResults.items(): + a, b = np.histogram(x, estimator, range = (-20, 20)) + msg = "For the {0} estimator".format(estimator) + msg += " with datasize of {0}".format(testlen) + assert_equal(len(a), numbins, err_msg=msg) + + def test_simple_weighted(self): + """ + Check that weighted data raises a TypeError + """ + estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto'] + for estimator in estimator_list: + assert_raises(TypeError, histogram, [1, 2, 3], + estimator, weights=[1, 2, 3]) + + +class TestHistogramdd(object): + + def test_simple(self): + x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], + [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]]) + H, edges = histogramdd(x, (2, 3, 3), + range=[[-1, 1], [0, 3], [0, 3]]) + answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]], + [[0, 1, 0], [0, 0, 1], [0, 0, 1]]]) + assert_array_equal(H, answer) + + # Check normalization + ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]] + H, edges = histogramdd(x, bins=ed, normed=True) + assert_(np.all(H == answer / 12.)) + + # Check that H has the correct shape. + H, edges = histogramdd(x, (2, 3, 4), + range=[[-1, 1], [0, 3], [0, 4]], + normed=True) + answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]], + [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]) + assert_array_almost_equal(H, answer / 6., 4) + # Check that a sequence of arrays is accepted and H has the correct + # shape. + z = [np.squeeze(y) for y in np.split(x, 3, axis=1)] + H, edges = histogramdd( + z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]]) + answer = np.array([[[0, 0], [0, 0], [0, 0]], + [[0, 1], [0, 0], [1, 0]], + [[0, 1], [0, 0], [0, 0]], + [[0, 0], [0, 0], [0, 0]]]) + assert_array_equal(H, answer) + + Z = np.zeros((5, 5, 5)) + Z[list(range(5)), list(range(5)), list(range(5))] = 1. + H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5) + assert_array_equal(H, Z) + + def test_shape_3d(self): + # All possible permutations for bins of different lengths in 3D. + bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4), + (4, 5, 6)) + r = np.random.rand(10, 3) + for b in bins: + H, edges = histogramdd(r, b) + assert_(H.shape == b) + + def test_shape_4d(self): + # All possible permutations for bins of different lengths in 4D. + bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4), + (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6), + (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7), + (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5), + (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5), + (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4)) + + r = np.random.rand(10, 4) + for b in bins: + H, edges = histogramdd(r, b) + assert_(H.shape == b) + + def test_weights(self): + v = np.random.rand(100, 2) + hist, edges = histogramdd(v) + n_hist, edges = histogramdd(v, normed=True) + w_hist, edges = histogramdd(v, weights=np.ones(100)) + assert_array_equal(w_hist, hist) + w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True) + assert_array_equal(w_hist, n_hist) + w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2) + assert_array_equal(w_hist, 2 * hist) + + def test_identical_samples(self): + x = np.zeros((10, 2), int) + hist, edges = histogramdd(x, bins=2) + assert_array_equal(edges[0], np.array([-0.5, 0., 0.5])) + + def test_empty(self): + a, b = histogramdd([[], []], bins=([0, 1], [0, 1])) + assert_array_max_ulp(a, np.array([[0.]])) + a, b = np.histogramdd([[], [], []], bins=2) + assert_array_max_ulp(a, np.zeros((2, 2, 2))) + + def test_bins_errors(self): + # There are two ways to specify bins. Check for the right errors + # when mixing those. + x = np.arange(8).reshape(2, 4) + assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5]) + assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1]) + assert_raises( + ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]]) + assert_raises( + ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]]) + assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]])) + + def test_inf_edges(self): + # Test using +/-inf bin edges works. See #1788. + with np.errstate(invalid='ignore'): + x = np.arange(6).reshape(3, 2) + expected = np.array([[1, 0], [0, 1], [0, 1]]) + h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]]) + assert_allclose(h, expected) + h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])]) + assert_allclose(h, expected) + h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]]) + assert_allclose(h, expected) + + def test_rightmost_binedge(self): + # Test event very close to rightmost binedge. See Github issue #4266 + x = [0.9999999995] + bins = [[0., 0.5, 1.0]] + hist, _ = histogramdd(x, bins=bins) + assert_(hist[0] == 0.0) + assert_(hist[1] == 1.) + x = [1.0] + bins = [[0., 0.5, 1.0]] + hist, _ = histogramdd(x, bins=bins) + assert_(hist[0] == 0.0) + assert_(hist[1] == 1.) + x = [1.0000000001] + bins = [[0., 0.5, 1.0]] + hist, _ = histogramdd(x, bins=bins) + assert_(hist[0] == 0.0) + assert_(hist[1] == 1.) + x = [1.0001] + bins = [[0., 0.5, 1.0]] + hist, _ = histogramdd(x, bins=bins) + assert_(hist[0] == 0.0) + assert_(hist[1] == 0.0) + + def test_finite_range(self): + vals = np.random.random((100, 3)) + histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]]) + assert_raises(ValueError, histogramdd, vals, + range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]]) + assert_raises(ValueError, histogramdd, vals, + range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]]) + + +if __name__ == "__main__": + run_module_suite() |