diff options
author | seberg <sebastian@sipsolutions.net> | 2015-08-15 15:53:06 +0200 |
---|---|---|
committer | seberg <sebastian@sipsolutions.net> | 2015-08-15 15:53:06 +0200 |
commit | 6e8b869d52ec5a1242df69bcd9323a4b0947933b (patch) | |
tree | a4296d6f98c220f38cf02b2ad376cf9ab270ce4d /numpy/lib/tests/test_function_base.py | |
parent | c573b7170f4467d75e194ccca8a032a32fa1b5d0 (diff) | |
parent | 388ee595330e2375a1e4c8187c17de7ea9fb2f6f (diff) | |
download | numpy-6e8b869d52ec5a1242df69bcd9323a4b0947933b.tar.gz |
Merge pull request #6029 from nayyarv/master
ENH: Automatic number of bins for np.histogram
Diffstat (limited to 'numpy/lib/tests/test_function_base.py')
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 90 |
1 files changed, 90 insertions, 0 deletions
# diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
# index b29012bcb..b127f65f6 100644
# --- a/numpy/lib/tests/test_function_base.py
# +++ b/numpy/lib/tests/test_function_base.py
# @@ -1234,6 +1234,96 @@ class TestHistogram(TestCase):
# (hunk context, tail of the preceding method)
#         assert_array_equal(b, np.array([0, 1]))


class TestHistogramOptimBinNums(TestCase):
    """
    Provide test coverage when using provided estimators for optimal number
    of bins.

    Every test passes an estimator name ('fd', 'scott', 'rice', 'sturges' or
    'auto') as the ``bins`` argument of ``np.histogram`` and checks the
    number of bins that comes back against a precomputed value.
    """

    def test_empty(self):
        """
        Check that every estimator can deal with empty data: the result must
        be a single empty bin with edges [0, 1].
        """
        estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
        for estimator in estimator_list:
            counts, edges = np.histogram([], bins=estimator)
            assert_array_equal(counts, np.array([0]))
            assert_array_equal(edges, np.array([0, 1]))

    def test_simple(self):
        """
        Straightforward testing with a mixture of linspace data (for
        consistency). All test values have been precomputed and the values
        shouldn't change.
        """
        # Some basic sanity checking with fixed data: maps the total number
        # of samples to the expected number of bins for each estimator.
        basic_test = {
            50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7},
            500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
                  'auto': 10},
            5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
                   'auto': 17},
        }

        for testlen, expected_results in basic_test.items():
            # Non uniform data to test with (2 peak uniform mixture): 2/5 of
            # the samples on [-10, -1] and 3/5 of them on [1, 10].
            # Floor division keeps the sample counts integers; with true
            # division they would be floats, which linspace does not accept
            # as `num`.
            x1 = np.linspace(-10, -1, testlen // 5 * 2)
            x2 = np.linspace(1, 10, testlen // 5 * 3)
            x = np.hstack((x1, x2))
            for estimator, numbins in expected_results.items():
                counts, _ = np.histogram(x, estimator)
                msg = "For the {0} estimator with datasize of {1} ".format(
                    estimator, testlen)
                assert_equal(len(counts), numbins, err_msg=msg)

    def test_small(self):
        """
        Smaller datasets have the potential to cause issues with the data
        adaptive methods, especially the FD method.
        All bin numbers have been precalculated.
        """
        # Maps the dataset size to the expected number of bins per estimator.
        small_dat = {
            1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1},
            2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2},
            3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3},
        }

        for testlen, expected_results in small_dat.items():
            testdat = np.arange(testlen)
            for estimator, expbins in expected_results.items():
                counts, _ = np.histogram(testdat, estimator)
                msg = "For the {0} estimator with datasize of {1} ".format(
                    estimator, testlen)
                assert_equal(len(counts), expbins, err_msg=msg)

    def test_incorrect_methods(self):
        """
        Check a ValueError is thrown when an unknown string is passed in.
        """
        check_list = ['mad', 'freeman', 'histograms', 'IQR']
        for estimator in check_list:
            assert_raises(ValueError, np.histogram, [1, 2, 3], estimator)

    def test_novariance(self):
        """
        Check that methods handle no variance in data.
        Primarily for Scott and FD as the SD and IQR are both 0 in this case.
        """
        novar_dataset = np.ones(100)
        novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8,
                            'auto': 8}

        for estimator, numbins in novar_resultdict.items():
            counts, _ = np.histogram(novar_dataset, estimator)
            assert_equal(len(counts), numbins,
                         err_msg="{0} estimator, No Variance test".format(
                             estimator))

    def test_outlier(self):
        """
        Check the fd and scott estimators with outliers.

        The fd determines a smaller binwidth since it's less affected by
        outliers. Since the range is so (artificially) large, this means more
        bins, most of which will be empty, but the data of interest usually
        is unaffected. The Scott estimator is more affected and returns fewer
        bins, despite most of the variance being in one area of the data.
        """
        # 50 samples on [-10, 10] plus 5 outlying samples on [-110, -100].
        xcenter = np.linspace(-10, 10, 50)
        outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))

        outlier_resultdict = {'fd': 21, 'scott': 5}

        for estimator, numbins in outlier_resultdict.items():
            counts, _ = np.histogram(outlier_dataset, estimator)
            assert_equal(len(counts), numbins,
                         err_msg="{0} estimator, Outlier test".format(
                             estimator))


# (hunk context continues past this point with the next definition, which is
# outside this hunk)
# class TestHistogramdd(TestCase):
#     def test_simple(self):