diff options
author | Joseph Fox-Rabinovitz <joseph.r.fox-rabinovitz@nasa.gov> | 2016-02-01 16:29:48 -0500 |
---|---|---|
committer | Joseph Fox-Rabinovitz <jfoxrabinovitz@gmail.com> | 2016-02-11 20:06:30 -0500 |
commit | b8b55614a3d3c2e3e2c653064719de6906c1be39 (patch) | |
tree | d7e56a9618ead489145eaf54047349d59cfec9b9 /numpy/lib/tests/test_function_base.py | |
parent | 47b6c2b8bacb510cac62d490c159ec116080d1d0 (diff) | |
download | numpy-b8b55614a3d3c2e3e2c653064719de6906c1be39.tar.gz |
Added 'doane' and 'sqrt' estimators to np.histogram in numpy.function_base
Diffstat (limited to 'numpy/lib/tests/test_function_base.py')
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 72 |
1 file changed, 43 insertions, 29 deletions
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index ba2448815..00d9f36c8 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -1280,11 +1280,13 @@ class TestHistogram(TestCase): class TestHistogramOptimBinNums(TestCase): """ - Provide test coverage when using provided estimators for optimal number of bins + Provide test coverage when using provided estimators for optimal number of + bins """ def test_empty(self): - estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto'] + estimator_list = ['fd', 'scott', 'rice', 'sturges', + 'doane', 'sqrt', 'auto'] # check it can deal with empty data for estimator in estimator_list: a, b = histogram([], bins=estimator) @@ -1293,40 +1295,49 @@ class TestHistogramOptimBinNums(TestCase): def test_simple(self): """ - Straightforward testing with a mixture of linspace data (for consistency). - All test values have been precomputed and the values shouldn't change + Straightforward testing with a mixture of linspace data (for + consistency). All test values have been precomputed and the values + shouldn't change """ - # some basic sanity checking, with some fixed data. Checking for the correct number of bins - basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7}, - 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10}, - 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}} + # Some basic sanity checking, with some fixed data. 
+ # Checking for the correct number of bins + basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, + 'doane': 8, 'sqrt': 8, 'auto': 7}, + 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, + 'doane': 12, 'sqrt': 23, 'auto': 10}, + 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, + 'doane': 17, 'sqrt': 71, 'auto': 17}} for testlen, expectedResults in basic_test.items(): - # create some sort of non uniform data to test with (2 peak uniform mixture) + # Create some sort of non uniform data to test with + # (2 peak uniform mixture) x1 = np.linspace(-10, -1, testlen/5 * 2) x2 = np.linspace(1,10, testlen/5 * 3) x = np.hstack((x1, x2)) for estimator, numbins in expectedResults.items(): a, b = np.histogram(x, estimator) - assert_equal(len(a), numbins, - err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen)) + assert_equal(len(a), numbins, err_msg="For the {0} estimator " + "with datasize of {1}".format(estimator, testlen)) def test_small(self): """ - Smaller datasets have the potential to cause issues with the data adaptive methods - Especially the FD methods - All bin numbers have been precalculated + Smaller datasets have the potential to cause issues with the data + adaptive methods, especially the FD method. All bin numbers have been + precalculated. 
""" - small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1}, - 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2}, - 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3}} + small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1, + 'doane': 1, 'sqrt': 1}, + 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, + 'doane': 1, 'sqrt': 2}, + 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3, + 'doane': 3, 'sqrt': 2}} for testlen, expectedResults in small_dat.items(): testdat = np.arange(testlen) for estimator, expbins in expectedResults.items(): a, b = np.histogram(testdat, estimator) - assert_equal(len(a), expbins, - err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen)) + assert_equal(len(a), expbins, err_msg="For the {0} estimator " + "with datasize of {1}".format(estimator, testlen)) def test_incorrect_methods(self): """ @@ -1342,26 +1353,29 @@ class TestHistogramOptimBinNums(TestCase): Primarily for Scott and FD as the SD and IQR are both 0 in this case """ novar_dataset = np.ones(100) - novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8, 'auto': 8} + novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8, + 'doane': 1, 'sqrt': 10, 'auto': 8} for estimator, numbins in novar_resultdict.items(): a, b = np.histogram(novar_dataset, estimator) - assert_equal(len(a), numbins, - err_msg="{0} estimator, No Variance test".format(estimator)) + assert_equal(len(a), numbins, err_msg="{0} estimator, " + "No Variance test".format(estimator)) def test_outlier(self): """ - Check the fd and scott with outliers - The fd determines a smaller binwidth since it's less affected by outliers - since the range is so (artificially) large this means more bins - most of which will be empty, but the data of interest usually is unaffected. - The Scott estimator is more affected and returns fewer bins, despite most of - the variance being in one area of the data + Check the FD, Scott and Doane with outliers. 
+ + The FD estimates a smaller binwidth since it's less affected by + outliers. Since the range is so (artificially) large, this means more + bins, most of which will be empty, but the data of interest usually is + unaffected. The Scott estimator is more affected and returns fewer bins, + despite most of the variance being in one area of the data. The Doane + estimator lies somewhere between the other two. """ xcenter = np.linspace(-10, 10, 50) outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter)) - outlier_resultdict = {'fd': 21, 'scott': 5} + outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11} for estimator, numbins in outlier_resultdict.items(): a, b = np.histogram(outlier_dataset, estimator) |