Merge pull request #6029 from nayyarv/master

ENH: Automatic number of bins for np.histogram
author: seberg <sebastian@sipsolutions.net> 2015-08-15 15:53:06 +0200
committer: seberg <sebastian@sipsolutions.net> 2015-08-15 15:53:06 +0200
commit: 6e8b869d52ec5a1242df69bcd9323a4b0947933b (patch)
tree: a4296d6f98c220f38cf02b2ad376cf9ab270ce4d /numpy/lib/tests/test_function_base.py
parent: c573b7170f4467d75e194ccca8a032a32fa1b5d0 (diff)
parent: 388ee595330e2375a1e4c8187c17de7ea9fb2f6f (diff)
download: numpy-6e8b869d52ec5a1242df69bcd9323a4b0947933b.tar.gz
1 files changed, 90 insertions, 0 deletions
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index b29012bcb..b127f65f6 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1234,6 +1234,96 @@ class TestHistogram(TestCase):
         assert_array_equal(b, np.array([0, 1]))
 
 
+class TestHistogramOptimBinNums(TestCase):
+    """
+    Test coverage for the string estimators of the optimal number of bins
+    """
+
+    def test_empty(self):
+        estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+        # every estimator must yield one empty bin with edges [0, 1] on empty data
+        for estimator in estimator_list:
+            a, b = np.histogram([], bins=estimator)
+            assert_array_equal(a, np.array([0]))
+            assert_array_equal(b, np.array([0, 1]))
+
+    def test_simple(self):
+        """
+        Straightforward testing with a mixture of linspace data (for consistency).
+        All test values have been precomputed and the values shouldn't change
+        """
+        # basic sanity check on fixed data: maps data size -> {estimator: expected number of bins}
+        basic_test = {50:   {'fd': 4,  'scott': 4,  'rice': 8,  'sturges': 7,  'auto': 7},
+                      500:  {'fd': 8,  'scott': 8,  'rice': 16, 'sturges': 10, 'auto': 10},
+                      5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}}
+
+        for testlen, expectedResults in basic_test.items():
+            # 2 peak uniform mixture (2/5 + 3/5 of testlen); `//` keeps linspace's num an integer
+            x1 = np.linspace(-10, -1, testlen // 5 * 2)
+            x2 = np.linspace(1, 10, testlen // 5 * 3)
+            x = np.hstack((x1, x2))
+            for estimator, numbins in expectedResults.items():
+                a, b = np.histogram(x, estimator)
+                assert_equal(len(a), numbins,
+                             err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen))
+
+    def test_small(self):
+        """
+        Smaller datasets have the potential to cause issues with the data adaptive methods
+        Especially the FD method
+        All bin numbers have been precalculated
+        """
+        small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1},
+                     2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2},
+                     3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3}}
+
+        for testlen, expectedResults in small_dat.items():
+            testdat = np.arange(testlen)
+            for estimator, expbins in expectedResults.items():
+                a, b = np.histogram(testdat, estimator)
+                assert_equal(len(a), expbins,
+                             err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen))
+
+    def test_incorrect_methods(self):
+        """
+        Check a ValueError is raised when an unknown string is passed in
+        """
+        check_list = ['mad', 'freeman', 'histograms', 'IQR']
+        for estimator in check_list:
+            assert_raises(ValueError, np.histogram, [1, 2, 3], estimator)
+
+    def test_novariance(self):
+        """
+        Check that methods handle no variance in data
+        Primarily for Scott and FD as the SD and IQR are both 0 in this case
+        """
+        novar_dataset = np.ones(100)
+        novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8, 'auto': 8}
+
+        for estimator, numbins in novar_resultdict.items():
+            a, b = np.histogram(novar_dataset, estimator)
+            assert_equal(len(a), numbins,
+                         err_msg="{0} estimator, No Variance test".format(estimator))
+
+    def test_outlier(self):
+        """
+        Check the fd and scott estimators with outliers.
+        The fd determines a smaller binwidth since it's less affected by outliers;
+        since the range is so (artificially) large this means more bins,
+        most of which will be empty, but the data of interest usually is unaffected.
+        The Scott estimator is more affected and returns fewer bins, despite most of
+        the variance being in one area of the data
+        """
+        xcenter = np.linspace(-10, 10, 50)
+        outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+        outlier_resultdict = {'fd': 21, 'scott': 5}
+
+        for estimator, numbins in outlier_resultdict.items():
+            a, b = np.histogram(outlier_dataset, estimator)
+            assert_equal(len(a), numbins)
+
+
 class TestHistogramdd(TestCase):
 
     def test_simple(self):
author	seberg <sebastian@sipsolutions.net>	2015-08-15 15:53:06 +0200
committer	seberg <sebastian@sipsolutions.net>	2015-08-15 15:53:06 +0200
commit	6e8b869d52ec5a1242df69bcd9323a4b0947933b (patch)
tree	a4296d6f98c220f38cf02b2ad376cf9ab270ce4d /numpy/lib/tests/test_function_base.py
parent	c573b7170f4467d75e194ccca8a032a32fa1b5d0 (diff)
parent	388ee595330e2375a1e4c8187c17de7ea9fb2f6f (diff)
download	numpy-6e8b869d52ec5a1242df69bcd9323a4b0947933b.tar.gz