summaryrefslogtreecommitdiff
path: root/numpy/lib
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/lib')
-rw-r--r--numpy/lib/function_base.py222
-rw-r--r--numpy/lib/tests/test_function_base.py95
2 files changed, 252 insertions, 65 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 4c02d7c7a..c813b49c1 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -11,6 +11,7 @@ __all__ = ['logspace', 'linspace',
'add_docstring', 'meshgrid', 'delete', 'insert', 'append',
'interp'
]
+import warnings
import types
import numpy.core.numeric as _nx
@@ -97,78 +98,173 @@ def iterable(y):
except: return 0
return 1
-def histogram(a, bins=10, range=None, normed=False):
+def histogram(a, bins=10, range=None, normed=False, weights=None, new=False):
"""Compute the histogram from a set of data.
- Parameters:
-
- a : array
- The data to histogram. n-D arrays will be flattened.
-
- bins : int or sequence of floats
- If an int, then the number of equal-width bins in the given range.
- Otherwise, a sequence of the lower bound of each bin.
-
- range : (float, float)
- The lower and upper range of the bins. If not provided, then
- (a.min(), a.max()) is used. Values outside of this range are
- allocated to the closest bin.
-
- normed : bool
- If False, the result array will contain the number of samples in
- each bin. If True, the result array is the value of the
- probability *density* function at the bin normalized such that the
- *integral* over the range is 1. Note that the sum of all of the
- histogram values will not usually be 1; it is not a probability
- *mass* function.
-
- Returns:
-
- hist : array
- The values of the histogram. See `normed` for a description of the
- possible semantics.
+ Parameters
+ ----------
- lower_edges : float array
- The lower edges of each bin.
+ a : array
+ The data to histogram.
+
+ bins : int or sequence
+ If an int, then the number of equal-width bins in the given range.
+ If new=True, bins can also be the bin edges, allowing for non-constant
+ bin widths.
+
+ range : (float, float)
+ The lower and upper range of the bins. If not provided, then
+ range is simply (a.min(), a.max()). Using new=False, lower than range
+ are ignored, and values higher than range are tallied in the rightmost
+ bin. Using new=True, both lower and upper outliers are ignored.
+
+ normed : bool
+ If False, the result array will contain the number of samples in
+ each bin. If True, the result array is the value of the
+ probability *density* function at the bin normalized such that the
+ *integral* over the range is 1. Note that the sum of all of the
+ histogram values will not usually be 1; it is not a probability
+ *mass* function.
+
+ weights : array
+ An array of weights, the same shape as a. If normed is False, the
+ histogram is computed by summing the weights of the values falling into
+ each bin. If normed is True, the weights are normalized, so that the
+ integral of the density over the range is 1. This option is only
+ available with new=True.
+
+ new : bool
+ Compatibility argument to transition from the old version (v1.1) to
+ the new version (v1.2).
+
+
+ Return
+ ------
+ hist : array
+ The values of the histogram. See `normed` and `weights` for a
+ description of the possible semantics.
+
+ bin_edges : float array
+ With new=False, return the left bin edges (length(hist)).
+ With new=True, return the bin edges (length(hist)+1).
SeeAlso:
histogramdd
"""
- a = asarray(a).ravel()
-
- if (range is not None):
- mn, mx = range
- if (mn > mx):
- raise AttributeError, 'max must be larger than min in range parameter.'
-
- if not iterable(bins):
- if range is None:
- range = (a.min(), a.max())
- mn, mx = [mi+0.0 for mi in range]
- if mn == mx:
- mn -= 0.5
- mx += 0.5
- bins = linspace(mn, mx, bins, endpoint=False)
- else:
- bins = asarray(bins)
- if (bins[1:]-bins[:-1] < 0).any():
- raise AttributeError, 'bins must increase monotonically.'
-
- # best block size probably depends on processor cache size
- block = 65536
- n = sort(a[:block]).searchsorted(bins)
- for i in xrange(block, a.size, block):
- n += sort(a[i:i+block]).searchsorted(bins)
- n = concatenate([n, [len(a)]])
- n = n[1:]-n[:-1]
-
- if normed:
- db = bins[1] - bins[0]
- return 1.0/(a.size*db) * n, bins
- else:
- return n, bins
+ # Old behavior
+ if new is False:
+ warnings.warn("""
+ The semantics of histogram will be modified in
+ release 1.2 to improve outlier handling. The new behavior can be
+ obtained using new=True. Note that the new version accepts/returns
+ the bin edges instead of the left bin edges.
+ Please read the docstring for more information.""", FutureWarning)
+ a = asarray(a).ravel()
+
+ if (range is not None):
+ mn, mx = range
+ if (mn > mx):
+ raise AttributeError, \
+ 'max must be larger than min in range parameter.'
+
+ if not iterable(bins):
+ if range is None:
+ range = (a.min(), a.max())
+ else:
+ warnings.warn("""Outliers handling will change in version 1.2.
+ Please read the docstring for details.""", FutureWarning)
+ mn, mx = [mi+0.0 for mi in range]
+ if mn == mx:
+ mn -= 0.5
+ mx += 0.5
+ bins = linspace(mn, mx, bins, endpoint=False)
+ else:
+ raise ValueError, 'Use new=True to pass bin edges explicitly.'
+
+ if weights is not None:
+ raise ValueError, 'weights are only available with new=True.'
+
+ # best block size probably depends on processor cache size
+ block = 65536
+ n = sort(a[:block]).searchsorted(bins)
+ for i in xrange(block, a.size, block):
+ n += sort(a[i:i+block]).searchsorted(bins)
+ n = concatenate([n, [len(a)]])
+ n = n[1:]-n[:-1]
+
+ if normed:
+ db = bins[1] - bins[0]
+ return 1.0/(a.size*db) * n, bins
+ else:
+ return n, bins
+
+
+
+ # New behavior
+ elif new is True:
+ a = asarray(a)
+ if weights is not None:
+ weights = asarray(weights)
+ if np.any(weights.shape != a.shape):
+ raise ValueError, 'weights should have the same shape as a.'
+ weights = weights.ravel()
+ a = a.ravel()
+
+ if (range is not None):
+ mn, mx = range
+ if (mn > mx):
+ raise AttributeError, \
+ 'max must be larger than min in range parameter.'
+
+ if not iterable(bins):
+ if range is None:
+ range = (a.min(), a.max())
+ mn, mx = [mi+0.0 for mi in range]
+ if mn == mx:
+ mn -= 0.5
+ mx += 0.5
+ bins = linspace(mn, mx, bins+1, endpoint=True)
+ else:
+ bins = asarray(bins)
+ if (np.diff(bins) < 0).any():
+ raise AttributeError, 'bins must increase monotonically.'
+
+ # Histogram is an integer or a float array depending on the weights.
+ if weights is None:
+ ntype = int
+ else:
+ ntype = weights.dtype
+ n = np.zeros(bins.shape, ntype)
+
+ block = 65536
+ if weights is None:
+ for i in xrange(0, a.size, block):
+ sa = sort(a[:block])
+ n += np.r_[sa.searchsorted(bins[:-1], 'left'), \
+ sa.searchsorted(bins[-1], 'right')]
+ else:
+ zero = array(0, dtype=ntype)
+ for i in xrange(0, a.size, block):
+ tmp_a = a[:block]
+ tmp_w = weights[:block]
+ sorting_index = np.argsort(tmp_a)
+ sa = tmp_a[sorting_index]
+ sw = tmp_w[sorting_index]
+ cw = np.concatenate(([zero,], sw.cumsum()))
+ bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'), \
+ sa.searchsorted(bins[-1], 'right')]
+ n += cw[bin_index]
+
+ n = np.diff(n)
+
+ if normed is False:
+ return n, bins
+ elif normed is True:
+ db = array(np.diff(bins), float)
+ return n/(n*db).sum(), bins
+
def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
"""histogramdd(sample, bins=10, range=None, normed=False, weights=None)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 52c8cb4d0..ba0c5c860 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -5,6 +5,7 @@ set_package_path()
import numpy.lib;reload(numpy.lib)
from numpy.lib import *
from numpy.core import *
+
del sys.path[0]
class TestAny(NumpyTestCase):
@@ -432,12 +433,102 @@ class TestHistogram(NumpyTestCase):
v=rand(n)
(a,b)=histogram(v)
#check if the sum of the bins equals the number of samples
- assert(sum(a,axis=0)==n)
+ assert_equal(sum(a,axis=0), n)
#check that the bin counts are evenly spaced when the data is from a
# linear function
(a,b)=histogram(linspace(0,10,100))
- assert(all(a==10))
+ assert_array_equal(a, 10)
+ def check_simple_new(self):
+ n=100
+ v=rand(n)
+ (a,b)=histogram(v, new=True)
+ #check if the sum of the bins equals the number of samples
+ assert_equal(sum(a,axis=0), n)
+ #check that the bin counts are evenly spaced when the data is from a
+ # linear function
+ (a,b)=histogram(linspace(0,10,100), new=True)
+ assert_array_equal(a, 10)
+
+ def check_normed_new(self):
+ # Check that the integral of the density equals 1.
+ n = 100
+ v = rand(n)
+ a,b = histogram(v, normed=True, new=True)
+ area = sum(a*diff(b))
+ assert_almost_equal(area, 1)
+
+ # Check with non constant bin width
+ v = rand(n)*10
+ bins = [0,1,5, 9, 10]
+ a,b = histogram(v, bins, normed=True, new=True)
+ area = sum(a*diff(b))
+ assert_almost_equal(area, 1)
+
+
+ def check_outliers_new(self):
+ # Check that outliers are not tallied
+ a = arange(10)+.5
+
+ # Lower outliers
+ h,b = histogram(a, range=[0,9], new=True)
+ assert_equal(h.sum(),9)
+
+ # Upper outliers
+ h,b = histogram(a, range=[1,10], new=True)
+ assert_equal(h.sum(),9)
+
+ # Normalization
+ h,b = histogram(a, range=[1,9], normed=True, new=True)
+ assert_equal((h*diff(b)).sum(),1)
+
+ # Weights
+ w = arange(10)+.5
+ h,b = histogram(a, range=[1,9], weights=w, normed=True, new=True)
+ assert_equal((h*diff(b)).sum(),1)
+
+ h,b = histogram(a, bins=8, range=[1,9], weights=w, new=True)
+ assert_equal(h, w[1:-1])
+
+
+ def check_type_new(self):
+ # Check the type of the returned histogram
+ a = arange(10)+.5
+ h,b = histogram(a, new=True)
+ assert(issubdtype(h.dtype, int))
+
+ h,b = histogram(a, normed=True, new=True)
+ assert(issubdtype(h.dtype, float))
+
+ h,b = histogram(a, weights=ones(10, int), new=True)
+ assert(issubdtype(h.dtype, int))
+
+ h,b = histogram(a, weights=ones(10, float), new=True)
+ assert(issubdtype(h.dtype, float))
+
+
+ def check_weights_new(self):
+ v = rand(100)
+ w = ones(100)*5
+ a,b = histogram(v,new=True)
+ na,nb = histogram(v, normed=True, new=True)
+ wa,wb = histogram(v, weights=w, new=True)
+ nwa,nwb = histogram(v, weights=w, normed=True, new=True)
+ assert_array_almost_equal(a*5, wa)
+ assert_array_almost_equal(na, nwa)
+
+ # Check weights are properly applied.
+ v = linspace(0,10,10)
+ w = concatenate((zeros(5), ones(5)))
+ wa,wb = histogram(v, bins=arange(11),weights=w, new=True)
+ assert_array_almost_equal(wa, w)
+
+ # Check with integer weights
+ wa, wb = histogram([1,2,2,4], bins=4, weights=[4,3,2,1], new=True)
+ assert_array_equal(wa, [4,5,0,1])
+ wa, wb = histogram([1,2,2,4], bins=4, weights=[4,3,2,1], normed=True, new=True)
+ assert_array_equal(wa, array([4,5,0,1])/10./3.*4)
+
class TestHistogramdd(NumpyTestCase):
def check_simple(self):
x = array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], \