From 34c9950ccdb8a2f64916903b09c2c39233be0adf Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Sat, 9 Dec 2017 13:58:42 -0800 Subject: MAINT: Move histogram and histogramdd into their own module 800 self-contained lines are easily enough to go in their own file, as are the 500 lines of tests. For compatibility, the names are still available through `np.lib.function_base.histogram` and `from np.lib.function_base import *` For simplicity of imports, all of the unqualified `np.` names are now qualified --- numpy/lib/function_base.py | 804 +-------------------------------------------- 1 file changed, 3 insertions(+), 801 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index c9a23350d..d8e10007c 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -39,12 +39,14 @@ if sys.version_info[0] < 3: else: import builtins +# needed in this module for compatibility +from numpy.lib.histograms import histogram, histogramdd __all__ = [ 'select', 'piecewise', 'trim_zeros', 'copy', 'iterable', 'percentile', 'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp', 'flip', 'rot90', 'extract', 'place', 'vectorize', 'asarray_chkfinite', 'average', - 'histogram', 'histogramdd', 'bincount', 'digitize', 'cov', 'corrcoef', + 'bincount', 'digitize', 'cov', 'corrcoef', 'msort', 'median', 'sinc', 'hamming', 'hanning', 'bartlett', 'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring', 'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc' @@ -241,806 +243,6 @@ def iterable(y): return True -def _hist_bin_sqrt(x): - """ - Square root histogram bin estimator. - - Bin width is inversely proportional to the data size. Used by many - programs for its simplicity. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / np.sqrt(x.size) - - -def _hist_bin_sturges(x): - """ - Sturges histogram bin estimator. - - A very simplistic estimator based on the assumption of normality of - the data. This estimator has poor performance for non-normal data, - which becomes especially obvious for large data sets. The estimate - depends only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / (np.log2(x.size) + 1.0) - - -def _hist_bin_rice(x): - """ - Rice histogram bin estimator. - - Another simple estimator with no normality assumption. It has better - performance for large data than Sturges, but tends to overestimate - the number of bins. The number of bins is proportional to the cube - root of data size (asymptotically optimal). The estimate depends - only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / (2.0 * x.size ** (1.0 / 3)) - - -def _hist_bin_scott(x): - """ - Scott histogram bin estimator. - - The binwidth is proportional to the standard deviation of the data - and inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) - - -def _hist_bin_doane(x): - """ - Doane's histogram bin estimator. - - Improved version of Sturges' formula which works better for - non-normal data. See - stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - if x.size > 2: - sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) - sigma = np.std(x) - if sigma > 0.0: - # These three operations add up to - # g1 = np.mean(((x - np.mean(x)) / sigma)**3) - # but use only one temp array instead of three - temp = x - np.mean(x) - np.true_divide(temp, sigma, temp) - np.power(temp, 3, temp) - g1 = np.mean(temp) - return x.ptp() / (1.0 + np.log2(x.size) + - np.log2(1.0 + np.absolute(g1) / sg1)) - return 0.0 - - -def _hist_bin_fd(x): - """ - The Freedman-Diaconis histogram bin estimator. - - The Freedman-Diaconis rule uses interquartile range (IQR) to - estimate binwidth. It is considered a variation of the Scott rule - with more robustness as the IQR is less affected by outliers than - the standard deviation. However, the IQR depends on fewer points - than the standard deviation, so it is less accurate, especially for - long tailed distributions. - - If the IQR is 0, this function returns 1 for the number of bins. - Binwidth is inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - iqr = np.subtract(*np.percentile(x, [75, 25])) - return 2.0 * iqr * x.size ** (-1.0 / 3.0) - - -def _hist_bin_auto(x): - """ - Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators. - - The FD estimator is usually the most robust method, but its width - estimate tends to be too large for small `x`. The Sturges estimator - is quite good for small (<1000) datasets and is the default in the R - language. This method gives good off the shelf behaviour. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - - See Also - -------- - _hist_bin_fd, _hist_bin_sturges - """ - # There is no need to check for zero here. If ptp is, so is IQR and - # vice versa. Either both are zero or neither one is. - return min(_hist_bin_fd(x), _hist_bin_sturges(x)) - - -# Private dict initialized at module load time -_hist_bin_selectors = {'auto': _hist_bin_auto, - 'doane': _hist_bin_doane, - 'fd': _hist_bin_fd, - 'rice': _hist_bin_rice, - 'scott': _hist_bin_scott, - 'sqrt': _hist_bin_sqrt, - 'sturges': _hist_bin_sturges} - - -def histogram(a, bins=10, range=None, normed=False, weights=None, - density=None): - r""" - Compute the histogram of a set of data. - - Parameters - ---------- - a : array_like - Input data. The histogram is computed over the flattened array. - bins : int or sequence of scalars or str, optional - If `bins` is an int, it defines the number of equal-width - bins in the given range (10, by default). If `bins` is a - sequence, it defines the bin edges, including the rightmost - edge, allowing for non-uniform bin widths. - - .. versionadded:: 1.11.0 - - If `bins` is a string from the list below, `histogram` will use - the method chosen to calculate the optimal bin width and - consequently the number of bins (see `Notes` for more detail on - the estimators) from the data that falls within the requested - range. While the bin width will be optimal for the actual data - in the range, the number of bins will be computed to fill the - entire range, including the empty portions. For visualisation, - using the 'auto' option is suggested. Weighted data is not - supported for automated bin size selection. - - 'auto' - Maximum of the 'sturges' and 'fd' estimators. Provides good - all around performance. - - 'fd' (Freedman Diaconis Estimator) - Robust (resilient to outliers) estimator that takes into - account data variability and data size. - - 'doane' - An improved version of Sturges' estimator that works better - with non-normal datasets. - - 'scott' - Less robust estimator that that takes into account data - variability and data size. - - 'rice' - Estimator does not take variability into account, only data - size. Commonly overestimates number of bins required. - - 'sturges' - R's default method, only accounts for data size. Only - optimal for gaussian data and underestimates number of bins - for large non-gaussian datasets. - - 'sqrt' - Square root (of data size) estimator, used by Excel and - other programs for its speed and simplicity. - - range : (float, float), optional - The lower and upper range of the bins. If not provided, range - is simply ``(a.min(), a.max())``. Values outside the range are - ignored. The first element of the range must be less than or - equal to the second. `range` affects the automatic bin - computation as well. While bin width is computed to be optimal - based on the actual data within `range`, the bin count will fill - the entire range including portions containing no data. - normed : bool, optional - This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy - behavior. It will be removed in NumPy 2.0.0. Use the ``density`` - keyword instead. If ``False``, the result will contain the - number of samples in each bin. If ``True``, the result is the - value of the probability *density* function at the bin, - normalized such that the *integral* over the range is 1. Note - that this latter behavior is known to be buggy with unequal bin - widths; use ``density`` instead. - weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). If `density` is True, the weights are - normalized, so that the integral of the density over the range - remains 1. - density : bool, optional - If ``False``, the result will contain the number of samples in - each bin. If ``True``, the result is the value of the - probability *density* function at the bin, normalized such that - the *integral* over the range is 1. Note that the sum of the - histogram values will not be equal to 1 unless bins of unity - width are chosen; it is not a probability *mass* function. - - Overrides the ``normed`` keyword if given. - - Returns - ------- - hist : array - The values of the histogram. See `density` and `weights` for a - description of the possible semantics. - bin_edges : array of dtype float - Return the bin edges ``(length(hist)+1)``. - - - See Also - -------- - histogramdd, bincount, searchsorted, digitize - - Notes - ----- - All but the last (righthand-most) bin is half-open. In other words, - if `bins` is:: - - [1, 2, 3, 4] - - then the first bin is ``[1, 2)`` (including 1, but excluding 2) and - the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which - *includes* 4. - - .. versionadded:: 1.11.0 - - The methods to estimate the optimal number of bins are well founded - in literature, and are inspired by the choices R provides for - histogram visualisation. Note that having the number of bins - proportional to :math:`n^{1/3}` is asymptotically optimal, which is - why it appears in most estimators. These are simply plug-in methods - that give good starting points for number of bins. In the equations - below, :math:`h` is the binwidth and :math:`n_h` is the number of - bins. All estimators that compute bin counts are recast to bin width - using the `ptp` of the data. The final bin count is obtained from - ``np.round(np.ceil(range / h))`. - - 'Auto' (maximum of the 'Sturges' and 'FD' estimators) - A compromise to get a good value. For small datasets the Sturges - value will usually be chosen, while larger datasets will usually - default to FD. Avoids the overly conservative behaviour of FD - and Sturges for small and large datasets respectively. - Switchover point is usually :math:`a.size \approx 1000`. - - 'FD' (Freedman Diaconis Estimator) - .. math:: h = 2 \frac{IQR}{n^{1/3}} - - The binwidth is proportional to the interquartile range (IQR) - and inversely proportional to cube root of a.size. Can be too - conservative for small datasets, but is quite good for large - datasets. The IQR is very robust to outliers. - - 'Scott' - .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} - - The binwidth is proportional to the standard deviation of the - data and inversely proportional to cube root of ``x.size``. Can - be too conservative for small datasets, but is quite good for - large datasets. The standard deviation is not very robust to - outliers. Values are very similar to the Freedman-Diaconis - estimator in the absence of outliers. - - 'Rice' - .. math:: n_h = 2n^{1/3} - - The number of bins is only proportional to cube root of - ``a.size``. It tends to overestimate the number of bins and it - does not take into account data variability. - - 'Sturges' - .. math:: n_h = \log _{2}n+1 - - The number of bins is the base 2 log of ``a.size``. This - estimator assumes normality of data and is too conservative for - larger, non-normal datasets. This is the default method in R's - ``hist`` method. - - 'Doane' - .. math:: n_h = 1 + \log_{2}(n) + - \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) - - g_1 = mean[(\frac{x - \mu}{\sigma})^3] - - \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} - - An improved version of Sturges' formula that produces better - estimates for non-normal datasets. This estimator attempts to - account for the skew of the data. - - 'Sqrt' - .. math:: n_h = \sqrt n - The simplest and fastest estimator. Only takes into account the - data size. - - Examples - -------- - >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) - (array([0, 2, 1]), array([0, 1, 2, 3])) - >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) - (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) - >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) - (array([1, 4, 1]), array([0, 1, 2, 3])) - - >>> a = np.arange(5) - >>> hist, bin_edges = np.histogram(a, density=True) - >>> hist - array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) - >>> hist.sum() - 2.4999999999999996 - >>> np.sum(hist * np.diff(bin_edges)) - 1.0 - - .. versionadded:: 1.11.0 - - Automated Bin Selection Methods example, using 2 peak random data - with 2000 points: - - >>> import matplotlib.pyplot as plt - >>> rng = np.random.RandomState(10) # deterministic random data - >>> a = np.hstack((rng.normal(size=1000), - ... rng.normal(loc=5, scale=2, size=1000))) - >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram - >>> plt.title("Histogram with 'auto' bins") - >>> plt.show() - - """ - a = asarray(a) - if weights is not None: - weights = asarray(weights) - if weights.shape != a.shape: - raise ValueError( - 'weights should have the same shape as a.') - weights = weights.ravel() - a = a.ravel() - - # Do not modify the original value of range so we can check for `None` - if range is None: - if a.size == 0: - # handle empty arrays. Can't determine range, so use 0-1. - first_edge, last_edge = 0.0, 1.0 - else: - first_edge, last_edge = a.min() + 0.0, a.max() + 0.0 - else: - first_edge, last_edge = [mi + 0.0 for mi in range] - if first_edge > last_edge: - raise ValueError( - 'max must be larger than min in range parameter.') - if not np.all(np.isfinite([first_edge, last_edge])): - raise ValueError( - 'range parameter must be finite.') - if first_edge == last_edge: - first_edge -= 0.5 - last_edge += 0.5 - - # density overrides the normed keyword - if density is not None: - normed = False - - # parse the overloaded bins argument - n_equal_bins = None - bin_edges = None - - if isinstance(bins, basestring): - bin_name = bins - # if `bins` is a string for an automatic method, - # this will replace it with the number of bins calculated - if bin_name not in _hist_bin_selectors: - raise ValueError( - "{!r} is not a valid estimator for `bins`".format(bin_name)) - if weights is not None: - raise TypeError("Automated estimation of the number of " - "bins is not supported for weighted data") - # Make a reference to `a` - b = a - # Update the reference if the range needs truncation - if range is not None: - keep = (a >= first_edge) - keep &= (a <= last_edge) - if not np.logical_and.reduce(keep): - b = a[keep] - - if b.size == 0: - n_equal_bins = 1 - else: - # Do not call selectors on empty arrays - width = _hist_bin_selectors[bin_name](b) - if width: - n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) - else: - # Width can be zero for some estimators, e.g. FD when - # the IQR of the data is zero. - n_equal_bins = 1 - - elif np.ndim(bins) == 0: - try: - n_equal_bins = operator.index(bins) - except TypeError: - raise TypeError( - '`bins` must be an integer, a string, or an array') - if n_equal_bins < 1: - raise ValueError('`bins` must be positive, when an integer') - - elif np.ndim(bins) == 1: - bin_edges = np.asarray(bins) - if np.any(bin_edges[:-1] > bin_edges[1:]): - raise ValueError( - '`bins` must increase monotonically, when an array') - - else: - raise ValueError('`bins` must be 1d, when an array') - - del bins - - # compute the bins if only the count was specified - if n_equal_bins is not None: - bin_edges = linspace( - first_edge, last_edge, n_equal_bins + 1, endpoint=True) - - # Histogram is an integer or a float array depending on the weights. - if weights is None: - ntype = np.dtype(np.intp) - else: - ntype = weights.dtype - - # We set a block size, as this allows us to iterate over chunks when - # computing histograms, to minimize memory usage. - BLOCK = 65536 - - # The fast path uses bincount, but that only works for certain types - # of weight - simple_weights = ( - weights is None or - np.can_cast(weights.dtype, np.double) or - np.can_cast(weights.dtype, complex) - ) - - if n_equal_bins is not None and simple_weights: - # Fast algorithm for equal bins - # We now convert values of a to bin indices, under the assumption of - # equal bin widths (which is valid here). - - # Initialize empty histogram - n = np.zeros(n_equal_bins, ntype) - - # Pre-compute histogram scaling factor - norm = n_equal_bins / (last_edge - first_edge) - - # We iterate over blocks here for two reasons: the first is that for - # large arrays, it is actually faster (for example for a 10^8 array it - # is 2x as fast) and it results in a memory footprint 3x lower in the - # limit of large arrays. - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - if weights is None: - tmp_w = None - else: - tmp_w = weights[i:i + BLOCK] - - # Only include values in the right range - keep = (tmp_a >= first_edge) - keep &= (tmp_a <= last_edge) - if not np.logical_and.reduce(keep): - tmp_a = tmp_a[keep] - if tmp_w is not None: - tmp_w = tmp_w[keep] - tmp_a_data = tmp_a.astype(float) - tmp_a = tmp_a_data - first_edge - tmp_a *= norm - - # Compute the bin indices, and for values that lie exactly on - # last_edge we need to subtract one - indices = tmp_a.astype(np.intp) - indices[indices == n_equal_bins] -= 1 - - # The index computation is not guaranteed to give exactly - # consistent results within ~1 ULP of the bin edges. - decrement = tmp_a_data < bin_edges[indices] - indices[decrement] -= 1 - # The last bin includes the right edge. The other bins do not. - increment = ((tmp_a_data >= bin_edges[indices + 1]) - & (indices != n_equal_bins - 1)) - indices[increment] += 1 - - # We now compute the histogram using bincount - if ntype.kind == 'c': - n.real += np.bincount(indices, weights=tmp_w.real, - minlength=n_equal_bins) - n.imag += np.bincount(indices, weights=tmp_w.imag, - minlength=n_equal_bins) - else: - n += np.bincount(indices, weights=tmp_w, - minlength=n_equal_bins).astype(ntype) - else: - # Compute via cumulative histogram - cum_n = np.zeros(bin_edges.shape, ntype) - if weights is None: - for i in arange(0, len(a), BLOCK): - sa = sort(a[i:i+BLOCK]) - cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - else: - zero = array(0, dtype=ntype) - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - tmp_w = weights[i:i+BLOCK] - sorting_index = np.argsort(tmp_a) - sa = tmp_a[sorting_index] - sw = tmp_w[sorting_index] - cw = np.concatenate(([zero], sw.cumsum())) - bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - cum_n += cw[bin_index] - - n = np.diff(cum_n) - - if density: - db = array(np.diff(bin_edges), float) - return n/db/n.sum(), bin_edges - elif normed: - # deprecated, buggy behavior. Remove for NumPy 2.0.0 - db = array(np.diff(bin_edges), float) - return n/(n*db).sum(), bin_edges - else: - return n, bin_edges - - -def histogramdd(sample, bins=10, range=None, normed=False, weights=None): - """ - Compute the multidimensional histogram of some data. - - Parameters - ---------- - sample : array_like - The data to be histogrammed. It must be an (N,D) array or data - that can be converted to such. The rows of the resulting array - are the coordinates of points in a D dimensional polytope. - bins : sequence or int, optional - The bin specification: - - * A sequence of arrays describing the bin edges along each dimension. - * The number of bins for each dimension (nx, ny, ... =bins) - * The number of bins for all dimensions (nx=ny=...=bins). - - range : sequence, optional - A sequence of lower and upper bin edges to be used if the edges are - not given explicitly in `bins`. Defaults to the minimum and maximum - values along each dimension. - normed : bool, optional - If False, returns the number of samples in each bin. If True, - returns the bin density ``bin_count / sample_count / bin_volume``. - weights : (N,) array_like, optional - An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. - Weights are normalized to 1 if normed is True. If normed is False, - the values of the returned histogram are equal to the sum of the - weights belonging to the samples falling into each bin. - - Returns - ------- - H : ndarray - The multidimensional histogram of sample x. See normed and weights - for the different possible semantics. - edges : list - A list of D arrays describing the bin edges for each dimension. - - See Also - -------- - histogram: 1-D histogram - histogram2d: 2-D histogram - - Examples - -------- - >>> r = np.random.randn(100,3) - >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) - >>> H.shape, edges[0].size, edges[1].size, edges[2].size - ((5, 8, 4), 6, 9, 5) - - """ - - try: - # Sample is an ND-array. - N, D = sample.shape - except (AttributeError, ValueError): - # Sample is a sequence of 1D arrays. - sample = atleast_2d(sample).T - N, D = sample.shape - - nbin = empty(D, int) - edges = D*[None] - dedges = D*[None] - if weights is not None: - weights = asarray(weights) - - try: - M = len(bins) - if M != D: - raise ValueError( - 'The dimension of bins must be equal to the dimension of the ' - ' sample x.') - except TypeError: - # bins is an integer - bins = D*[bins] - - # Select range for each dimension - # Used only if number of bins is given. - if range is None: - # Handle empty input. Range can't be determined in that case, use 0-1. - if N == 0: - smin = zeros(D) - smax = ones(D) - else: - smin = atleast_1d(array(sample.min(0), float)) - smax = atleast_1d(array(sample.max(0), float)) - else: - if not np.all(np.isfinite(range)): - raise ValueError( - 'range parameter must be finite.') - smin = zeros(D) - smax = zeros(D) - for i in arange(D): - smin[i], smax[i] = range[i] - - # Make sure the bins have a finite width. - for i in arange(len(smin)): - if smin[i] == smax[i]: - smin[i] = smin[i] - .5 - smax[i] = smax[i] + .5 - - # avoid rounding issues for comparisons when dealing with inexact types - if np.issubdtype(sample.dtype, np.inexact): - edge_dt = sample.dtype - else: - edge_dt = float - # Create edge arrays - for i in arange(D): - if isscalar(bins[i]): - if bins[i] < 1: - raise ValueError( - "Element at index %s in `bins` should be a positive " - "integer." % i) - nbin[i] = bins[i] + 2 # +2 for outlier bins - edges[i] = linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) - else: - edges[i] = asarray(bins[i], edge_dt) - nbin[i] = len(edges[i]) + 1 # +1 for outlier bins - dedges[i] = diff(edges[i]) - if np.any(np.asarray(dedges[i]) <= 0): - raise ValueError( - "Found bin edge of size <= 0. Did you specify `bins` with" - "non-monotonic sequence?") - - nbin = asarray(nbin) - - # Handle empty input. - if N == 0: - return np.zeros(nbin-2), edges - - # Compute the bin number each sample falls into. - Ncount = {} - for i in arange(D): - Ncount[i] = digitize(sample[:, i], edges[i]) - - # Using digitize, values that fall on an edge are put in the right bin. - # For the rightmost bin, we want values equal to the right edge to be - # counted in the last bin, and not as an outlier. - for i in arange(D): - # Rounding precision - mindiff = dedges[i].min() - if not np.isinf(mindiff): - decimal = int(-log10(mindiff)) + 6 - # Find which points are on the rightmost edge. - not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) - on_edge = (around(sample[:, i], decimal) == - around(edges[i][-1], decimal)) - # Shift these points one bin to the left. - Ncount[i][nonzero(on_edge & not_smaller_than_edge)[0]] -= 1 - - # Flattened histogram matrix (1D) - # Reshape is used so that overlarge arrays - # will raise an error. - hist = zeros(nbin, float).reshape(-1) - - # Compute the sample indices in the flattened histogram matrix. - ni = nbin.argsort() - xy = zeros(N, int) - for i in arange(0, D-1): - xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() - xy += Ncount[ni[-1]] - - # Compute the number of repetitions in xy and assign it to the - # flattened histmat. - if len(xy) == 0: - return zeros(nbin-2, int), edges - - flatcount = bincount(xy, weights) - a = arange(len(flatcount)) - hist[a] = flatcount - - # Shape into a proper matrix - hist = hist.reshape(sort(nbin)) - for i in arange(nbin.size): - j = ni.argsort()[i] - hist = hist.swapaxes(i, j) - ni[i], ni[j] = ni[j], ni[i] - - # Remove outliers (indices 0 and -1 for each dimension). - core = D*[slice(1, -1)] - hist = hist[core] - - # Normalize if normed is True - if normed: - s = hist.sum() - for i in arange(D): - shape = ones(D, int) - shape[i] = nbin[i] - 2 - hist = hist / dedges[i].reshape(shape) - hist /= s - - if (hist.shape != nbin - 2).any(): - raise RuntimeError( - "Internal Shape Error") - return hist, edges - - def average(a, axis=None, weights=None, returned=False): """ Compute the weighted average along the specified axis. -- cgit v1.2.1