diff options
author | Eric Wieser <wieser.eric@gmail.com> | 2018-07-31 00:41:28 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-07-31 00:41:28 -0700 |
commit | 7f4579279a6a6aa07df664b901afa36ab3fc5ce0 (patch) | |
tree | 3524c05c661f4948eabf066b46b5ad3aaf6ad617 /numpy/lib/histograms.py | |
parent | 24960daf3e326591047eb099af840da6e95d0910 (diff) | |
parent | 9bb569c4e0e1cf08128179d157bdab10c8706a97 (diff) | |
download | numpy-7f4579279a6a6aa07df664b901afa36ab3fc5ce0.tar.gz |
Merge branch 'master' into ix_-preserve-type
Diffstat (limited to 'numpy/lib/histograms.py')
-rw-r--r-- | numpy/lib/histograms.py | 992 |
1 files changed, 992 insertions, 0 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py new file mode 100644 index 000000000..422b356f7 --- /dev/null +++ b/numpy/lib/histograms.py @@ -0,0 +1,992 @@ +""" +Histogram-related functions +""" +from __future__ import division, absolute_import, print_function + +import operator +import warnings + +import numpy as np +from numpy.compat.py3k import basestring + +__all__ = ['histogram', 'histogramdd', 'histogram_bin_edges'] + +# range is a keyword argument to many functions, so save the builtin so they can +# use it. +_range = range + + +def _hist_bin_sqrt(x): + """ + Square root histogram bin estimator. + + Bin width is inversely proportional to the data size. Used by many + programs for its simplicity. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / np.sqrt(x.size) + + +def _hist_bin_sturges(x): + """ + Sturges histogram bin estimator. + + A very simplistic estimator based on the assumption of normality of + the data. This estimator has poor performance for non-normal data, + which becomes especially obvious for large data sets. The estimate + depends only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / (np.log2(x.size) + 1.0) + + +def _hist_bin_rice(x): + """ + Rice histogram bin estimator. + + Another simple estimator with no normality assumption. It has better + performance for large data than Sturges, but tends to overestimate + the number of bins. The number of bins is proportional to the cube + root of data size (asymptotically optimal). The estimate depends + only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / (2.0 * x.size ** (1.0 / 3)) + + +def _hist_bin_scott(x): + """ + Scott histogram bin estimator. + + The binwidth is proportional to the standard deviation of the data + and inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) + + +def _hist_bin_doane(x): + """ + Doane's histogram bin estimator. + + Improved version of Sturges' formula which works better for + non-normal data. See + stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + if x.size > 2: + sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) + sigma = np.std(x) + if sigma > 0.0: + # These three operations add up to + # g1 = np.mean(((x - np.mean(x)) / sigma)**3) + # but use only one temp array instead of three + temp = x - np.mean(x) + np.true_divide(temp, sigma, temp) + np.power(temp, 3, temp) + g1 = np.mean(temp) + return x.ptp() / (1.0 + np.log2(x.size) + + np.log2(1.0 + np.absolute(g1) / sg1)) + return 0.0 + + +def _hist_bin_fd(x): + """ + The Freedman-Diaconis histogram bin estimator. + + The Freedman-Diaconis rule uses interquartile range (IQR) to + estimate binwidth. It is considered a variation of the Scott rule + with more robustness as the IQR is less affected by outliers than + the standard deviation. However, the IQR depends on fewer points + than the standard deviation, so it is less accurate, especially for + long tailed distributions. + + If the IQR is 0, this function returns 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + iqr = np.subtract(*np.percentile(x, [75, 25])) + return 2.0 * iqr * x.size ** (-1.0 / 3.0) + + +def _hist_bin_auto(x): + """ + Histogram bin estimator that uses the minimum width of the + Freedman-Diaconis and Sturges estimators if the FD bandwidth is non zero + and the Sturges estimator if the FD bandwidth is 0. + + The FD estimator is usually the most robust method, but its width + estimate tends to be too large for small `x` and bad for data with limited + variance. The Sturges estimator is quite good for small (<1000) datasets + and is the default in the R language. This method gives good off the shelf + behaviour. + + .. versionchanged:: 1.15.0 + If there is limited variance the IQR can be 0, which results in the + FD bin width being 0 too. This is not a valid bin width, so + ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. + If the IQR is 0, it's unlikely any variance based estimators will be of + use, so we revert to the sturges estimator, which only uses the size of the + dataset in its calculation. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + + See Also + -------- + _hist_bin_fd, _hist_bin_sturges + """ + fd_bw = _hist_bin_fd(x) + sturges_bw = _hist_bin_sturges(x) + if fd_bw: + return min(fd_bw, sturges_bw) + else: + # limited variance, so we return a len dependent bw estimator + return sturges_bw + +# Private dict initialized at module load time +_hist_bin_selectors = {'auto': _hist_bin_auto, + 'doane': _hist_bin_doane, + 'fd': _hist_bin_fd, + 'rice': _hist_bin_rice, + 'scott': _hist_bin_scott, + 'sqrt': _hist_bin_sqrt, + 'sturges': _hist_bin_sturges} + + +def _ravel_and_check_weights(a, weights): + """ Check a and weights have matching shapes, and ravel both """ + a = np.asarray(a) + if weights is not None: + weights = np.asarray(weights) + if weights.shape != a.shape: + raise ValueError( + 'weights should have the same shape as a.') + weights = weights.ravel() + a = a.ravel() + return a, weights + + +def _get_outer_edges(a, range): + """ + Determine the outer bin edges to use, from either the data or the range + argument + """ + if range is not None: + first_edge, last_edge = range + if first_edge > last_edge: + raise ValueError( + 'max must be larger than min in range parameter.') + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError( + "supplied range of [{}, {}] is not finite".format(first_edge, last_edge)) + elif a.size == 0: + # handle empty arrays. Can't determine range, so use 0-1. + first_edge, last_edge = 0, 1 + else: + first_edge, last_edge = a.min(), a.max() + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError( + "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge)) + + # expand empty range to avoid divide by zero + if first_edge == last_edge: + first_edge = first_edge - 0.5 + last_edge = last_edge + 0.5 + + return first_edge, last_edge + + +def _get_bin_edges(a, bins, range, weights): + """ + Computes the bins used internally by `histogram`. + + Parameters + ========== + a : ndarray + Ravelled data array + bins, range + Forwarded arguments from `histogram`. + weights : ndarray, optional + Ravelled weights array, or None + + Returns + ======= + bin_edges : ndarray + Array of bin edges + uniform_bins : (Number, Number, int): + The upper bound, lowerbound, and number of bins, used in the optimized + implementation of `histogram` that works on uniform bins. + """ + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + + if isinstance(bins, basestring): + bin_name = bins + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + if bin_name not in _hist_bin_selectors: + raise ValueError( + "{!r} is not a valid estimator for `bins`".format(bin_name)) + if weights is not None: + raise TypeError("Automated estimation of the number of " + "bins is not supported for weighted data") + + first_edge, last_edge = _get_outer_edges(a, range) + + # truncate the range if needed + if range is not None: + keep = (a >= first_edge) + keep &= (a <= last_edge) + if not np.logical_and.reduce(keep): + a = a[keep] + + if a.size == 0: + n_equal_bins = 1 + else: + # Do not call selectors on empty arrays + width = _hist_bin_selectors[bin_name](a) + if width: + n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) + else: + # Width can be zero for some estimators, e.g. FD when + # the IQR of the data is zero. + n_equal_bins = 1 + + elif np.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: + raise TypeError( + '`bins` must be an integer, a string, or an array') + if n_equal_bins < 1: + raise ValueError('`bins` must be positive, when an integer') + + first_edge, last_edge = _get_outer_edges(a, range) + + elif np.ndim(bins) == 1: + bin_edges = np.asarray(bins) + if np.any(bin_edges[:-1] > bin_edges[1:]): + raise ValueError( + '`bins` must increase monotonically, when an array') + + else: + raise ValueError('`bins` must be 1d, when an array') + + if n_equal_bins is not None: + # gh-10322 means that type resolution rules are dependent on array + # shapes. To avoid this causing problems, we pick a type now and stick + # with it throughout. + bin_type = np.result_type(first_edge, last_edge, a) + if np.issubdtype(bin_type, np.integer): + bin_type = np.result_type(bin_type, float) + + # bin edges must be computed + bin_edges = np.linspace( + first_edge, last_edge, n_equal_bins + 1, + endpoint=True, dtype=bin_type) + return bin_edges, (first_edge, last_edge, n_equal_bins) + else: + return bin_edges, None + + +def _search_sorted_inclusive(a, v): + """ + Like `searchsorted`, but where the last item in `v` is placed on the right. + + In the context of a histogram, this makes the last bin edge inclusive + """ + return np.concatenate(( + a.searchsorted(v[:-1], 'left'), + a.searchsorted(v[-1:], 'right') + )) + + +def histogram_bin_edges(a, bins=10, range=None, weights=None): + r""" + Function to calculate only the edges of the bins used by the `histogram` function. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened array. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + If `bins` is a string from the list below, `histogram_bin_edges` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. + + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). This is currently not used by any of the bin estimators, + but may be in the future. + + Returns + ------- + bin_edges : array of dtype float + The edges to pass into `histogram` + + See Also + -------- + histogram + + Notes + ----- + The methods to estimate the optimal number of bins are well founded + in literature, and are inspired by the choices R provides for + histogram visualisation. Note that having the number of bins + proportional to :math:`n^{1/3}` is asymptotically optimal, which is + why it appears in most estimators. These are simply plug-in methods + that give good starting points for number of bins. In the equations + below, :math:`h` is the binwidth and :math:`n_h` is the number of + bins. All estimators that compute bin counts are recast to bin width + using the `ptp` of the data. The final bin count is obtained from + ``np.round(np.ceil(range / h))``. + + 'Auto' (maximum of the 'Sturges' and 'FD' estimators) + A compromise to get a good value. For small datasets the Sturges + value will usually be chosen, while larger datasets will usually + default to FD. Avoids the overly conservative behaviour of FD + and Sturges for small and large datasets respectively. + Switchover point is usually :math:`a.size \approx 1000`. + + 'FD' (Freedman Diaconis Estimator) + .. math:: h = 2 \frac{IQR}{n^{1/3}} + + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good for large + datasets. The IQR is very robust to outliers. + + 'Scott' + .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} + + The binwidth is proportional to the standard deviation of the + data and inversely proportional to cube root of ``x.size``. Can + be too conservative for small datasets, but is quite good for + large datasets. The standard deviation is not very robust to + outliers. Values are very similar to the Freedman-Diaconis + estimator in the absence of outliers. + + 'Rice' + .. math:: n_h = 2n^{1/3} + + The number of bins is only proportional to cube root of + ``a.size``. It tends to overestimate the number of bins and it + does not take into account data variability. + + 'Sturges' + .. math:: n_h = \log _{2}n+1 + + The number of bins is the base 2 log of ``a.size``. This + estimator assumes normality of data and is too conservative for + larger, non-normal datasets. This is the default method in R's + ``hist`` method. + + 'Doane' + .. math:: n_h = 1 + \log_{2}(n) + + \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) + + g_1 = mean[(\frac{x - \mu}{\sigma})^3] + + \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} + + An improved version of Sturges' formula that produces better + estimates for non-normal datasets. This estimator attempts to + account for the skew of the data. + + 'Sqrt' + .. math:: n_h = \sqrt n + The simplest and fastest estimator. Only takes into account the + data size. + + Examples + -------- + >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5]) + >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1)) + array([0. , 0.25, 0.5 , 0.75, 1. ]) + >>> np.histogram_bin_edges(arr, bins=2) + array([0. , 2.5, 5. ]) + + For consistency with histogram, an array of pre-computed bins is + passed through unmodified: + + >>> np.histogram_bin_edges(arr, [1, 2]) + array([1, 2]) + + This function allows one set of bins to be computed, and reused across + multiple histograms: + + >>> shared_bins = np.histogram_bin_edges(arr, bins='auto') + >>> shared_bins + array([0., 1., 2., 3., 4., 5.]) + + >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1]) + >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins) + >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins) + + >>> hist_0; hist_1 + array([1, 1, 0, 1, 0]) + array([2, 0, 1, 1, 2]) + + Which gives more easily comparable results than using separate bins for + each histogram: + + >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto') + >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto') + >>> hist_0; hist1 + array([1, 1, 1]) + array([2, 1, 1, 2]) + >>> bins_0; bins_1 + array([0., 1., 2., 3.]) + array([0. , 1.25, 2.5 , 3.75, 5. ]) + + """ + a, weights = _ravel_and_check_weights(a, weights) + bin_edges, _ = _get_bin_edges(a, bins, range, weights) + return bin_edges + + +def histogram(a, bins=10, range=None, normed=None, weights=None, + density=None): + r""" + Compute the histogram of a set of data. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened array. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines a monotonically increasing array of bin edges, + including the rightmost edge, allowing for non-uniform bin widths. + + .. versionadded:: 1.11.0 + + If `bins` is a string, it defines the method used to calculate the + optimal bin width, as defined by `histogram_bin_edges`. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + normed : bool, optional + + .. deprecated:: 1.6.0 + + This is equivalent to the `density` argument, but produces incorrect + results for unequal bin widths. It should not be used. + + .. versionchanged:: 1.15.0 + DeprecationWarnings are actually emitted. + + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + + Overrides the ``normed`` keyword if given. + + Returns + ------- + hist : array + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : array of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, bincount, searchsorted, digitize, histogram_bin_edges + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, + if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and + the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which + *includes* 4. + + + Examples + -------- + >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) + (array([0, 2, 1]), array([0, 1, 2, 3])) + >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) + (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) + >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) + (array([1, 4, 1]), array([0, 1, 2, 3])) + + >>> a = np.arange(5) + >>> hist, bin_edges = np.histogram(a, density=True) + >>> hist + array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) + >>> hist.sum() + 2.4999999999999996 + >>> np.sum(hist * np.diff(bin_edges)) + 1.0 + + .. versionadded:: 1.11.0 + + Automated Bin Selection Methods example, using 2 peak random data + with 2000 points: + + >>> import matplotlib.pyplot as plt + >>> rng = np.random.RandomState(10) # deterministic random data + >>> a = np.hstack((rng.normal(size=1000), + ... rng.normal(loc=5, scale=2, size=1000))) + >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram + >>> plt.title("Histogram with 'auto' bins") + >>> plt.show() + + """ + a, weights = _ravel_and_check_weights(a, weights) + + bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights) + + # Histogram is an integer or a float array depending on the weights. + if weights is None: + ntype = np.dtype(np.intp) + else: + ntype = weights.dtype + + # We set a block size, as this allows us to iterate over chunks when + # computing histograms, to minimize memory usage. + BLOCK = 65536 + + # The fast path uses bincount, but that only works for certain types + # of weight + simple_weights = ( + weights is None or + np.can_cast(weights.dtype, np.double) or + np.can_cast(weights.dtype, complex) + ) + + if uniform_bins is not None and simple_weights: + # Fast algorithm for equal bins + # We now convert values of a to bin indices, under the assumption of + # equal bin widths (which is valid here). + first_edge, last_edge, n_equal_bins = uniform_bins + + # Initialize empty histogram + n = np.zeros(n_equal_bins, ntype) + + # Pre-compute histogram scaling factor + norm = n_equal_bins / (last_edge - first_edge) + + # We iterate over blocks here for two reasons: the first is that for + # large arrays, it is actually faster (for example for a 10^8 array it + # is 2x as fast) and it results in a memory footprint 3x lower in the + # limit of large arrays. + for i in _range(0, len(a), BLOCK): + tmp_a = a[i:i+BLOCK] + if weights is None: + tmp_w = None + else: + tmp_w = weights[i:i + BLOCK] + + # Only include values in the right range + keep = (tmp_a >= first_edge) + keep &= (tmp_a <= last_edge) + if not np.logical_and.reduce(keep): + tmp_a = tmp_a[keep] + if tmp_w is not None: + tmp_w = tmp_w[keep] + + # This cast ensures no type promotions occur below, which gh-10322 + # make unpredictable. Getting it wrong leads to precision errors + # like gh-8123. + tmp_a = tmp_a.astype(bin_edges.dtype, copy=False) + + # Compute the bin indices, and for values that lie exactly on + # last_edge we need to subtract one + f_indices = (tmp_a - first_edge) * norm + indices = f_indices.astype(np.intp) + indices[indices == n_equal_bins] -= 1 + + # The index computation is not guaranteed to give exactly + # consistent results within ~1 ULP of the bin edges. + decrement = tmp_a < bin_edges[indices] + indices[decrement] -= 1 + # The last bin includes the right edge. The other bins do not. + increment = ((tmp_a >= bin_edges[indices + 1]) + & (indices != n_equal_bins - 1)) + indices[increment] += 1 + + # We now compute the histogram using bincount + if ntype.kind == 'c': + n.real += np.bincount(indices, weights=tmp_w.real, + minlength=n_equal_bins) + n.imag += np.bincount(indices, weights=tmp_w.imag, + minlength=n_equal_bins) + else: + n += np.bincount(indices, weights=tmp_w, + minlength=n_equal_bins).astype(ntype) + else: + # Compute via cumulative histogram + cum_n = np.zeros(bin_edges.shape, ntype) + if weights is None: + for i in _range(0, len(a), BLOCK): + sa = np.sort(a[i:i+BLOCK]) + cum_n += _search_sorted_inclusive(sa, bin_edges) + else: + zero = np.zeros(1, dtype=ntype) + for i in _range(0, len(a), BLOCK): + tmp_a = a[i:i+BLOCK] + tmp_w = weights[i:i+BLOCK] + sorting_index = np.argsort(tmp_a) + sa = tmp_a[sorting_index] + sw = tmp_w[sorting_index] + cw = np.concatenate((zero, sw.cumsum())) + bin_index = _search_sorted_inclusive(sa, bin_edges) + cum_n += cw[bin_index] + + n = np.diff(cum_n) + + # density overrides the normed keyword + if density is not None: + if normed is not None: + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "The normed argument is ignored when density is provided. " + "In future passing both will result in an error.", + DeprecationWarning, stacklevel=2) + normed = None + + if density: + db = np.array(np.diff(bin_edges), float) + return n/db/n.sum(), bin_edges + elif normed: + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "Passing `normed=True` on non-uniform bins has always been " + "broken, and computes neither the probability density " + "function nor the probability mass function. " + "The result is only correct if the bins are uniform, when " + "density=True will produce the same result anyway. " + "The argument will be removed in a future version of " + "numpy.", + np.VisibleDeprecationWarning, stacklevel=2) + + # this normalization is incorrect, but + db = np.array(np.diff(bin_edges), float) + return n/(n*db).sum(), bin_edges + else: + if normed is not None: + # 2018-06-13, numpy 1.15.0 (this was not noisily deprecated in 1.6) + warnings.warn( + "Passing normed=False is deprecated, and has no effect. " + "Consider passing the density argument instead.", + DeprecationWarning, stacklevel=2) + return n, bin_edges + + +def histogramdd(sample, bins=10, range=None, normed=None, weights=None, + density=None): + """ + Compute the multidimensional histogram of some data. + + Parameters + ---------- + sample : (N, D) array, or (D, N) array_like + The data to be histogrammed. + + Note the unusual interpretation of sample when an array_like: + + * When an array, each row is a coordinate in a D-dimensional space - + such as ``histogramgramdd(np.array([p1, p2, p3]))``. + * When an array_like, each element is the list of values for single + coordinate - such as ``histogramgramdd((X, Y, Z))``. + + The first form should be preferred. + + bins : sequence or int, optional + The bin specification: + + * A sequence of arrays describing the monotonically increasing bin + edges along each dimension. + * The number of bins for each dimension (nx, ny, ... =bins) + * The number of bins for all dimensions (nx=ny=...=bins). + + range : sequence, optional + A sequence of length D, each an optional (lower, upper) tuple giving + the outer bin edges to be used if the edges are not given explicitly in + `bins`. + An entry of None in the sequence results in the minimum and maximum + values being used for the corresponding dimension. + The default, None, is equivalent to passing a tuple of D None values. + density : bool, optional + If False, the default, returns the number of samples in each bin. + If True, returns the probability *density* function at the bin, + ``bin_count / sample_count / bin_volume``. + normed : bool, optional + An alias for the density argument that behaves identically. To avoid + confusion with the broken normed argument to `histogram`, `density` + should be preferred. + weights : (N,) array_like, optional + An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. + Weights are normalized to 1 if normed is True. If normed is False, + the values of the returned histogram are equal to the sum of the + weights belonging to the samples falling into each bin. + + Returns + ------- + H : ndarray + The multidimensional histogram of sample x. See normed and weights + for the different possible semantics. + edges : list + A list of D arrays describing the bin edges for each dimension. + + See Also + -------- + histogram: 1-D histogram + histogram2d: 2-D histogram + + Examples + -------- + >>> r = np.random.randn(100,3) + >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) + >>> H.shape, edges[0].size, edges[1].size, edges[2].size + ((5, 8, 4), 6, 9, 5) + + """ + + try: + # Sample is an ND-array. + N, D = sample.shape + except (AttributeError, ValueError): + # Sample is a sequence of 1D arrays. + sample = np.atleast_2d(sample).T + N, D = sample.shape + + nbin = np.empty(D, int) + edges = D*[None] + dedges = D*[None] + if weights is not None: + weights = np.asarray(weights) + + try: + M = len(bins) + if M != D: + raise ValueError( + 'The dimension of bins must be equal to the dimension of the ' + ' sample x.') + except TypeError: + # bins is an integer + bins = D*[bins] + + # normalize the range argument + if range is None: + range = (None,) * D + elif len(range) != D: + raise ValueError('range argument must have one entry per dimension') + + # Create edge arrays + for i in _range(D): + if np.ndim(bins[i]) == 0: + if bins[i] < 1: + raise ValueError( + '`bins[{}]` must be positive, when an integer'.format(i)) + smin, smax = _get_outer_edges(sample[:,i], range[i]) + edges[i] = np.linspace(smin, smax, bins[i] + 1) + elif np.ndim(bins[i]) == 1: + edges[i] = np.asarray(bins[i]) + if np.any(edges[i][:-1] > edges[i][1:]): + raise ValueError( + '`bins[{}]` must be monotonically increasing, when an array' + .format(i)) + else: + raise ValueError( + '`bins[{}]` must be a scalar or 1d array'.format(i)) + + nbin[i] = len(edges[i]) + 1 # includes an outlier on each end + dedges[i] = np.diff(edges[i]) + + # Compute the bin number each sample falls into. + Ncount = tuple( + # avoid np.digitize to work around gh-11022 + np.searchsorted(edges[i], sample[:, i], side='right') + for i in _range(D) + ) + + # Using digitize, values that fall on an edge are put in the right bin. + # For the rightmost bin, we want values equal to the right edge to be + # counted in the last bin, and not as an outlier. + for i in _range(D): + # Find which points are on the rightmost edge. + on_edge = (sample[:, i] == edges[i][-1]) + # Shift these points one bin to the left. + Ncount[i][on_edge] -= 1 + + # Compute the sample indices in the flattened histogram matrix. + # This raises an error if the array is too large. + xy = np.ravel_multi_index(Ncount, nbin) + + # Compute the number of repetitions in xy and assign it to the + # flattened histmat. + hist = np.bincount(xy, weights, minlength=nbin.prod()) + + # Shape into a proper matrix + hist = hist.reshape(nbin) + + # This preserves the (bad) behavior observed in gh-7845, for now. + hist = hist.astype(float, casting='safe') + + # Remove outliers (indices 0 and -1 for each dimension). + core = D*(slice(1, -1),) + hist = hist[core] + + # handle the aliasing normed argument + if normed is None: + if density is None: + density = False + elif density is None: + # an explicit normed argument was passed, alias it to the new name + density = normed + else: + raise TypeError("Cannot specify both 'normed' and 'density'") + + if density: + # calculate the probability density function + s = hist.sum() + for i in _range(D): + shape = np.ones(D, int) + shape[i] = nbin[i] - 2 + hist = hist / dedges[i].reshape(shape) + hist /= s + + if (hist.shape != nbin - 2).any(): + raise RuntimeError( + "Internal Shape Error") + return hist, edges |