Diffstat (limited to 'numpy/lib')
-rw-r--r--  numpy/lib/__init__.py                  |   2
-rw-r--r--  numpy/lib/arraypad.py                  |   2
-rw-r--r--  numpy/lib/arraysetops.py               | 126
-rw-r--r--  numpy/lib/format.py                    |   7
-rw-r--r--  numpy/lib/function_base.py             | 874
-rw-r--r--  numpy/lib/histograms.py                | 878
-rw-r--r--  numpy/lib/nanfunctions.py              |  73
-rw-r--r--  numpy/lib/npyio.py                     |  69
-rw-r--r--  numpy/lib/polynomial.py                |   2
-rw-r--r--  numpy/lib/tests/test_arraysetops.py    |  23
-rw-r--r--  numpy/lib/tests/test_format.py         |  12
-rw-r--r--  numpy/lib/tests/test_function_base.py  | 528
-rw-r--r--  numpy/lib/tests/test_histograms.py     | 635
-rw-r--r--  numpy/lib/tests/test_index_tricks.py   |   2
-rw-r--r--  numpy/lib/tests/test_io.py             |  24
-rw-r--r--  numpy/lib/tests/test_polynomial.py     |   8
-rw-r--r--  numpy/lib/tests/test_type_check.py     |  18
-rw-r--r--  numpy/lib/type_check.py                |  23
-rw-r--r--  numpy/lib/utils.py                     |   2
19 files changed, 1805 insertions, 1503 deletions
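The bulk of this commit moves `histogram` and `histogramdd` out of `numpy/lib/function_base.py` into the new `numpy/lib/histograms.py`, re-exporting them so the public API is unchanged. A quick sanity check of that invariant, against a NumPy build containing this commit (a sketch, not part of the patch; the module was later made private, so this does not hold on current NumPy):

    import numpy as np
    from numpy.lib import histograms

    # The implementations now live in numpy.lib.histograms, but the
    # top-level names should be the very same function objects.
    assert np.histogram is histograms.histogram
    assert np.histogramdd is histograms.histogramdd
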
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py index d85a179dd..cc05232a2 100644 --- a/numpy/lib/__init__.py +++ b/numpy/lib/__init__.py @@ -14,6 +14,7 @@ from .shape_base import * from .stride_tricks import * from .twodim_base import * from .ufunclike import * +from .histograms import * from . import scimath as emath from .polynomial import * @@ -43,6 +44,7 @@ __all__ += arraysetops.__all__ __all__ += npyio.__all__ __all__ += financial.__all__ __all__ += nanfunctions.__all__ +__all__ += histograms.__all__ from numpy.testing import _numpy_tester test = _numpy_tester().test diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py index 153b4af65..cdc354a02 100644 --- a/numpy/lib/arraypad.py +++ b/numpy/lib/arraypad.py @@ -1186,7 +1186,7 @@ def pad(array, pad_width, mode, **kwargs): reflect_type : {'even', 'odd'}, optional Used in 'reflect', and 'symmetric'. The 'even' style is the default with an unaltered reflection around the edge value. For - the 'odd' style, the extented part of the array is created by + the 'odd' style, the extended part of the array is created by subtracting the reflected values from two times the edge value. Returns diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index a9426cdf3..e8eda297f 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -110,16 +110,25 @@ def ediff1d(ary, to_end=None, to_begin=None): return result +def _unpack_tuple(x): + """ Unpacks one-element tuples for use as return values """ + if len(x) == 1: + return x[0] + else: + return x + + def unique(ar, return_index=False, return_inverse=False, return_counts=False, axis=None): """ Find the unique elements of an array. Returns the sorted unique elements of an array. There are three optional - outputs in addition to the unique elements: the indices of the input array - that give the unique values, the indices of the unique array that - reconstruct the input array, and the number of times each unique value - comes up in the input array. + outputs in addition to the unique elements: + + * the indices of the input array that give the unique values + * the indices of the unique array that reconstruct the input array + * the number of times each unique value comes up in the input array Parameters ---------- @@ -135,16 +144,18 @@ def unique(ar, return_index=False, return_inverse=False, return_counts : bool, optional If True, also return the number of times each unique item appears in `ar`. + .. versionadded:: 1.9.0 - axis : int or None, optional - The axis to operate on. If None, `ar` will be flattened beforehand. - Otherwise, duplicate items will be removed along the provided axis, - with all the other axes belonging to the each of the unique elements. - Object arrays or structured arrays that contain objects are not - supported if the `axis` kwarg is used. - .. versionadded:: 1.13.0 + axis : int or None, optional + The axis to operate on. If None, `ar` will be flattened. If an integer, + the subarrays indexed by the given axis will be flattened and treated + as the elements of a 1-D array with the dimension of the given axis, + see the notes for more details. Object arrays or structured arrays + that contain objects are not supported if the `axis` kwarg is used. The + default is None. + .. versionadded:: 1.13.0 Returns ------- @@ -159,6 +170,7 @@ def unique(ar, return_index=False, return_inverse=False, unique_counts : ndarray, optional The number of times each of the unique values comes up in the original array. Only provided if `return_counts` is True. 
+ .. versionadded:: 1.9.0 See Also @@ -166,6 +178,17 @@ def unique(ar, return_index=False, return_inverse=False, numpy.lib.arraysetops : Module with a number of other functions for performing set operations on arrays. + Notes + ----- + When an axis is specified the subarrays indexed by the axis are sorted. + This is done by making the specified axis the first dimension of the array + and then flattening the subarrays in C order. The flattened subarrays are + then viewed as a structured type with each element given a label, with the + effect that we end up with a 1-D array of structured types that can be + treated in the same way as any other 1-D array. The result is that the + flattened subarrays are sorted in lexicographic order starting with the + first element. + Examples -------- >>> np.unique([1, 1, 2, 2, 3, 3]) @@ -207,24 +230,21 @@ def unique(ar, return_index=False, return_inverse=False, """ ar = np.asanyarray(ar) if axis is None: - return _unique1d(ar, return_index, return_inverse, return_counts) - if not (-ar.ndim <= axis < ar.ndim): - raise ValueError('Invalid axis kwarg specified for unique') + ret = _unique1d(ar, return_index, return_inverse, return_counts) + return _unpack_tuple(ret) + + # axis was specified and not None + try: + ar = np.swapaxes(ar, axis, 0) + except np.AxisError: + # this removes the "axis1" or "axis2" prefix from the error message + raise np.AxisError(axis, ar.ndim) - ar = np.swapaxes(ar, axis, 0) - orig_shape, orig_dtype = ar.shape, ar.dtype # Must reshape to a contiguous 2D array for this to work... + orig_shape, orig_dtype = ar.shape, ar.dtype ar = ar.reshape(orig_shape[0], -1) ar = np.ascontiguousarray(ar) - - if ar.dtype.char in (np.typecodes['AllInteger'] + - np.typecodes['Datetime'] + 'S'): - # Optimization: Creating a view of your data with a np.void data type of - # size the number of bytes in a full row. Handles any type where items - # have a unique binary representation, i.e. 0 is only 0, not +0 and -0. 
- dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1])) - else: - dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])] + dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])] try: consolidated = ar.view(dtype) @@ -241,11 +261,9 @@ def unique(ar, return_index=False, return_inverse=False, output = _unique1d(consolidated, return_index, return_inverse, return_counts) - if not (return_index or return_inverse or return_counts): - return reshape_uniq(output) - else: - uniq = reshape_uniq(output[0]) - return (uniq,) + output[1:] + output = (reshape_uniq(output[0]),) + output[1:] + return _unpack_tuple(output) + def _unique1d(ar, return_index=False, return_inverse=False, return_counts=False): @@ -255,20 +273,6 @@ def _unique1d(ar, return_index=False, return_inverse=False, ar = np.asanyarray(ar).flatten() optional_indices = return_index or return_inverse - optional_returns = optional_indices or return_counts - - if ar.size == 0: - if not optional_returns: - ret = ar - else: - ret = (ar,) - if return_index: - ret += (np.empty(0, np.intp),) - if return_inverse: - ret += (np.empty(0, np.intp),) - if return_counts: - ret += (np.empty(0, np.intp),) - return ret if optional_indices: perm = ar.argsort(kind='mergesort' if return_index else 'quicksort') @@ -276,24 +280,24 @@ def _unique1d(ar, return_index=False, return_inverse=False, else: ar.sort() aux = ar - flag = np.concatenate(([True], aux[1:] != aux[:-1])) - - if not optional_returns: - ret = aux[flag] - else: - ret = (aux[flag],) - if return_index: - ret += (perm[flag],) - if return_inverse: - iflag = np.cumsum(flag) - 1 - inv_idx = np.empty(ar.shape, dtype=np.intp) - inv_idx[perm] = iflag - ret += (inv_idx,) - if return_counts: - idx = np.concatenate(np.nonzero(flag) + ([ar.size],)) - ret += (np.diff(idx),) + mask = np.empty(aux.shape, dtype=np.bool_) + mask[:1] = True + mask[1:] = aux[1:] != aux[:-1] + + ret = (aux[mask],) + if return_index: + ret += (perm[mask],) + if return_inverse: + imask = np.cumsum(mask) - 1 + inv_idx = np.empty(mask.shape, dtype=np.intp) + inv_idx[perm] = imask + ret += (inv_idx,) + if return_counts: + idx = np.concatenate(np.nonzero(mask) + ([mask.size],)) + ret += (np.diff(idx),) return ret + def intersect1d(ar1, ar2, assume_unique=False): """ Find the intersection of two arrays. 
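The rewritten `_unique1d` above drops the empty-array special case and the `np.concatenate(([True], ...))` idiom in favour of a preallocated boolean mask, building the result tuple unconditionally and letting `_unpack_tuple` strip it when no optional outputs were requested. A minimal sketch of the mask logic (the standalone helper name here is mine, not from the patch):

    import numpy as np

    def unique_with_counts(ar):
        # Same approach as _unique1d above: sort, then mark the start
        # of each run of equal values with a boolean mask.
        ar = np.asanyarray(ar).flatten()
        ar.sort()
        mask = np.empty(ar.shape, dtype=np.bool_)
        mask[:1] = True                  # a no-op on empty input, which is
        mask[1:] = ar[1:] != ar[:-1]     # why the empty special case went away
        # Counts are the gaps between consecutive True positions.
        idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
        return ar[mask], np.diff(idx)

    values, counts = unique_with_counts([1, 1, 2, 2, 2, 3])
    print(values, counts)                # [1 2 3] [2 3 1]
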
@@ -614,7 +618,7 @@ def union1d(ar1, ar2): >>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2])) array([1, 2, 3, 4, 6]) """ - return unique(np.concatenate((ar1, ar2))) + return unique(np.concatenate((ar1, ar2), axis=None)) def setdiff1d(ar1, ar2, assume_unique=False): """ diff --git a/numpy/lib/format.py b/numpy/lib/format.py index 84af2afc8..363bb2101 100644 --- a/numpy/lib/format.py +++ b/numpy/lib/format.py @@ -454,7 +454,9 @@ def _filter_header(s): tokens = [] last_token_was_number = False - for token in tokenize.generate_tokens(StringIO(asstr(s)).read): + # adding newline as python 2.7.5 workaround + string = asstr(s) + "\n" + for token in tokenize.generate_tokens(StringIO(string).readline): token_type = token[0] token_string = token[1] if (last_token_was_number and @@ -464,7 +466,8 @@ def _filter_header(s): else: tokens.append(token) last_token_was_number = (token_type == tokenize.NUMBER) - return tokenize.untokenize(tokens) + # removing newline (see above) as python 2.7.5 workaround + return tokenize.untokenize(tokens)[:-1] def _read_array_header(fp, version): diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index fb37801b9..504280cef 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -39,12 +39,14 @@ if sys.version_info[0] < 3: else: import builtins +# needed in this module for compatibility +from numpy.lib.histograms import histogram, histogramdd __all__ = [ 'select', 'piecewise', 'trim_zeros', 'copy', 'iterable', 'percentile', 'diff', 'gradient', 'angle', 'unwrap', 'sort_complex', 'disp', 'flip', 'rot90', 'extract', 'place', 'vectorize', 'asarray_chkfinite', 'average', - 'histogram', 'histogramdd', 'bincount', 'digitize', 'cov', 'corrcoef', + 'bincount', 'digitize', 'cov', 'corrcoef', 'msort', 'median', 'sinc', 'hamming', 'hanning', 'bartlett', 'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring', 'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc' @@ -241,806 +243,6 @@ def iterable(y): return True -def _hist_bin_sqrt(x): - """ - Square root histogram bin estimator. - - Bin width is inversely proportional to the data size. Used by many - programs for its simplicity. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / np.sqrt(x.size) - - -def _hist_bin_sturges(x): - """ - Sturges histogram bin estimator. - - A very simplistic estimator based on the assumption of normality of - the data. This estimator has poor performance for non-normal data, - which becomes especially obvious for large data sets. The estimate - depends only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / (np.log2(x.size) + 1.0) - - -def _hist_bin_rice(x): - """ - Rice histogram bin estimator. - - Another simple estimator with no normality assumption. It has better - performance for large data than Sturges, but tends to overestimate - the number of bins. The number of bins is proportional to the cube - root of data size (asymptotically optimal). The estimate depends - only on size of the data. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. 
- - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return x.ptp() / (2.0 * x.size ** (1.0 / 3)) - - -def _hist_bin_scott(x): - """ - Scott histogram bin estimator. - - The binwidth is proportional to the standard deviation of the data - and inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) - - -def _hist_bin_doane(x): - """ - Doane's histogram bin estimator. - - Improved version of Sturges' formula which works better for - non-normal data. See - stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - if x.size > 2: - sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) - sigma = np.std(x) - if sigma > 0.0: - # These three operations add up to - # g1 = np.mean(((x - np.mean(x)) / sigma)**3) - # but use only one temp array instead of three - temp = x - np.mean(x) - np.true_divide(temp, sigma, temp) - np.power(temp, 3, temp) - g1 = np.mean(temp) - return x.ptp() / (1.0 + np.log2(x.size) + - np.log2(1.0 + np.absolute(g1) / sg1)) - return 0.0 - - -def _hist_bin_fd(x): - """ - The Freedman-Diaconis histogram bin estimator. - - The Freedman-Diaconis rule uses interquartile range (IQR) to - estimate binwidth. It is considered a variation of the Scott rule - with more robustness as the IQR is less affected by outliers than - the standard deviation. However, the IQR depends on fewer points - than the standard deviation, so it is less accurate, especially for - long tailed distributions. - - If the IQR is 0, this function returns 1 for the number of bins. - Binwidth is inversely proportional to the cube root of data size - (asymptotically optimal). - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - """ - iqr = np.subtract(*np.percentile(x, [75, 25])) - return 2.0 * iqr * x.size ** (-1.0 / 3.0) - - -def _hist_bin_auto(x): - """ - Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators. - - The FD estimator is usually the most robust method, but its width - estimate tends to be too large for small `x`. The Sturges estimator - is quite good for small (<1000) datasets and is the default in the R - language. This method gives good off the shelf behaviour. - - Parameters - ---------- - x : array_like - Input data that is to be histogrammed, trimmed to range. May not - be empty. - - Returns - ------- - h : An estimate of the optimal bin width for the given data. - - See Also - -------- - _hist_bin_fd, _hist_bin_sturges - """ - # There is no need to check for zero here. If ptp is, so is IQR and - # vice versa. Either both are zero or neither one is. 
- return min(_hist_bin_fd(x), _hist_bin_sturges(x)) - - -# Private dict initialized at module load time -_hist_bin_selectors = {'auto': _hist_bin_auto, - 'doane': _hist_bin_doane, - 'fd': _hist_bin_fd, - 'rice': _hist_bin_rice, - 'scott': _hist_bin_scott, - 'sqrt': _hist_bin_sqrt, - 'sturges': _hist_bin_sturges} - - -def histogram(a, bins=10, range=None, normed=False, weights=None, - density=None): - r""" - Compute the histogram of a set of data. - - Parameters - ---------- - a : array_like - Input data. The histogram is computed over the flattened array. - bins : int or sequence of scalars or str, optional - If `bins` is an int, it defines the number of equal-width - bins in the given range (10, by default). If `bins` is a - sequence, it defines the bin edges, including the rightmost - edge, allowing for non-uniform bin widths. - - .. versionadded:: 1.11.0 - - If `bins` is a string from the list below, `histogram` will use - the method chosen to calculate the optimal bin width and - consequently the number of bins (see `Notes` for more detail on - the estimators) from the data that falls within the requested - range. While the bin width will be optimal for the actual data - in the range, the number of bins will be computed to fill the - entire range, including the empty portions. For visualisation, - using the 'auto' option is suggested. Weighted data is not - supported for automated bin size selection. - - 'auto' - Maximum of the 'sturges' and 'fd' estimators. Provides good - all around performance. - - 'fd' (Freedman Diaconis Estimator) - Robust (resilient to outliers) estimator that takes into - account data variability and data size. - - 'doane' - An improved version of Sturges' estimator that works better - with non-normal datasets. - - 'scott' - Less robust estimator that that takes into account data - variability and data size. - - 'rice' - Estimator does not take variability into account, only data - size. Commonly overestimates number of bins required. - - 'sturges' - R's default method, only accounts for data size. Only - optimal for gaussian data and underestimates number of bins - for large non-gaussian datasets. - - 'sqrt' - Square root (of data size) estimator, used by Excel and - other programs for its speed and simplicity. - - range : (float, float), optional - The lower and upper range of the bins. If not provided, range - is simply ``(a.min(), a.max())``. Values outside the range are - ignored. The first element of the range must be less than or - equal to the second. `range` affects the automatic bin - computation as well. While bin width is computed to be optimal - based on the actual data within `range`, the bin count will fill - the entire range including portions containing no data. - normed : bool, optional - This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy - behavior. It will be removed in NumPy 2.0.0. Use the ``density`` - keyword instead. If ``False``, the result will contain the - number of samples in each bin. If ``True``, the result is the - value of the probability *density* function at the bin, - normalized such that the *integral* over the range is 1. Note - that this latter behavior is known to be buggy with unequal bin - widths; use ``density`` instead. - weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). 
If `density` is True, the weights are - normalized, so that the integral of the density over the range - remains 1. - density : bool, optional - If ``False``, the result will contain the number of samples in - each bin. If ``True``, the result is the value of the - probability *density* function at the bin, normalized such that - the *integral* over the range is 1. Note that the sum of the - histogram values will not be equal to 1 unless bins of unity - width are chosen; it is not a probability *mass* function. - - Overrides the ``normed`` keyword if given. - - Returns - ------- - hist : array - The values of the histogram. See `density` and `weights` for a - description of the possible semantics. - bin_edges : array of dtype float - Return the bin edges ``(length(hist)+1)``. - - - See Also - -------- - histogramdd, bincount, searchsorted, digitize - - Notes - ----- - All but the last (righthand-most) bin is half-open. In other words, - if `bins` is:: - - [1, 2, 3, 4] - - then the first bin is ``[1, 2)`` (including 1, but excluding 2) and - the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which - *includes* 4. - - .. versionadded:: 1.11.0 - - The methods to estimate the optimal number of bins are well founded - in literature, and are inspired by the choices R provides for - histogram visualisation. Note that having the number of bins - proportional to :math:`n^{1/3}` is asymptotically optimal, which is - why it appears in most estimators. These are simply plug-in methods - that give good starting points for number of bins. In the equations - below, :math:`h` is the binwidth and :math:`n_h` is the number of - bins. All estimators that compute bin counts are recast to bin width - using the `ptp` of the data. The final bin count is obtained from - ``np.round(np.ceil(range / h))`. - - 'Auto' (maximum of the 'Sturges' and 'FD' estimators) - A compromise to get a good value. For small datasets the Sturges - value will usually be chosen, while larger datasets will usually - default to FD. Avoids the overly conservative behaviour of FD - and Sturges for small and large datasets respectively. - Switchover point is usually :math:`a.size \approx 1000`. - - 'FD' (Freedman Diaconis Estimator) - .. math:: h = 2 \frac{IQR}{n^{1/3}} - - The binwidth is proportional to the interquartile range (IQR) - and inversely proportional to cube root of a.size. Can be too - conservative for small datasets, but is quite good for large - datasets. The IQR is very robust to outliers. - - 'Scott' - .. math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} - - The binwidth is proportional to the standard deviation of the - data and inversely proportional to cube root of ``x.size``. Can - be too conservative for small datasets, but is quite good for - large datasets. The standard deviation is not very robust to - outliers. Values are very similar to the Freedman-Diaconis - estimator in the absence of outliers. - - 'Rice' - .. math:: n_h = 2n^{1/3} - - The number of bins is only proportional to cube root of - ``a.size``. It tends to overestimate the number of bins and it - does not take into account data variability. - - 'Sturges' - .. math:: n_h = \log _{2}n+1 - - The number of bins is the base 2 log of ``a.size``. This - estimator assumes normality of data and is too conservative for - larger, non-normal datasets. This is the default method in R's - ``hist`` method. - - 'Doane' - .. 
math:: n_h = 1 + \log_{2}(n) + - \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) - - g_1 = mean[(\frac{x - \mu}{\sigma})^3] - - \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} - - An improved version of Sturges' formula that produces better - estimates for non-normal datasets. This estimator attempts to - account for the skew of the data. - - 'Sqrt' - .. math:: n_h = \sqrt n - The simplest and fastest estimator. Only takes into account the - data size. - - Examples - -------- - >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) - (array([0, 2, 1]), array([0, 1, 2, 3])) - >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) - (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) - >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) - (array([1, 4, 1]), array([0, 1, 2, 3])) - - >>> a = np.arange(5) - >>> hist, bin_edges = np.histogram(a, density=True) - >>> hist - array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) - >>> hist.sum() - 2.4999999999999996 - >>> np.sum(hist * np.diff(bin_edges)) - 1.0 - - .. versionadded:: 1.11.0 - - Automated Bin Selection Methods example, using 2 peak random data - with 2000 points: - - >>> import matplotlib.pyplot as plt - >>> rng = np.random.RandomState(10) # deterministic random data - >>> a = np.hstack((rng.normal(size=1000), - ... rng.normal(loc=5, scale=2, size=1000))) - >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram - >>> plt.title("Histogram with 'auto' bins") - >>> plt.show() - - """ - a = asarray(a) - if weights is not None: - weights = asarray(weights) - if weights.shape != a.shape: - raise ValueError( - 'weights should have the same shape as a.') - weights = weights.ravel() - a = a.ravel() - - # Do not modify the original value of range so we can check for `None` - if range is None: - if a.size == 0: - # handle empty arrays. Can't determine range, so use 0-1. - first_edge, last_edge = 0.0, 1.0 - else: - first_edge, last_edge = a.min() + 0.0, a.max() + 0.0 - else: - first_edge, last_edge = [mi + 0.0 for mi in range] - if first_edge > last_edge: - raise ValueError( - 'max must be larger than min in range parameter.') - if not np.all(np.isfinite([first_edge, last_edge])): - raise ValueError( - 'range parameter must be finite.') - if first_edge == last_edge: - first_edge -= 0.5 - last_edge += 0.5 - - # density overrides the normed keyword - if density is not None: - normed = False - - # parse the overloaded bins argument - n_equal_bins = None - bin_edges = None - - if isinstance(bins, basestring): - bin_name = bins - # if `bins` is a string for an automatic method, - # this will replace it with the number of bins calculated - if bin_name not in _hist_bin_selectors: - raise ValueError( - "{!r} is not a valid estimator for `bins`".format(bin_name)) - if weights is not None: - raise TypeError("Automated estimation of the number of " - "bins is not supported for weighted data") - # Make a reference to `a` - b = a - # Update the reference if the range needs truncation - if range is not None: - keep = (a >= first_edge) - keep &= (a <= last_edge) - if not np.logical_and.reduce(keep): - b = a[keep] - - if b.size == 0: - n_equal_bins = 1 - else: - # Do not call selectors on empty arrays - width = _hist_bin_selectors[bin_name](b) - if width: - n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) - else: - # Width can be zero for some estimators, e.g. FD when - # the IQR of the data is zero. 
- n_equal_bins = 1 - - elif np.ndim(bins) == 0: - try: - n_equal_bins = operator.index(bins) - except TypeError: - raise TypeError( - '`bins` must be an integer, a string, or an array') - if n_equal_bins < 1: - raise ValueError('`bins` must be positive, when an integer') - - elif np.ndim(bins) == 1: - bin_edges = np.asarray(bins) - if np.any(bin_edges[:-1] > bin_edges[1:]): - raise ValueError( - '`bins` must increase monotonically, when an array') - - else: - raise ValueError('`bins` must be 1d, when an array') - - del bins - - # compute the bins if only the count was specified - if n_equal_bins is not None: - bin_edges = linspace( - first_edge, last_edge, n_equal_bins + 1, endpoint=True) - - # Histogram is an integer or a float array depending on the weights. - if weights is None: - ntype = np.dtype(np.intp) - else: - ntype = weights.dtype - - # We set a block size, as this allows us to iterate over chunks when - # computing histograms, to minimize memory usage. - BLOCK = 65536 - - # The fast path uses bincount, but that only works for certain types - # of weight - simple_weights = ( - weights is None or - np.can_cast(weights.dtype, np.double) or - np.can_cast(weights.dtype, complex) - ) - - if n_equal_bins is not None and simple_weights: - # Fast algorithm for equal bins - # We now convert values of a to bin indices, under the assumption of - # equal bin widths (which is valid here). - - # Initialize empty histogram - n = np.zeros(n_equal_bins, ntype) - - # Pre-compute histogram scaling factor - norm = n_equal_bins / (last_edge - first_edge) - - # We iterate over blocks here for two reasons: the first is that for - # large arrays, it is actually faster (for example for a 10^8 array it - # is 2x as fast) and it results in a memory footprint 3x lower in the - # limit of large arrays. - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - if weights is None: - tmp_w = None - else: - tmp_w = weights[i:i + BLOCK] - - # Only include values in the right range - keep = (tmp_a >= first_edge) - keep &= (tmp_a <= last_edge) - if not np.logical_and.reduce(keep): - tmp_a = tmp_a[keep] - if tmp_w is not None: - tmp_w = tmp_w[keep] - tmp_a_data = tmp_a.astype(float) - tmp_a = tmp_a_data - first_edge - tmp_a *= norm - - # Compute the bin indices, and for values that lie exactly on - # last_edge we need to subtract one - indices = tmp_a.astype(np.intp) - indices[indices == n_equal_bins] -= 1 - - # The index computation is not guaranteed to give exactly - # consistent results within ~1 ULP of the bin edges. - decrement = tmp_a_data < bin_edges[indices] - indices[decrement] -= 1 - # The last bin includes the right edge. The other bins do not. 
- increment = ((tmp_a_data >= bin_edges[indices + 1]) - & (indices != n_equal_bins - 1)) - indices[increment] += 1 - - # We now compute the histogram using bincount - if ntype.kind == 'c': - n.real += np.bincount(indices, weights=tmp_w.real, - minlength=n_equal_bins) - n.imag += np.bincount(indices, weights=tmp_w.imag, - minlength=n_equal_bins) - else: - n += np.bincount(indices, weights=tmp_w, - minlength=n_equal_bins).astype(ntype) - else: - # Compute via cumulative histogram - cum_n = np.zeros(bin_edges.shape, ntype) - if weights is None: - for i in arange(0, len(a), BLOCK): - sa = sort(a[i:i+BLOCK]) - cum_n += np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - else: - zero = array(0, dtype=ntype) - for i in arange(0, len(a), BLOCK): - tmp_a = a[i:i+BLOCK] - tmp_w = weights[i:i+BLOCK] - sorting_index = np.argsort(tmp_a) - sa = tmp_a[sorting_index] - sw = tmp_w[sorting_index] - cw = np.concatenate(([zero], sw.cumsum())) - bin_index = np.r_[sa.searchsorted(bin_edges[:-1], 'left'), - sa.searchsorted(bin_edges[-1], 'right')] - cum_n += cw[bin_index] - - n = np.diff(cum_n) - - if density: - db = array(np.diff(bin_edges), float) - return n/db/n.sum(), bin_edges - elif normed: - # deprecated, buggy behavior. Remove for NumPy 2.0.0 - db = array(np.diff(bin_edges), float) - return n/(n*db).sum(), bin_edges - else: - return n, bin_edges - - -def histogramdd(sample, bins=10, range=None, normed=False, weights=None): - """ - Compute the multidimensional histogram of some data. - - Parameters - ---------- - sample : array_like - The data to be histogrammed. It must be an (N,D) array or data - that can be converted to such. The rows of the resulting array - are the coordinates of points in a D dimensional polytope. - bins : sequence or int, optional - The bin specification: - - * A sequence of arrays describing the bin edges along each dimension. - * The number of bins for each dimension (nx, ny, ... =bins) - * The number of bins for all dimensions (nx=ny=...=bins). - - range : sequence, optional - A sequence of lower and upper bin edges to be used if the edges are - not given explicitly in `bins`. Defaults to the minimum and maximum - values along each dimension. - normed : bool, optional - If False, returns the number of samples in each bin. If True, - returns the bin density ``bin_count / sample_count / bin_volume``. - weights : (N,) array_like, optional - An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`. - Weights are normalized to 1 if normed is True. If normed is False, - the values of the returned histogram are equal to the sum of the - weights belonging to the samples falling into each bin. - - Returns - ------- - H : ndarray - The multidimensional histogram of sample x. See normed and weights - for the different possible semantics. - edges : list - A list of D arrays describing the bin edges for each dimension. - - See Also - -------- - histogram: 1-D histogram - histogram2d: 2-D histogram - - Examples - -------- - >>> r = np.random.randn(100,3) - >>> H, edges = np.histogramdd(r, bins = (5, 8, 4)) - >>> H.shape, edges[0].size, edges[1].size, edges[2].size - ((5, 8, 4), 6, 9, 5) - - """ - - try: - # Sample is an ND-array. - N, D = sample.shape - except (AttributeError, ValueError): - # Sample is a sequence of 1D arrays. 
- sample = atleast_2d(sample).T - N, D = sample.shape - - nbin = empty(D, int) - edges = D*[None] - dedges = D*[None] - if weights is not None: - weights = asarray(weights) - - try: - M = len(bins) - if M != D: - raise ValueError( - 'The dimension of bins must be equal to the dimension of the ' - ' sample x.') - except TypeError: - # bins is an integer - bins = D*[bins] - - # Select range for each dimension - # Used only if number of bins is given. - if range is None: - # Handle empty input. Range can't be determined in that case, use 0-1. - if N == 0: - smin = zeros(D) - smax = ones(D) - else: - smin = atleast_1d(array(sample.min(0), float)) - smax = atleast_1d(array(sample.max(0), float)) - else: - if not np.all(np.isfinite(range)): - raise ValueError( - 'range parameter must be finite.') - smin = zeros(D) - smax = zeros(D) - for i in arange(D): - smin[i], smax[i] = range[i] - - # Make sure the bins have a finite width. - for i in arange(len(smin)): - if smin[i] == smax[i]: - smin[i] = smin[i] - .5 - smax[i] = smax[i] + .5 - - # avoid rounding issues for comparisons when dealing with inexact types - if np.issubdtype(sample.dtype, np.inexact): - edge_dt = sample.dtype - else: - edge_dt = float - # Create edge arrays - for i in arange(D): - if isscalar(bins[i]): - if bins[i] < 1: - raise ValueError( - "Element at index %s in `bins` should be a positive " - "integer." % i) - nbin[i] = bins[i] + 2 # +2 for outlier bins - edges[i] = linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt) - else: - edges[i] = asarray(bins[i], edge_dt) - nbin[i] = len(edges[i]) + 1 # +1 for outlier bins - dedges[i] = diff(edges[i]) - if np.any(np.asarray(dedges[i]) <= 0): - raise ValueError( - "Found bin edge of size <= 0. Did you specify `bins` with" - "non-monotonic sequence?") - - nbin = asarray(nbin) - - # Handle empty input. - if N == 0: - return np.zeros(nbin-2), edges - - # Compute the bin number each sample falls into. - Ncount = {} - for i in arange(D): - Ncount[i] = digitize(sample[:, i], edges[i]) - - # Using digitize, values that fall on an edge are put in the right bin. - # For the rightmost bin, we want values equal to the right edge to be - # counted in the last bin, and not as an outlier. - for i in arange(D): - # Rounding precision - mindiff = dedges[i].min() - if not np.isinf(mindiff): - decimal = int(-log10(mindiff)) + 6 - # Find which points are on the rightmost edge. - not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) - on_edge = (around(sample[:, i], decimal) == - around(edges[i][-1], decimal)) - # Shift these points one bin to the left. - Ncount[i][nonzero(on_edge & not_smaller_than_edge)[0]] -= 1 - - # Flattened histogram matrix (1D) - # Reshape is used so that overlarge arrays - # will raise an error. - hist = zeros(nbin, float).reshape(-1) - - # Compute the sample indices in the flattened histogram matrix. - ni = nbin.argsort() - xy = zeros(N, int) - for i in arange(0, D-1): - xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() - xy += Ncount[ni[-1]] - - # Compute the number of repetitions in xy and assign it to the - # flattened histmat. - if len(xy) == 0: - return zeros(nbin-2, int), edges - - flatcount = bincount(xy, weights) - a = arange(len(flatcount)) - hist[a] = flatcount - - # Shape into a proper matrix - hist = hist.reshape(sort(nbin)) - for i in arange(nbin.size): - j = ni.argsort()[i] - hist = hist.swapaxes(i, j) - ni[i], ni[j] = ni[j], ni[i] - - # Remove outliers (indices 0 and -1 for each dimension). 
- core = D*[slice(1, -1)] - hist = hist[core] - - # Normalize if normed is True - if normed: - s = hist.sum() - for i in arange(D): - shape = ones(D, int) - shape[i] = nbin[i] - 2 - hist = hist / dedges[i].reshape(shape) - hist /= s - - if (hist.shape != nbin - 2).any(): - raise RuntimeError( - "Internal Shape Error") - return hist, edges - - def average(a, axis=None, weights=None, returned=False): """ Compute the weighted average along the specified axis. @@ -2034,7 +1236,8 @@ def interp(x, xp, fp, left=None, right=None, period=None): >>> np.interp(x, xp, fp, period=360) array([7.5, 5., 8.75, 6.25, 3., 3.25, 3.5, 3.75]) - Complex interpolation + Complex interpolation: + >>> x = [1.5, 4.0] >>> xp = [2,3,5] >>> fp = [1.0j, 0, 2+3j] @@ -2052,23 +1255,13 @@ def interp(x, xp, fp, left=None, right=None, period=None): interp_func = compiled_interp input_dtype = np.float64 - if period is None: - if isinstance(x, (float, int, number)): - return interp_func([x], xp, fp, left, right).item() - elif isinstance(x, np.ndarray) and x.ndim == 0: - return interp_func([x], xp, fp, left, right).item() - else: - return interp_func(x, xp, fp, left, right) - else: + if period is not None: if period == 0: raise ValueError("period must be a non-zero value") period = abs(period) left = None right = None - return_array = True - if isinstance(x, (float, int, number)): - return_array = False - x = [x] + x = np.asarray(x, dtype=np.float64) xp = np.asarray(xp, dtype=np.float64) fp = np.asarray(fp, dtype=input_dtype) @@ -2086,10 +1279,7 @@ def interp(x, xp, fp, left=None, right=None, period=None): xp = np.concatenate((xp[-1:]-period, xp, xp[0:1]+period)) fp = np.concatenate((fp[-1:], fp, fp[0:1])) - if return_array: - return interp_func(x, xp, fp, left, right) - else: - return interp_func(x, xp, fp, left, right).item() + return interp_func(x, xp, fp, left, right) def angle(z, deg=0): @@ -2942,7 +2132,7 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, .. versionadded:: 1.5 fweights : array_like, int, optional - 1-D array of integer freguency weights; the number of times each + 1-D array of integer frequency weights; the number of times each observation vector should be repeated. .. versionadded:: 1.10 @@ -3993,7 +3183,7 @@ def _ureduce(a, func, **kwargs): Input array or object that can be converted to an array. func : callable Reduction function capable of receiving a single axis argument. - It is is called with `a` as first argument followed by `kwargs`. + It is called with `a` as first argument followed by `kwargs`. kwargs : keyword arguments additional keyword arguments to pass to `func`. 
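The `interp` refactor above removes the scalar/array bookkeeping: `x` is now always converted with `np.asarray`, and the `period` branch maps `x` and `xp` into a single period and pads `xp` with one wrapped point on each side. A usage check of that wrap-around behaviour, with the inputs from the full docstring example (only the output line is visible in this hunk):

    import numpy as np

    # Angular data spanning a 360-degree period; period= makes interp
    # wrap around instead of extrapolating flat at the endpoints.
    x = [-180, -170, -185, 185, -10, -5, 0, 365]
    xp = [190, -190, 350, -350]
    fp = [5, 10, 3, 4]
    print(np.interp(x, xp, fp, period=360))
    # -> [7.5, 5., 8.75, 6.25, 3., 3.25, 3.5, 3.75]
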
@@ -4284,8 +3474,17 @@ def percentile(a, q, axis=None, out=None, >>> assert not np.all(a == b) """ - q = array(q, dtype=np.float64, copy=True) - r, k = _ureduce(a, func=_percentile, q=q, axis=axis, out=out, + q = np.true_divide(q, 100.0) # handles the asarray for us too + if not _quantile_is_valid(q): + raise ValueError("Percentiles must be in the range [0, 100]") + return _quantile_unchecked( + a, q, axis, out, overwrite_input, interpolation, keepdims) + + +def _quantile_unchecked(a, q, axis=None, out=None, overwrite_input=False, + interpolation='linear', keepdims=False): + """Assumes that q is in [0, 1], and is an ndarray""" + r, k = _ureduce(a, func=_quantile_ureduce_func, q=q, axis=axis, out=out, overwrite_input=overwrite_input, interpolation=interpolation) if keepdims: @@ -4294,8 +3493,21 @@ def percentile(a, q, axis=None, out=None, return r -def _percentile(a, q, axis=None, out=None, - overwrite_input=False, interpolation='linear', keepdims=False): +def _quantile_is_valid(q): + # avoid expensive reductions, relevant for arrays with < O(1000) elements + if q.ndim == 1 and q.size < 10: + for i in range(q.size): + if q[i] < 0.0 or q[i] > 1.0: + return False + else: + # faster than any() + if np.count_nonzero(q < 0.0) or np.count_nonzero(q > 1.0): + return False + return True + + +def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, + interpolation='linear', keepdims=False): a = asarray(a) if q.ndim == 0: # Do not allow 0-d arrays because following code fails for scalar @@ -4304,19 +3516,7 @@ def _percentile(a, q, axis=None, out=None, else: zerod = False - # avoid expensive reductions, relevant for arrays with < O(1000) elements - if q.size < 10: - for i in range(q.size): - if q[i] < 0. or q[i] > 100.: - raise ValueError("Percentiles must be in the range [0,100]") - q[i] /= 100. - else: - # faster than any() - if np.count_nonzero(q < 0.) or np.count_nonzero(q > 100.): - raise ValueError("Percentiles must be in the range [0,100]") - q /= 100. - - # prepare a for partioning + # prepare a for partitioning if overwrite_input: if axis is None: ap = a.ravel() diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py new file mode 100644 index 000000000..ccc6c0616 --- /dev/null +++ b/numpy/lib/histograms.py @@ -0,0 +1,878 @@ +""" +Histogram-related functions +""" +from __future__ import division, absolute_import, print_function + +import operator + +import numpy as np +from numpy.compat.py3k import basestring + +__all__ = ['histogram', 'histogramdd'] + + +def _hist_bin_sqrt(x): + """ + Square root histogram bin estimator. + + Bin width is inversely proportional to the data size. Used by many + programs for its simplicity. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / np.sqrt(x.size) + + +def _hist_bin_sturges(x): + """ + Sturges histogram bin estimator. + + A very simplistic estimator based on the assumption of normality of + the data. This estimator has poor performance for non-normal data, + which becomes especially obvious for large data sets. The estimate + depends only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. 
+ """ + return x.ptp() / (np.log2(x.size) + 1.0) + + +def _hist_bin_rice(x): + """ + Rice histogram bin estimator. + + Another simple estimator with no normality assumption. It has better + performance for large data than Sturges, but tends to overestimate + the number of bins. The number of bins is proportional to the cube + root of data size (asymptotically optimal). The estimate depends + only on size of the data. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return x.ptp() / (2.0 * x.size ** (1.0 / 3)) + + +def _hist_bin_scott(x): + """ + Scott histogram bin estimator. + + The binwidth is proportional to the standard deviation of the data + and inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x) + + +def _hist_bin_doane(x): + """ + Doane's histogram bin estimator. + + Improved version of Sturges' formula which works better for + non-normal data. See + stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + if x.size > 2: + sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3))) + sigma = np.std(x) + if sigma > 0.0: + # These three operations add up to + # g1 = np.mean(((x - np.mean(x)) / sigma)**3) + # but use only one temp array instead of three + temp = x - np.mean(x) + np.true_divide(temp, sigma, temp) + np.power(temp, 3, temp) + g1 = np.mean(temp) + return x.ptp() / (1.0 + np.log2(x.size) + + np.log2(1.0 + np.absolute(g1) / sg1)) + return 0.0 + + +def _hist_bin_fd(x): + """ + The Freedman-Diaconis histogram bin estimator. + + The Freedman-Diaconis rule uses interquartile range (IQR) to + estimate binwidth. It is considered a variation of the Scott rule + with more robustness as the IQR is less affected by outliers than + the standard deviation. However, the IQR depends on fewer points + than the standard deviation, so it is less accurate, especially for + long tailed distributions. + + If the IQR is 0, this function returns 1 for the number of bins. + Binwidth is inversely proportional to the cube root of data size + (asymptotically optimal). + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. + + Returns + ------- + h : An estimate of the optimal bin width for the given data. + """ + iqr = np.subtract(*np.percentile(x, [75, 25])) + return 2.0 * iqr * x.size ** (-1.0 / 3.0) + + +def _hist_bin_auto(x): + """ + Histogram bin estimator that uses the minimum width of the + Freedman-Diaconis and Sturges estimators. + + The FD estimator is usually the most robust method, but its width + estimate tends to be too large for small `x`. The Sturges estimator + is quite good for small (<1000) datasets and is the default in the R + language. This method gives good off the shelf behaviour. + + Parameters + ---------- + x : array_like + Input data that is to be histogrammed, trimmed to range. May not + be empty. 
+ + Returns + ------- + h : An estimate of the optimal bin width for the given data. + + See Also + -------- + _hist_bin_fd, _hist_bin_sturges + """ + # There is no need to check for zero here. If ptp is, so is IQR and + # vice versa. Either both are zero or neither one is. + return min(_hist_bin_fd(x), _hist_bin_sturges(x)) + + +# Private dict initialized at module load time +_hist_bin_selectors = {'auto': _hist_bin_auto, + 'doane': _hist_bin_doane, + 'fd': _hist_bin_fd, + 'rice': _hist_bin_rice, + 'scott': _hist_bin_scott, + 'sqrt': _hist_bin_sqrt, + 'sturges': _hist_bin_sturges} + + +def _ravel_and_check_weights(a, weights): + """ Check a and weights have matching shapes, and ravel both """ + a = np.asarray(a) + if weights is not None: + weights = np.asarray(weights) + if weights.shape != a.shape: + raise ValueError( + 'weights should have the same shape as a.') + weights = weights.ravel() + a = a.ravel() + return a, weights + + +def _get_outer_edges(a, range): + """ + Determine the outer bin edges to use, from either the data or the range + argument + """ + if range is not None: + first_edge, last_edge = range + if first_edge > last_edge: + raise ValueError( + 'max must be larger than min in range parameter.') + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError( + "supplied range of [{}, {}] is not finite".format(first_edge, last_edge)) + elif a.size == 0: + # handle empty arrays. Can't determine range, so use 0-1. + first_edge, last_edge = 0, 1 + else: + first_edge, last_edge = a.min(), a.max() + if not (np.isfinite(first_edge) and np.isfinite(last_edge)): + raise ValueError( + "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge)) + + # expand empty range to avoid divide by zero + if first_edge == last_edge: + first_edge = first_edge - 0.5 + last_edge = last_edge + 0.5 + + return first_edge, last_edge + + +def _get_bin_edges(a, bins, range, weights): + """ + Computes the bins used internally by `histogram`. + + Parameters + ========== + a : ndarray + Ravelled data array + bins, range + Forwarded arguments from `histogram`. + weights : ndarray, optional + Ravelled weights array, or None + + Returns + ======= + bin_edges : ndarray + Array of bin edges + uniform_bins : (Number, Number, int): + The upper bound, lowerbound, and number of bins, used in the optimized + implementation of `histogram` that works on uniform bins. + """ + # parse the overloaded bins argument + n_equal_bins = None + bin_edges = None + + if isinstance(bins, basestring): + bin_name = bins + # if `bins` is a string for an automatic method, + # this will replace it with the number of bins calculated + if bin_name not in _hist_bin_selectors: + raise ValueError( + "{!r} is not a valid estimator for `bins`".format(bin_name)) + if weights is not None: + raise TypeError("Automated estimation of the number of " + "bins is not supported for weighted data") + + first_edge, last_edge = _get_outer_edges(a, range) + + # truncate the range if needed + if range is not None: + keep = (a >= first_edge) + keep &= (a <= last_edge) + if not np.logical_and.reduce(keep): + a = a[keep] + + if a.size == 0: + n_equal_bins = 1 + else: + # Do not call selectors on empty arrays + width = _hist_bin_selectors[bin_name](a) + if width: + n_equal_bins = int(np.ceil((last_edge - first_edge) / width)) + else: + # Width can be zero for some estimators, e.g. FD when + # the IQR of the data is zero. 
+ n_equal_bins = 1 + + elif np.ndim(bins) == 0: + try: + n_equal_bins = operator.index(bins) + except TypeError: + raise TypeError( + '`bins` must be an integer, a string, or an array') + if n_equal_bins < 1: + raise ValueError('`bins` must be positive, when an integer') + + first_edge, last_edge = _get_outer_edges(a, range) + + elif np.ndim(bins) == 1: + bin_edges = np.asarray(bins) + if np.any(bin_edges[:-1] > bin_edges[1:]): + raise ValueError( + '`bins` must increase monotonically, when an array') + + else: + raise ValueError('`bins` must be 1d, when an array') + + if n_equal_bins is not None: + # gh-10322 means that type resolution rules are dependent on array + # shapes. To avoid this causing problems, we pick a type now and stick + # with it throughout. + bin_type = np.result_type(first_edge, last_edge, a) + if np.issubdtype(bin_type, np.integer): + bin_type = np.result_type(bin_type, float) + + # bin edges must be computed + bin_edges = np.linspace( + first_edge, last_edge, n_equal_bins + 1, + endpoint=True, dtype=bin_type) + return bin_edges, (first_edge, last_edge, n_equal_bins) + else: + return bin_edges, None + + +def _search_sorted_inclusive(a, v): + """ + Like `searchsorted`, but where the last item in `v` is placed on the right. + + In the context of a histogram, this makes the last bin edge inclusive + """ + return np.concatenate(( + a.searchsorted(v[:-1], 'left'), + a.searchsorted(v[-1:], 'right') + )) + + +def histogram(a, bins=10, range=None, normed=False, weights=None, + density=None): + r""" + Compute the histogram of a set of data. + + Parameters + ---------- + a : array_like + Input data. The histogram is computed over the flattened array. + bins : int or sequence of scalars or str, optional + If `bins` is an int, it defines the number of equal-width + bins in the given range (10, by default). If `bins` is a + sequence, it defines the bin edges, including the rightmost + edge, allowing for non-uniform bin widths. + + .. versionadded:: 1.11.0 + + If `bins` is a string from the list below, `histogram` will use + the method chosen to calculate the optimal bin width and + consequently the number of bins (see `Notes` for more detail on + the estimators) from the data that falls within the requested + range. While the bin width will be optimal for the actual data + in the range, the number of bins will be computed to fill the + entire range, including the empty portions. For visualisation, + using the 'auto' option is suggested. Weighted data is not + supported for automated bin size selection. + + 'auto' + Maximum of the 'sturges' and 'fd' estimators. Provides good + all around performance. + + 'fd' (Freedman Diaconis Estimator) + Robust (resilient to outliers) estimator that takes into + account data variability and data size. + + 'doane' + An improved version of Sturges' estimator that works better + with non-normal datasets. + + 'scott' + Less robust estimator that that takes into account data + variability and data size. + + 'rice' + Estimator does not take variability into account, only data + size. Commonly overestimates number of bins required. + + 'sturges' + R's default method, only accounts for data size. Only + optimal for gaussian data and underestimates number of bins + for large non-gaussian datasets. + + 'sqrt' + Square root (of data size) estimator, used by Excel and + other programs for its speed and simplicity. + + range : (float, float), optional + The lower and upper range of the bins. If not provided, range + is simply ``(a.min(), a.max())``. 
Values outside the range are + ignored. The first element of the range must be less than or + equal to the second. `range` affects the automatic bin + computation as well. While bin width is computed to be optimal + based on the actual data within `range`, the bin count will fill + the entire range including portions containing no data. + normed : bool, optional + This keyword is deprecated in NumPy 1.6.0 due to confusing/buggy + behavior. It will be removed in NumPy 2.0.0. Use the ``density`` + keyword instead. If ``False``, the result will contain the + number of samples in each bin. If ``True``, the result is the + value of the probability *density* function at the bin, + normalized such that the *integral* over the range is 1. Note + that this latter behavior is known to be buggy with unequal bin + widths; use ``density`` instead. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unity + width are chosen; it is not a probability *mass* function. + + Overrides the ``normed`` keyword if given. + + Returns + ------- + hist : array + The values of the histogram. See `density` and `weights` for a + description of the possible semantics. + bin_edges : array of dtype float + Return the bin edges ``(length(hist)+1)``. + + + See Also + -------- + histogramdd, bincount, searchsorted, digitize + + Notes + ----- + All but the last (righthand-most) bin is half-open. In other words, + if `bins` is:: + + [1, 2, 3, 4] + + then the first bin is ``[1, 2)`` (including 1, but excluding 2) and + the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which + *includes* 4. + + .. versionadded:: 1.11.0 + + The methods to estimate the optimal number of bins are well founded + in literature, and are inspired by the choices R provides for + histogram visualisation. Note that having the number of bins + proportional to :math:`n^{1/3}` is asymptotically optimal, which is + why it appears in most estimators. These are simply plug-in methods + that give good starting points for number of bins. In the equations + below, :math:`h` is the binwidth and :math:`n_h` is the number of + bins. All estimators that compute bin counts are recast to bin width + using the `ptp` of the data. The final bin count is obtained from + ``np.round(np.ceil(range / h))`. + + 'Auto' (maximum of the 'Sturges' and 'FD' estimators) + A compromise to get a good value. For small datasets the Sturges + value will usually be chosen, while larger datasets will usually + default to FD. Avoids the overly conservative behaviour of FD + and Sturges for small and large datasets respectively. + Switchover point is usually :math:`a.size \approx 1000`. + + 'FD' (Freedman Diaconis Estimator) + .. math:: h = 2 \frac{IQR}{n^{1/3}} + + The binwidth is proportional to the interquartile range (IQR) + and inversely proportional to cube root of a.size. Can be too + conservative for small datasets, but is quite good for large + datasets. The IQR is very robust to outliers. + + 'Scott' + .. 
math:: h = \sigma \sqrt[3]{\frac{24 * \sqrt{\pi}}{n}} + + The binwidth is proportional to the standard deviation of the + data and inversely proportional to cube root of ``x.size``. Can + be too conservative for small datasets, but is quite good for + large datasets. The standard deviation is not very robust to + outliers. Values are very similar to the Freedman-Diaconis + estimator in the absence of outliers. + + 'Rice' + .. math:: n_h = 2n^{1/3} + + The number of bins is only proportional to cube root of + ``a.size``. It tends to overestimate the number of bins and it + does not take into account data variability. + + 'Sturges' + .. math:: n_h = \log _{2}n+1 + + The number of bins is the base 2 log of ``a.size``. This + estimator assumes normality of data and is too conservative for + larger, non-normal datasets. This is the default method in R's + ``hist`` method. + + 'Doane' + .. math:: n_h = 1 + \log_{2}(n) + + \log_{2}(1 + \frac{|g_1|}{\sigma_{g_1}}) + + g_1 = mean[(\frac{x - \mu}{\sigma})^3] + + \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}} + + An improved version of Sturges' formula that produces better + estimates for non-normal datasets. This estimator attempts to + account for the skew of the data. + + 'Sqrt' + .. math:: n_h = \sqrt n + The simplest and fastest estimator. Only takes into account the + data size. + + Examples + -------- + >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3]) + (array([0, 2, 1]), array([0, 1, 2, 3])) + >>> np.histogram(np.arange(4), bins=np.arange(5), density=True) + (array([ 0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4])) + >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3]) + (array([1, 4, 1]), array([0, 1, 2, 3])) + + >>> a = np.arange(5) + >>> hist, bin_edges = np.histogram(a, density=True) + >>> hist + array([ 0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5]) + >>> hist.sum() + 2.4999999999999996 + >>> np.sum(hist * np.diff(bin_edges)) + 1.0 + + .. versionadded:: 1.11.0 + + Automated Bin Selection Methods example, using 2 peak random data + with 2000 points: + + >>> import matplotlib.pyplot as plt + >>> rng = np.random.RandomState(10) # deterministic random data + >>> a = np.hstack((rng.normal(size=1000), + ... rng.normal(loc=5, scale=2, size=1000))) + >>> plt.hist(a, bins='auto') # arguments are passed to np.histogram + >>> plt.title("Histogram with 'auto' bins") + >>> plt.show() + + """ + a, weights = _ravel_and_check_weights(a, weights) + + bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights) + + # Histogram is an integer or a float array depending on the weights. + if weights is None: + ntype = np.dtype(np.intp) + else: + ntype = weights.dtype + + # We set a block size, as this allows us to iterate over chunks when + # computing histograms, to minimize memory usage. + BLOCK = 65536 + + # The fast path uses bincount, but that only works for certain types + # of weight + simple_weights = ( + weights is None or + np.can_cast(weights.dtype, np.double) or + np.can_cast(weights.dtype, complex) + ) + + if uniform_bins is not None and simple_weights: + # Fast algorithm for equal bins + # We now convert values of a to bin indices, under the assumption of + # equal bin widths (which is valid here). 
+        first_edge, last_edge, n_equal_bins = uniform_bins
+
+        # Initialize empty histogram
+        n = np.zeros(n_equal_bins, ntype)
+
+        # Pre-compute histogram scaling factor
+        norm = n_equal_bins / (last_edge - first_edge)
+
+        # We iterate over blocks here for two reasons: the first is that for
+        # large arrays, it is actually faster (for example, for a 10^8 array
+        # it is 2x as fast), and the second is that it results in a memory
+        # footprint 3x lower in the limit of large arrays.
+        for i in np.arange(0, len(a), BLOCK):
+            tmp_a = a[i:i+BLOCK]
+            if weights is None:
+                tmp_w = None
+            else:
+                tmp_w = weights[i:i + BLOCK]
+
+            # Only include values in the right range
+            keep = (tmp_a >= first_edge)
+            keep &= (tmp_a <= last_edge)
+            if not np.logical_and.reduce(keep):
+                tmp_a = tmp_a[keep]
+                if tmp_w is not None:
+                    tmp_w = tmp_w[keep]
+
+            # This cast ensures no type promotions occur below, which gh-10322
+            # made unpredictable. Getting it wrong leads to precision errors
+            # like gh-8123.
+            tmp_a = tmp_a.astype(bin_edges.dtype, copy=False)
+
+            # Compute the bin indices, and for values that lie exactly on
+            # last_edge we need to subtract one
+            f_indices = (tmp_a - first_edge) * norm
+            indices = f_indices.astype(np.intp)
+            indices[indices == n_equal_bins] -= 1
+
+            # The index computation is not guaranteed to give exactly
+            # consistent results within ~1 ULP of the bin edges.
+            decrement = tmp_a < bin_edges[indices]
+            indices[decrement] -= 1
+            # The last bin includes the right edge. The other bins do not.
+            increment = ((tmp_a >= bin_edges[indices + 1])
+                         & (indices != n_equal_bins - 1))
+            indices[increment] += 1
+
+            # We now compute the histogram using bincount
+            if ntype.kind == 'c':
+                n.real += np.bincount(indices, weights=tmp_w.real,
+                                      minlength=n_equal_bins)
+                n.imag += np.bincount(indices, weights=tmp_w.imag,
+                                      minlength=n_equal_bins)
+            else:
+                n += np.bincount(indices, weights=tmp_w,
+                                 minlength=n_equal_bins).astype(ntype)
+    else:
+        # Compute via cumulative histogram
+        cum_n = np.zeros(bin_edges.shape, ntype)
+        if weights is None:
+            for i in np.arange(0, len(a), BLOCK):
+                sa = np.sort(a[i:i+BLOCK])
+                cum_n += _search_sorted_inclusive(sa, bin_edges)
+        else:
+            zero = np.zeros(1, dtype=ntype)
+            for i in np.arange(0, len(a), BLOCK):
+                tmp_a = a[i:i+BLOCK]
+                tmp_w = weights[i:i+BLOCK]
+                sorting_index = np.argsort(tmp_a)
+                sa = tmp_a[sorting_index]
+                sw = tmp_w[sorting_index]
+                cw = np.concatenate((zero, sw.cumsum()))
+                bin_index = _search_sorted_inclusive(sa, bin_edges)
+                cum_n += cw[bin_index]
+
+        n = np.diff(cum_n)
+
+    # density overrides the normed keyword
+    if density is not None:
+        normed = False
+
+    if density:
+        db = np.array(np.diff(bin_edges), float)
+        return n/db/n.sum(), bin_edges
+    elif normed:
+        # deprecated, buggy behavior. Remove for NumPy 2.0.0
+        db = np.array(np.diff(bin_edges), float)
+        return n/(n*db).sum(), bin_edges
+    else:
+        return n, bin_edges
+
+
+def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
+    """
+    Compute the multidimensional histogram of some data.
+
+    Parameters
+    ----------
+    sample : array_like
+        The data to be histogrammed. It must be an (N,D) array or data
+        that can be converted to such. The rows of the resulting array
+        are the coordinates of points in a D dimensional polytope.
+    bins : sequence or int, optional
+        The bin specification:
+
+        * A sequence of arrays describing the bin edges along each dimension.
+        * The number of bins for each dimension (nx, ny, ... =bins)
+        * The number of bins for all dimensions (nx=ny=...=bins).
+
+    range : sequence, optional
+        A sequence of lower and upper bin edges to be used if the edges are
+        not given explicitly in `bins`. Defaults to the minimum and maximum
+        values along each dimension.
+    normed : bool, optional
+        If False, returns the number of samples in each bin. If True,
+        returns the bin density ``bin_count / sample_count / bin_volume``.
+    weights : (N,) array_like, optional
+        An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
+        Weights are normalized to 1 if normed is True. If normed is False,
+        the values of the returned histogram are equal to the sum of the
+        weights belonging to the samples falling into each bin.
+
+    Returns
+    -------
+    H : ndarray
+        The multidimensional histogram of sample x. See normed and weights
+        for the different possible semantics.
+    edges : list
+        A list of D arrays describing the bin edges for each dimension.
+
+    See Also
+    --------
+    histogram: 1-D histogram
+    histogram2d: 2-D histogram
+
+    Examples
+    --------
+    >>> r = np.random.randn(100,3)
+    >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
+    >>> H.shape, edges[0].size, edges[1].size, edges[2].size
+    ((5, 8, 4), 6, 9, 5)
+
+    """
+
+    try:
+        # Sample is an ND-array.
+        N, D = sample.shape
+    except (AttributeError, ValueError):
+        # Sample is a sequence of 1D arrays.
+        sample = np.atleast_2d(sample).T
+        N, D = sample.shape
+
+    nbin = np.empty(D, int)
+    edges = D*[None]
+    dedges = D*[None]
+    if weights is not None:
+        weights = np.asarray(weights)
+
+    try:
+        M = len(bins)
+        if M != D:
+            raise ValueError(
+                'The dimension of bins must be equal to the dimension of '
+                'the sample x.')
+    except TypeError:
+        # bins is an integer
+        bins = D*[bins]
+
+    # Select range for each dimension
+    # Used only if number of bins is given.
+    if range is None:
+        # Handle empty input. Range can't be determined in that case, use 0-1.
+        if N == 0:
+            smin = np.zeros(D)
+            smax = np.ones(D)
+        else:
+            smin = np.atleast_1d(np.array(sample.min(0), float))
+            smax = np.atleast_1d(np.array(sample.max(0), float))
+    else:
+        if not np.all(np.isfinite(range)):
+            raise ValueError(
+                'range parameter must be finite.')
+        smin = np.zeros(D)
+        smax = np.zeros(D)
+        for i in np.arange(D):
+            smin[i], smax[i] = range[i]
+
+    # Make sure the bins have a finite width.
+    for i in np.arange(len(smin)):
+        if smin[i] == smax[i]:
+            smin[i] = smin[i] - .5
+            smax[i] = smax[i] + .5
+
+    # avoid rounding issues for comparisons when dealing with inexact types
+    if np.issubdtype(sample.dtype, np.inexact):
+        edge_dt = sample.dtype
+    else:
+        edge_dt = float
+    # Create edge arrays
+    for i in np.arange(D):
+        if np.isscalar(bins[i]):
+            if bins[i] < 1:
+                raise ValueError(
+                    "Element at index %s in `bins` should be a positive "
+                    "integer." % i)
+            nbin[i] = bins[i] + 2  # +2 for outlier bins
+            edges[i] = np.linspace(smin[i], smax[i], nbin[i]-1, dtype=edge_dt)
+        else:
+            edges[i] = np.asarray(bins[i], edge_dt)
+            nbin[i] = len(edges[i]) + 1  # +1 for outlier bins
+        dedges[i] = np.diff(edges[i])
+        if np.any(np.asarray(dedges[i]) <= 0):
+            raise ValueError(
+                "Found bin edge of size <= 0. Did you specify `bins` with "
+                "a non-monotonic sequence?")
+
+    nbin = np.asarray(nbin)
+
+    # Handle empty input.
+    if N == 0:
+        return np.zeros(nbin-2), edges
+
+    # Compute the bin number each sample falls into.
+    Ncount = {}
+    for i in np.arange(D):
+        Ncount[i] = np.digitize(sample[:, i], edges[i])
+
+    # Using digitize, values that fall on an edge are put in the right bin.
+ # For the rightmost bin, we want values equal to the right edge to be + # counted in the last bin, and not as an outlier. + for i in np.arange(D): + # Rounding precision + mindiff = dedges[i].min() + if not np.isinf(mindiff): + decimal = int(-np.log10(mindiff)) + 6 + # Find which points are on the rightmost edge. + not_smaller_than_edge = (sample[:, i] >= edges[i][-1]) + on_edge = (np.around(sample[:, i], decimal) == + np.around(edges[i][-1], decimal)) + # Shift these points one bin to the left. + Ncount[i][np.nonzero(on_edge & not_smaller_than_edge)[0]] -= 1 + + # Flattened histogram matrix (1D) + # Reshape is used so that overlarge arrays + # will raise an error. + hist = np.zeros(nbin, float).reshape(-1) + + # Compute the sample indices in the flattened histogram matrix. + ni = nbin.argsort() + xy = np.zeros(N, int) + for i in np.arange(0, D-1): + xy += Ncount[ni[i]] * nbin[ni[i+1:]].prod() + xy += Ncount[ni[-1]] + + # Compute the number of repetitions in xy and assign it to the + # flattened histmat. + if len(xy) == 0: + return np.zeros(nbin-2, int), edges + + flatcount = np.bincount(xy, weights) + a = np.arange(len(flatcount)) + hist[a] = flatcount + + # Shape into a proper matrix + hist = hist.reshape(np.sort(nbin)) + for i in np.arange(nbin.size): + j = ni.argsort()[i] + hist = hist.swapaxes(i, j) + ni[i], ni[j] = ni[j], ni[i] + + # Remove outliers (indices 0 and -1 for each dimension). + core = D*[slice(1, -1)] + hist = hist[core] + + # Normalize if normed is True + if normed: + s = hist.sum() + for i in np.arange(D): + shape = np.ones(D, int) + shape[i] = nbin[i] - 2 + hist = hist / dedges[i].reshape(shape) + hist /= s + + if (hist.shape != nbin - 2).any(): + raise RuntimeError( + "Internal Shape Error") + return hist, edges diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index 22b295b9d..16e363d7c 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -23,7 +23,7 @@ from __future__ import division, absolute_import, print_function import warnings import numpy as np -from numpy.lib.function_base import _ureduce as _ureduce +from numpy.lib import function_base __all__ = [ @@ -198,8 +198,8 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue): a : array_like Array containing numbers whose minimum is desired. If `a` is not an array, a conversion is attempted. - axis : int, optional - Axis along which the minimum is computed. The default is to compute + axis : {int, tuple of int, None}, optional + Axis or axes along which the minimum is computed. The default is to compute the minimum of the flattened array. out : ndarray, optional Alternate output array in which to place the result. The default @@ -306,8 +306,8 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue): a : array_like Array containing numbers whose maximum is desired. If `a` is not an array, a conversion is attempted. - axis : int, optional - Axis along which the maximum is computed. The default is to compute + axis : {int, tuple of int, None}, optional + Axis or axes along which the maximum is computed. The default is to compute the maximum of the flattened array. out : ndarray, optional Alternate output array in which to place the result. The default @@ -505,8 +505,8 @@ def nansum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): a : array_like Array containing numbers whose sum is desired. If `a` is not an array, a conversion is attempted. - axis : int, optional - Axis along which the sum is computed. 
The default is to compute the + axis : {int, tuple of int, None}, optional + Axis or axes along which the sum is computed. The default is to compute the sum of the flattened array. dtype : data-type, optional The type of the returned array and of the accumulator in which the @@ -596,8 +596,8 @@ def nanprod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): a : array_like Array containing numbers whose product is desired. If `a` is not an array, a conversion is attempted. - axis : int, optional - Axis along which the product is computed. The default is to compute + axis : {int, tuple of int, None}, optional + Axis or axes along which the product is computed. The default is to compute the product of the flattened array. dtype : data-type, optional The type of the returned array and of the accumulator in which the @@ -791,8 +791,8 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue): a : array_like Array containing numbers whose mean is desired. If `a` is not an array, a conversion is attempted. - axis : int, optional - Axis along which the means are computed. The default is to compute + axis : {int, tuple of int, None}, optional + Axis or axes along which the means are computed. The default is to compute the mean of the flattened array. dtype : data-type, optional Type to use in computing the mean. For integer inputs, the default @@ -1017,8 +1017,8 @@ def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=np._NoValu if a.size == 0: return np.nanmean(a, axis, out=out, keepdims=keepdims) - r, k = _ureduce(a, func=_nanmedian, axis=axis, out=out, - overwrite_input=overwrite_input) + r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out, + overwrite_input=overwrite_input) if keepdims and keepdims is not np._NoValue: return r.reshape(k) else: @@ -1038,7 +1038,8 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False, Parameters ---------- a : array_like - Input array or object that can be converted to an array. + Input array or object that can be converted to an array, containing + nan values to be ignored. q : array_like of float Percentile or sequence of percentiles to compute, which must be between 0 and 100 inclusive. 
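The hunk that follows only reroutes nanpercentile through the new private quantile helpers; its public behaviour is unchanged. A doctest-style sketch of that behaviour (the array values here are illustrative):

    >>> a = np.array([[10., 7., 4.], [3., 2., np.nan]])
    >>> np.nanpercentile(a, 50)
    4.0
    >>> np.nanpercentile(a, 50, axis=0)
    array([ 6.5,  4.5,  4. ])

The NaN is dropped before the percentile of the remaining values is computed.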
@@ -1134,36 +1135,44 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
     >>> assert not np.all(a==b)
 
     """
-    a = np.asanyarray(a)
-    q = np.asanyarray(q)
+    q = np.true_divide(q, 100.0)  # handles the asarray for us too
+    if not function_base._quantile_is_valid(q):
+        raise ValueError("Percentiles must be in the range [0, 100]")
+    return _nanquantile_unchecked(
+        a, q, axis, out, overwrite_input, interpolation, keepdims)
+
+
+def _nanquantile_unchecked(a, q, axis=None, out=None, overwrite_input=False,
+                           interpolation='linear', keepdims=np._NoValue):
+    """Assumes that q is in [0, 1], and is an ndarray"""
     # apply_along_axis in _nanpercentile doesn't handle empty arrays well,
     # so deal with them upfront
     if a.size == 0:
         return np.nanmean(a, axis, out=out, keepdims=keepdims)
-    r, k = _ureduce(a, func=_nanpercentile, q=q, axis=axis, out=out,
-                    overwrite_input=overwrite_input,
-                    interpolation=interpolation)
+    r, k = function_base._ureduce(
+        a, func=_nanquantile_ureduce_func, q=q, axis=axis, out=out,
+        overwrite_input=overwrite_input, interpolation=interpolation
+    )
     if keepdims and keepdims is not np._NoValue:
         return r.reshape(q.shape + k)
     else:
         return r
 
 
-def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
-                   interpolation='linear'):
+def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False,
+                              interpolation='linear'):
     """
     Private function that doesn't support extended axis or keepdims.
     These methods are extended to this function using _ureduce
     See nanpercentile for parameter usage
-
     """
     if axis is None or a.ndim == 1:
         part = a.ravel()
-        result = _nanpercentile1d(part, q, overwrite_input, interpolation)
+        result = _nanquantile_1d(part, q, overwrite_input, interpolation)
     else:
-        result = np.apply_along_axis(_nanpercentile1d, axis, a, q,
+        result = np.apply_along_axis(_nanquantile_1d, axis, a, q,
                                      overwrite_input, interpolation)
         # apply_along_axis fills in collapsed axis with results.
         # Move that axis to the beginning to match percentile's
@@ -1176,9 +1185,9 @@ def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
     return result
 
 
-def _nanpercentile1d(arr1d, q, overwrite_input=False, interpolation='linear'):
+def _nanquantile_1d(arr1d, q, overwrite_input=False, interpolation='linear'):
     """
-    Private function for rank 1 arrays. Compute percentile ignoring NaNs.
+    Private function for rank 1 arrays. Compute quantile ignoring NaNs.
     See nanpercentile for parameter usage
     """
     arr1d, overwrite_input = _remove_nan_1d(arr1d,
@@ -1186,8 +1195,8 @@ def _nanpercentile1d(arr1d, q, overwrite_input=False, interpolation='linear'):
     if arr1d.size == 0:
         return np.full(q.shape, np.nan)[()]  # convert to scalar
 
-    return np.percentile(arr1d, q, overwrite_input=overwrite_input,
-                         interpolation=interpolation)
+    return function_base._quantile_unchecked(
+        arr1d, q, overwrite_input=overwrite_input, interpolation=interpolation)
 
 
 def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
@@ -1208,8 +1217,8 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
     a : array_like
         Array containing numbers whose variance is desired.  If `a` is not an
         array, a conversion is attempted.
-    axis : int, optional
-        Axis along which the variance is computed.  The default is to compute
+    axis : {int, tuple of int, None}, optional
+        Axis or axes along which the variance is computed.  The default is to compute
         the variance of the flattened array.
     dtype : data-type, optional
         Type to use in computing the variance.  For arrays of integer type
@@ -1350,8 +1359,8 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
     ----------
     a : array_like
         Calculate the standard deviation of the non-NaN values.
-    axis : int, optional
-        Axis along which the standard deviation is computed. The default is
+    axis : {int, tuple of int, None}, optional
+        Axis or axes along which the standard deviation is computed. The default is
         to compute the standard deviation of the flattened array.
     dtype : dtype, optional
         Type to use in computing the standard deviation. For arrays of
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 66dc68538..959574594 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -477,7 +477,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
     -----
     For a description of the ``.npy`` format, see the module docstring
     of `numpy.lib.format` or the NumPy Enhancement Proposal
-    http://docs.scipy.org/doc/numpy/neps/npy-format.html
+    http://numpy.github.io/neps/npy-format.html
 
     Examples
     --------
@@ -563,7 +563,7 @@ def savez(file, *args, **kwds):
     in the archive contains one variable in ``.npy`` format. For a
     description of the ``.npy`` format, see `numpy.lib.format` or the
     NumPy Enhancement Proposal
-    http://docs.scipy.org/doc/numpy/neps/npy-format.html
+    http://numpy.github.io/neps/npy-format.html
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
     returned. This is a dictionary-like object which can be queried for
@@ -644,7 +644,7 @@ def savez_compressed(file, *args, **kwds):
     ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one
     variable in ``.npy`` format. For a description of the ``.npy`` format,
     see `numpy.lib.format` or the NumPy Enhancement Proposal
-    http://docs.scipy.org/doc/numpy/neps/npy-format.html
+    http://numpy.github.io/neps/npy-format.html
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
     returned. This is a dictionary-like object which can be queried for
@@ -797,22 +797,23 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         The string used to separate values. For backwards compatibility, byte
         strings will be decoded as 'latin1'. The default is whitespace.
     converters : dict, optional
-        A dictionary mapping column number to a function that will convert
-        that column to a float.  E.g., if column 0 is a date string:
-        ``converters = {0: datestr2num}``. Converters can also be used to
-        provide a default value for missing data (but see also `genfromtxt`):
-        ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None.
+        A dictionary mapping column number to a function that will parse the
+        column string into the desired value.  E.g., if column 0 is a date
+        string: ``converters = {0: datestr2num}``.  Converters can also be
+        used to provide a default value for missing data (but see also
+        `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
+        Default: None.
     skiprows : int, optional
         Skip the first `skiprows` lines; default: 0.
     usecols : int or sequence, optional
         Which columns to read, with 0 being the first. For example,
-        usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
+        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
         The default, None, results in all columns being read.
 
         .. versionchanged:: 1.11.0
             When a single column has to be read it is possible to use
            an integer instead of a tuple.  E.g. ``usecols = 3`` reads the
-            fourth column the same way as `usecols = (3,)`` would.
+            fourth column the same way as ``usecols = (3,)`` would.
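A doctest-style sketch combining the `converters` and `usecols` parameters documented above (the CSV snippet and the -99 sentinel are illustrative only):

    >>> from io import StringIO
    >>> s = StringIO(u"1,,3\n4,5,6")
    >>> np.loadtxt(s, delimiter=',', usecols=(1,),
    ...            converters={1: lambda x: float(x.strip() or -99)})
    array([-99.,   5.])

The converter supplies a default for the missing field in the second column, and `usecols` keeps only that column.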
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``.  When used with a structured
@@ -827,7 +828,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
        Encoding used to decode the input file. Does not apply to input streams.
        The special value 'bytes' enables backward compatibility workarounds
        that ensure you receive byte arrays as results if possible and passes
-        latin1 encoded strings to converters. Override this value to receive
+        'latin1' encoded strings to converters. Override this value to receive
        unicode arrays and pass strings as input to converters.  If set to None
        the system default is used. The default value is 'bytes'.
@@ -1108,6 +1109,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     finally:
         if fown:
             fh.close()
+    # recursive closures have a cyclic reference to themselves, which
+    # requires gc to collect (gh-10620). To avoid this problem, for
+    # performance and PyPy friendliness, we break the cycle:
+    flatten_dtype_internal = None
+    pack_items = None
 
     if X is None:
         X = np.array([], dtype)
@@ -1459,9 +1465,9 @@ def fromregex(file, regexp, dtype, encoding=None):
         dtype = np.dtype(dtype)
 
     content = file.read()
-    if isinstance(content, bytes) and not isinstance(regexp, bytes):
+    if isinstance(content, bytes) and isinstance(regexp, np.unicode):
         regexp = asbytes(regexp)
-    elif not isinstance(content, bytes) and isinstance(regexp, bytes):
+    elif isinstance(content, np.unicode) and isinstance(regexp, bytes):
         regexp = asstr(regexp)
 
     if not hasattr(regexp, 'match'):
@@ -2049,7 +2055,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
 
         strcolidx = [i for (i, v) in enumerate(column_types)
                      if v == np.unicode_]
-        type_str = np.unicode_
 
         if byte_converters and strcolidx:
             # convert strings back to bytes for backward compatibility
             warnings.warn(
@@ -2065,33 +2070,37 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             try:
                 data = [encode_unicode_cols(r) for r in data]
-                type_str = np.bytes_
             except UnicodeEncodeError:
                 pass
+            else:
+                for i in strcolidx:
+                    column_types[i] = np.bytes_
+
+        # Update string types to be the right length
+        sized_column_types = column_types[:]
+        for i, col_type in enumerate(column_types):
+            if np.issubdtype(col_type, np.character):
+                n_chars = max(len(row[i]) for row in data)
+                sized_column_types[i] = (col_type, n_chars)
 
-        # ... and take the largest number of chars.
- for i in strcolidx: - max_line_length = max(len(row[i]) for row in data) - column_types[i] = np.dtype((type_str, max_line_length)) - # if names is None: - # If the dtype is uniform, don't define names, else use '' - base = set([c.type for c in converters if c._checked]) + # If the dtype is uniform (before sizing strings) + base = set([ + c_type + for c, c_type in zip(converters, column_types) + if c._checked]) if len(base) == 1: - if strcolidx: - (ddtype, mdtype) = (type_str, bool) - else: - (ddtype, mdtype) = (list(base)[0], bool) + uniform_type, = base + (ddtype, mdtype) = (uniform_type, bool) else: ddtype = [(defaultfmt % i, dt) - for (i, dt) in enumerate(column_types)] + for (i, dt) in enumerate(sized_column_types)] if usemask: mdtype = [(defaultfmt % i, bool) - for (i, dt) in enumerate(column_types)] + for (i, dt) in enumerate(sized_column_types)] else: - ddtype = list(zip(names, column_types)) - mdtype = list(zip(names, [bool] * len(column_types))) + ddtype = list(zip(names, sized_column_types)) + mdtype = list(zip(names, [bool] * len(sized_column_types))) output = np.array(data, dtype=ddtype) if usemask: outputmask = np.array(masks, dtype=mdtype) diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py index f49b7e295..41b5e2f64 100644 --- a/numpy/lib/polynomial.py +++ b/numpy/lib/polynomial.py @@ -897,7 +897,7 @@ def polydiv(u, v): n = len(v) - 1 scale = 1. / v[0] q = NX.zeros((max(m - n + 1, 1),), w.dtype) - r = u.copy() + r = u.astype(w.dtype) for k in range(0, m-n+1): d = scale * r[k] q[k] = d diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py index b4787838d..8286834a4 100644 --- a/numpy/lib/tests/test_arraysetops.py +++ b/numpy/lib/tests/test_arraysetops.py @@ -4,6 +4,8 @@ from __future__ import division, absolute_import, print_function import numpy as np +import sys + from numpy.testing import ( run_module_suite, assert_array_equal, assert_equal, assert_raises, ) @@ -247,6 +249,14 @@ class TestSetOps(object): c = union1d(a, b) assert_array_equal(c, ec) + # Tests gh-10340, arguments to union1d should be + # flattened if they are not already 1D + x = np.array([[0, 1, 2], [3, 4, 5]]) + y = np.array([0, 1, 2, 3, 4]) + ez = np.array([0, 1, 2, 3, 4, 5]) + z = union1d(x, y) + assert_array_equal(z, ez) + assert_array_equal([], union1d([], [])) def test_setdiff1d(self): @@ -401,8 +411,8 @@ class TestUnique(object): assert_raises(TypeError, self._run_axis_tests, [('a', int), ('b', object)]) - assert_raises(ValueError, unique, np.arange(10), axis=2) - assert_raises(ValueError, unique, np.arange(10), axis=-2) + assert_raises(np.AxisError, unique, np.arange(10), axis=2) + assert_raises(np.AxisError, unique, np.arange(10), axis=-2) def test_unique_axis_list(self): msg = "Unique failed on list of lists" @@ -445,6 +455,15 @@ class TestUnique(object): assert_array_equal(v.data, v2.data, msg) assert_array_equal(v.mask, v2.mask, msg) + def test_unique_sort_order_with_axis(self): + # These tests fail if sorting along axis is done by treating subarrays + # as unsigned byte strings. See gh-10495. 
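In doctest form, the behaviour this new test locks in looks like the following sketch (int8 stands in for the 'bhilq' dtypes the loop below exercises):

    >>> a = np.array([[-1], [0]], dtype=np.int8)
    >>> np.unique(a, axis=0)
    array([[-1],
           [ 0]], dtype=int8)

A byte-wise comparison would put the -1 row last, since its unsigned byte pattern is 0xff.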
+ fmt = "sort order incorrect for integer type '%s'" + for dt in 'bhilq': + a = np.array([[-1],[0]], dt) + b = np.unique(a, axis=0) + assert_array_equal(a, b, fmt % dt) + def _run_axis_tests(self, dtype): data = np.array([[0, 1, 0, 0], [1, 0, 0, 0], diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py index 2d2b4cea2..d3bd2cef7 100644 --- a/numpy/lib/tests/test_format.py +++ b/numpy/lib/tests/test_format.py @@ -454,20 +454,20 @@ def assert_equal_(o1, o2): def test_roundtrip(): for arr in basic_arrays + record_arrays: arr2 = roundtrip(arr) - yield assert_array_equal, arr, arr2 + assert_array_equal(arr, arr2) def test_roundtrip_randsize(): for arr in basic_arrays + record_arrays: if arr.dtype != object: arr2 = roundtrip_randsize(arr) - yield assert_array_equal, arr, arr2 + assert_array_equal(arr, arr2) def test_roundtrip_truncated(): for arr in basic_arrays: if arr.dtype != object: - yield assert_raises, ValueError, roundtrip_truncated, arr + assert_raises(ValueError, roundtrip_truncated, arr) def test_long_str(): @@ -508,7 +508,7 @@ def test_memmap_roundtrip(): fp = open(mfn, 'rb') memmap_bytes = fp.read() fp.close() - yield assert_equal_, normal_bytes, memmap_bytes + assert_equal_(normal_bytes, memmap_bytes) # Check that reading the file using memmap works. ma = format.open_memmap(nfn, mode='r') @@ -728,13 +728,13 @@ def test_read_magic(): def test_read_magic_bad_magic(): for magic in malformed_magic: f = BytesIO(magic) - yield raises(ValueError)(format.read_magic), f + assert_raises(ValueError, format.read_array, f) def test_read_version_1_0_bad_magic(): for magic in bad_version_magic + malformed_magic: f = BytesIO(magic) - yield raises(ValueError)(format.read_array), f + assert_raises(ValueError, format.read_array, f) def test_bad_magic_args(): diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 8381c2465..49b450175 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -556,6 +556,9 @@ class TestPtp(object): assert_equal(b.ptp(axis=0), [5.0, 7.0, 7.0]) assert_equal(b.ptp(axis=-1), [6.0, 6.0, 6.0]) + assert_equal(b.ptp(axis=0, keepdims=True), [[5.0, 7.0, 7.0]]) + assert_equal(b.ptp(axis=(0,1), keepdims=True), [[8.0]]) + class TestCumsum(object): @@ -1617,518 +1620,6 @@ class TestSinc(object): assert_array_equal(y1, y3) -class TestHistogram(object): - - def setup(self): - pass - - def teardown(self): - pass - - def test_simple(self): - n = 100 - v = rand(n) - (a, b) = histogram(v) - # check if the sum of the bins equals the number of samples - assert_equal(np.sum(a, axis=0), n) - # check that the bin counts are evenly spaced when the data is from - # a linear function - (a, b) = histogram(np.linspace(0, 10, 100)) - assert_array_equal(a, 10) - - def test_one_bin(self): - # Ticket 632 - hist, edges = histogram([1, 2, 3, 4], [1, 2]) - assert_array_equal(hist, [2, ]) - assert_array_equal(edges, [1, 2]) - assert_raises(ValueError, histogram, [1, 2], bins=0) - h, e = histogram([1, 2], bins=1) - assert_equal(h, np.array([2])) - assert_allclose(e, np.array([1., 2.])) - - def test_normed(self): - # Check that the integral of the density equals 1. 
- n = 100 - v = rand(n) - a, b = histogram(v, normed=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - # Check with non-constant bin widths (buggy but backwards - # compatible) - v = np.arange(10) - bins = [0, 1, 5, 9, 10] - a, b = histogram(v, bins, normed=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - def test_density(self): - # Check that the integral of the density equals 1. - n = 100 - v = rand(n) - a, b = histogram(v, density=True) - area = np.sum(a * diff(b)) - assert_almost_equal(area, 1) - - # Check with non-constant bin widths - v = np.arange(10) - bins = [0, 1, 3, 6, 10] - a, b = histogram(v, bins, density=True) - assert_array_equal(a, .1) - assert_equal(np.sum(a * diff(b)), 1) - - # Variale bin widths are especially useful to deal with - # infinities. - v = np.arange(10) - bins = [0, 1, 3, 6, np.inf] - a, b = histogram(v, bins, density=True) - assert_array_equal(a, [.1, .1, .1, 0.]) - - # Taken from a bug report from N. Becker on the numpy-discussion - # mailing list Aug. 6, 2010. - counts, dmy = np.histogram( - [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True) - assert_equal(counts, [.25, 0]) - - def test_outliers(self): - # Check that outliers are not tallied - a = np.arange(10) + .5 - - # Lower outliers - h, b = histogram(a, range=[0, 9]) - assert_equal(h.sum(), 9) - - # Upper outliers - h, b = histogram(a, range=[1, 10]) - assert_equal(h.sum(), 9) - - # Normalization - h, b = histogram(a, range=[1, 9], normed=True) - assert_almost_equal((h * diff(b)).sum(), 1, decimal=15) - - # Weights - w = np.arange(10) + .5 - h, b = histogram(a, range=[1, 9], weights=w, normed=True) - assert_equal((h * diff(b)).sum(), 1) - - h, b = histogram(a, bins=8, range=[1, 9], weights=w) - assert_equal(h, w[1:-1]) - - def test_type(self): - # Check the type of the returned histogram - a = np.arange(10) + .5 - h, b = histogram(a) - assert_(np.issubdtype(h.dtype, np.integer)) - - h, b = histogram(a, normed=True) - assert_(np.issubdtype(h.dtype, np.floating)) - - h, b = histogram(a, weights=np.ones(10, int)) - assert_(np.issubdtype(h.dtype, np.integer)) - - h, b = histogram(a, weights=np.ones(10, float)) - assert_(np.issubdtype(h.dtype, np.floating)) - - def test_f32_rounding(self): - # gh-4799, check that the rounding of the edges works with float32 - x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32) - y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32) - counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100) - assert_equal(counts_hist.sum(), 3.) - - def test_weights(self): - v = rand(100) - w = np.ones(100) * 5 - a, b = histogram(v) - na, nb = histogram(v, normed=True) - wa, wb = histogram(v, weights=w) - nwa, nwb = histogram(v, weights=w, normed=True) - assert_array_almost_equal(a * 5, wa) - assert_array_almost_equal(na, nwa) - - # Check weights are properly applied. - v = np.linspace(0, 10, 10) - w = np.concatenate((np.zeros(5), np.ones(5))) - wa, wb = histogram(v, bins=np.arange(11), weights=w) - assert_array_almost_equal(wa, w) - - # Check with integer weights - wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1]) - assert_array_equal(wa, [4, 5, 0, 1]) - wa, wb = histogram( - [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True) - assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. 
* 4) - - # Check weights with non-uniform bin widths - a, b = histogram( - np.arange(9), [0, 1, 3, 6, 10], - weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True) - assert_almost_equal(a, [.2, .1, .1, .075]) - - def test_exotic_weights(self): - - # Test the use of weights that are not integer or floats, but e.g. - # complex numbers or object types. - - # Complex weights - values = np.array([1.3, 2.5, 2.3]) - weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2]) - - # Check with custom bins - wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) - assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) - - # Check with even bins - wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) - assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3])) - - # Decimal weights - from decimal import Decimal - values = np.array([1.3, 2.5, 2.3]) - weights = np.array([Decimal(1), Decimal(2), Decimal(3)]) - - # Check with custom bins - wa, wb = histogram(values, bins=[0, 2, 3], weights=weights) - assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) - - # Check with even bins - wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights) - assert_array_almost_equal(wa, [Decimal(1), Decimal(5)]) - - def test_no_side_effects(self): - # This is a regression test that ensures that values passed to - # ``histogram`` are unchanged. - values = np.array([1.3, 2.5, 2.3]) - np.histogram(values, range=[-10, 10], bins=100) - assert_array_almost_equal(values, [1.3, 2.5, 2.3]) - - def test_empty(self): - a, b = histogram([], bins=([0, 1])) - assert_array_equal(a, np.array([0])) - assert_array_equal(b, np.array([0, 1])) - - def test_error_binnum_type (self): - # Tests if right Error is raised if bins argument is float - vals = np.linspace(0.0, 1.0, num=100) - histogram(vals, 5) - assert_raises(TypeError, histogram, vals, 2.4) - - def test_finite_range(self): - # Normal ranges should be fine - vals = np.linspace(0.0, 1.0, num=100) - histogram(vals, range=[0.25,0.75]) - assert_raises(ValueError, histogram, vals, range=[np.nan,0.75]) - assert_raises(ValueError, histogram, vals, range=[0.25,np.inf]) - - def test_bin_edge_cases(self): - # Ensure that floating-point computations correctly place edge cases. 
- arr = np.array([337, 404, 739, 806, 1007, 1811, 2012]) - hist, edges = np.histogram(arr, bins=8296, range=(2, 2280)) - mask = hist > 0 - left_edges = edges[:-1][mask] - right_edges = edges[1:][mask] - for x, left, right in zip(arr, left_edges, right_edges): - assert_(x >= left) - assert_(x < right) - - def test_last_bin_inclusive_range(self): - arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.]) - hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5)) - assert_equal(hist[-1], 1) - - def test_unsigned_monotonicity_check(self): - # Ensures ValueError is raised if bins not increasing monotonically - # when bins contain unsigned values (see #9222) - arr = np.array([2]) - bins = np.array([1, 3, 1], dtype='uint64') - with assert_raises(ValueError): - hist, edges = np.histogram(arr, bins=bins) - - -class TestHistogramOptimBinNums(object): - """ - Provide test coverage when using provided estimators for optimal number of - bins - """ - - def test_empty(self): - estimator_list = ['fd', 'scott', 'rice', 'sturges', - 'doane', 'sqrt', 'auto'] - # check it can deal with empty data - for estimator in estimator_list: - a, b = histogram([], bins=estimator) - assert_array_equal(a, np.array([0])) - assert_array_equal(b, np.array([0, 1])) - - def test_simple(self): - """ - Straightforward testing with a mixture of linspace data (for - consistency). All test values have been precomputed and the values - shouldn't change - """ - # Some basic sanity checking, with some fixed data. - # Checking for the correct number of bins - basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, - 'doane': 8, 'sqrt': 8, 'auto': 7}, - 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, - 'doane': 12, 'sqrt': 23, 'auto': 10}, - 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, - 'doane': 17, 'sqrt': 71, 'auto': 17}} - - for testlen, expectedResults in basic_test.items(): - # Create some sort of non uniform data to test with - # (2 peak uniform mixture) - x1 = np.linspace(-10, -1, testlen // 5 * 2) - x2 = np.linspace(1, 10, testlen // 5 * 3) - x = np.concatenate((x1, x2)) - for estimator, numbins in expectedResults.items(): - a, b = np.histogram(x, estimator) - assert_equal(len(a), numbins, err_msg="For the {0} estimator " - "with datasize of {1}".format(estimator, testlen)) - - def test_small(self): - """ - Smaller datasets have the potential to cause issues with the data - adaptive methods, especially the FD method. All bin numbers have been - precalculated. 
- """ - small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1}, - 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2, - 'doane': 1, 'sqrt': 2}, - 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3, - 'doane': 3, 'sqrt': 2}} - - for testlen, expectedResults in small_dat.items(): - testdat = np.arange(testlen) - for estimator, expbins in expectedResults.items(): - a, b = np.histogram(testdat, estimator) - assert_equal(len(a), expbins, err_msg="For the {0} estimator " - "with datasize of {1}".format(estimator, testlen)) - - def test_incorrect_methods(self): - """ - Check a Value Error is thrown when an unknown string is passed in - """ - check_list = ['mad', 'freeman', 'histograms', 'IQR'] - for estimator in check_list: - assert_raises(ValueError, histogram, [1, 2, 3], estimator) - - def test_novariance(self): - """ - Check that methods handle no variance in data - Primarily for Scott and FD as the SD and IQR are both 0 in this case - """ - novar_dataset = np.ones(100) - novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1, - 'doane': 1, 'sqrt': 1, 'auto': 1} - - for estimator, numbins in novar_resultdict.items(): - a, b = np.histogram(novar_dataset, estimator) - assert_equal(len(a), numbins, err_msg="{0} estimator, " - "No Variance test".format(estimator)) - - def test_outlier(self): - """ - Check the FD, Scott and Doane with outliers. - - The FD estimates a smaller binwidth since it's less affected by - outliers. Since the range is so (artificially) large, this means more - bins, most of which will be empty, but the data of interest usually is - unaffected. The Scott estimator is more affected and returns fewer bins, - despite most of the variance being in one area of the data. The Doane - estimator lies somewhere between the other two. - """ - xcenter = np.linspace(-10, 10, 50) - outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter)) - - outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11} - - for estimator, numbins in outlier_resultdict.items(): - a, b = np.histogram(outlier_dataset, estimator) - assert_equal(len(a), numbins) - - def test_simple_range(self): - """ - Straightforward testing with a mixture of linspace data (for - consistency). Adding in a 3rd mixture that will then be - completely ignored. All test values have been precomputed and - the shouldn't change. - """ - # some basic sanity checking, with some fixed data. 
- # Checking for the correct number of bins - basic_test = { - 50: {'fd': 8, 'scott': 8, 'rice': 15, - 'sturges': 14, 'auto': 14}, - 500: {'fd': 15, 'scott': 16, 'rice': 32, - 'sturges': 20, 'auto': 20}, - 5000: {'fd': 33, 'scott': 33, 'rice': 69, - 'sturges': 27, 'auto': 33} - } - - for testlen, expectedResults in basic_test.items(): - # create some sort of non uniform data to test with - # (3 peak uniform mixture) - x1 = np.linspace(-10, -1, testlen // 5 * 2) - x2 = np.linspace(1, 10, testlen // 5 * 3) - x3 = np.linspace(-100, -50, testlen) - x = np.hstack((x1, x2, x3)) - for estimator, numbins in expectedResults.items(): - a, b = np.histogram(x, estimator, range = (-20, 20)) - msg = "For the {0} estimator".format(estimator) - msg += " with datasize of {0}".format(testlen) - assert_equal(len(a), numbins, err_msg=msg) - - def test_simple_weighted(self): - """ - Check that weighted data raises a TypeError - """ - estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto'] - for estimator in estimator_list: - assert_raises(TypeError, histogram, [1, 2, 3], - estimator, weights=[1, 2, 3]) - - -class TestHistogramdd(object): - - def test_simple(self): - x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], - [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]]) - H, edges = histogramdd(x, (2, 3, 3), - range=[[-1, 1], [0, 3], [0, 3]]) - answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]], - [[0, 1, 0], [0, 0, 1], [0, 0, 1]]]) - assert_array_equal(H, answer) - - # Check normalization - ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]] - H, edges = histogramdd(x, bins=ed, normed=True) - assert_(np.all(H == answer / 12.)) - - # Check that H has the correct shape. - H, edges = histogramdd(x, (2, 3, 4), - range=[[-1, 1], [0, 3], [0, 4]], - normed=True) - answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]], - [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]]) - assert_array_almost_equal(H, answer / 6., 4) - # Check that a sequence of arrays is accepted and H has the correct - # shape. - z = [np.squeeze(y) for y in split(x, 3, axis=1)] - H, edges = histogramdd( - z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]]) - answer = np.array([[[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 0], [1, 0]], - [[0, 1], [0, 0], [0, 0]], - [[0, 0], [0, 0], [0, 0]]]) - assert_array_equal(H, answer) - - Z = np.zeros((5, 5, 5)) - Z[list(range(5)), list(range(5)), list(range(5))] = 1. - H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5) - assert_array_equal(H, Z) - - def test_shape_3d(self): - # All possible permutations for bins of different lengths in 3D. - bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4), - (4, 5, 6)) - r = rand(10, 3) - for b in bins: - H, edges = histogramdd(r, b) - assert_(H.shape == b) - - def test_shape_4d(self): - # All possible permutations for bins of different lengths in 4D. 
- bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4), - (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6), - (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7), - (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5), - (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5), - (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4)) - - r = rand(10, 4) - for b in bins: - H, edges = histogramdd(r, b) - assert_(H.shape == b) - - def test_weights(self): - v = rand(100, 2) - hist, edges = histogramdd(v) - n_hist, edges = histogramdd(v, normed=True) - w_hist, edges = histogramdd(v, weights=np.ones(100)) - assert_array_equal(w_hist, hist) - w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True) - assert_array_equal(w_hist, n_hist) - w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2) - assert_array_equal(w_hist, 2 * hist) - - def test_identical_samples(self): - x = np.zeros((10, 2), int) - hist, edges = histogramdd(x, bins=2) - assert_array_equal(edges[0], np.array([-0.5, 0., 0.5])) - - def test_empty(self): - a, b = histogramdd([[], []], bins=([0, 1], [0, 1])) - assert_array_max_ulp(a, np.array([[0.]])) - a, b = np.histogramdd([[], [], []], bins=2) - assert_array_max_ulp(a, np.zeros((2, 2, 2))) - - def test_bins_errors(self): - # There are two ways to specify bins. Check for the right errors - # when mixing those. - x = np.arange(8).reshape(2, 4) - assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5]) - assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1]) - assert_raises( - ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]]) - assert_raises( - ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]]) - assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]])) - - def test_inf_edges(self): - # Test using +/-inf bin edges works. See #1788. - with np.errstate(invalid='ignore'): - x = np.arange(6).reshape(3, 2) - expected = np.array([[1, 0], [0, 1], [0, 1]]) - h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]]) - assert_allclose(h, expected) - h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])]) - assert_allclose(h, expected) - h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]]) - assert_allclose(h, expected) - - def test_rightmost_binedge(self): - # Test event very close to rightmost binedge. See Github issue #4266 - x = [0.9999999995] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) - x = [1.0] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) - x = [1.0000000001] - bins = [[0., 0.5, 1.0]] - hist, _ = histogramdd(x, bins=bins) - assert_(hist[0] == 0.0) - assert_(hist[1] == 1.) 
-        x = [1.0001]
-        bins = [[0., 0.5, 1.0]]
-        hist, _ = histogramdd(x, bins=bins)
-        assert_(hist[0] == 0.0)
-        assert_(hist[1] == 0.0)
-
-    def test_finite_range(self):
-        vals = np.random.random((100, 3))
-        histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]])
-        assert_raises(ValueError, histogramdd, vals,
-                      range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
-        assert_raises(ValueError, histogramdd, vals,
-                      range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
-
-
 class TestUnique(object):
 
     def test_simple(self):
@@ -2761,8 +2252,17 @@ class TestInterp(object):
         y = np.linspace(0, 1, 5)
         x0 = np.array(.3)
         assert_almost_equal(np.interp(x0, x, y), x0)
-        x0 = np.array(.3, dtype=object)
-        assert_almost_equal(np.interp(x0, x, y), .3)
+
+        xp = np.array([0, 2, 4])
+        fp = np.array([1, -1, 1])
+
+        actual = np.interp(np.array(1), xp, fp)
+        assert_equal(actual, 0)
+        assert_(isinstance(actual, np.float64))
+
+        actual = np.interp(np.array(4.5), xp, fp, period=4)
+        assert_equal(actual, 0.5)
+        assert_(isinstance(actual, np.float64))
 
     def test_if_len_x_is_small(self):
         xp = np.arange(0, 10, 0.0001)
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
new file mode 100644
index 000000000..a2c684a20
--- /dev/null
+++ b/numpy/lib/tests/test_histograms.py
@@ -0,0 +1,635 @@
+from __future__ import division, absolute_import, print_function
+
+import numpy as np
+
+from numpy.lib.histograms import histogram, histogramdd
+from numpy.testing import (
+    run_module_suite, assert_, assert_equal, assert_array_equal,
+    assert_almost_equal, assert_array_almost_equal, assert_raises,
+    assert_allclose, assert_array_max_ulp, assert_warns, assert_raises_regex,
+    dec, suppress_warnings, HAS_REFCOUNT,
+)
+
+
+class TestHistogram(object):
+
+    def setup(self):
+        pass
+
+    def teardown(self):
+        pass
+
+    def test_simple(self):
+        n = 100
+        v = np.random.rand(n)
+        (a, b) = histogram(v)
+        # check if the sum of the bins equals the number of samples
+        assert_equal(np.sum(a, axis=0), n)
+        # check that the bin counts are evenly spaced when the data is from
+        # a linear function
+        (a, b) = histogram(np.linspace(0, 10, 100))
+        assert_array_equal(a, 10)
+
+    def test_one_bin(self):
+        # Ticket 632
+        hist, edges = histogram([1, 2, 3, 4], [1, 2])
+        assert_array_equal(hist, [2, ])
+        assert_array_equal(edges, [1, 2])
+        assert_raises(ValueError, histogram, [1, 2], bins=0)
+        h, e = histogram([1, 2], bins=1)
+        assert_equal(h, np.array([2]))
+        assert_allclose(e, np.array([1., 2.]))
+
+    def test_normed(self):
+        # Check that the integral of the density equals 1.
+        n = 100
+        v = np.random.rand(n)
+        a, b = histogram(v, normed=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+        # Check with non-constant bin widths (buggy but backwards
+        # compatible)
+        v = np.arange(10)
+        bins = [0, 1, 5, 9, 10]
+        a, b = histogram(v, bins, normed=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+    def test_density(self):
+        # Check that the integral of the density equals 1.
+        n = 100
+        v = np.random.rand(n)
+        a, b = histogram(v, density=True)
+        area = np.sum(a * np.diff(b))
+        assert_almost_equal(area, 1)
+
+        # Check with non-constant bin widths
+        v = np.arange(10)
+        bins = [0, 1, 3, 6, 10]
+        a, b = histogram(v, bins, density=True)
+        assert_array_equal(a, .1)
+        assert_equal(np.sum(a * np.diff(b)), 1)
+
+        # Variable bin widths are especially useful to deal with
+        # infinities.
+ v = np.arange(10) + bins = [0, 1, 3, 6, np.inf] + a, b = histogram(v, bins, density=True) + assert_array_equal(a, [.1, .1, .1, 0.]) + + # Taken from a bug report from N. Becker on the numpy-discussion + # mailing list Aug. 6, 2010. + counts, dmy = np.histogram( + [1, 2, 3, 4], [0.5, 1.5, np.inf], density=True) + assert_equal(counts, [.25, 0]) + + def test_outliers(self): + # Check that outliers are not tallied + a = np.arange(10) + .5 + + # Lower outliers + h, b = histogram(a, range=[0, 9]) + assert_equal(h.sum(), 9) + + # Upper outliers + h, b = histogram(a, range=[1, 10]) + assert_equal(h.sum(), 9) + + # Normalization + h, b = histogram(a, range=[1, 9], normed=True) + assert_almost_equal((h * np.diff(b)).sum(), 1, decimal=15) + + # Weights + w = np.arange(10) + .5 + h, b = histogram(a, range=[1, 9], weights=w, normed=True) + assert_equal((h * np.diff(b)).sum(), 1) + + h, b = histogram(a, bins=8, range=[1, 9], weights=w) + assert_equal(h, w[1:-1]) + + def test_type(self): + # Check the type of the returned histogram + a = np.arange(10) + .5 + h, b = histogram(a) + assert_(np.issubdtype(h.dtype, np.integer)) + + h, b = histogram(a, normed=True) + assert_(np.issubdtype(h.dtype, np.floating)) + + h, b = histogram(a, weights=np.ones(10, int)) + assert_(np.issubdtype(h.dtype, np.integer)) + + h, b = histogram(a, weights=np.ones(10, float)) + assert_(np.issubdtype(h.dtype, np.floating)) + + def test_f32_rounding(self): + # gh-4799, check that the rounding of the edges works with float32 + x = np.array([276.318359, -69.593948, 21.329449], dtype=np.float32) + y = np.array([5005.689453, 4481.327637, 6010.369629], dtype=np.float32) + counts_hist, xedges, yedges = np.histogram2d(x, y, bins=100) + assert_equal(counts_hist.sum(), 3.) + + def test_weights(self): + v = np.random.rand(100) + w = np.ones(100) * 5 + a, b = histogram(v) + na, nb = histogram(v, normed=True) + wa, wb = histogram(v, weights=w) + nwa, nwb = histogram(v, weights=w, normed=True) + assert_array_almost_equal(a * 5, wa) + assert_array_almost_equal(na, nwa) + + # Check weights are properly applied. + v = np.linspace(0, 10, 10) + w = np.concatenate((np.zeros(5), np.ones(5))) + wa, wb = histogram(v, bins=np.arange(11), weights=w) + assert_array_almost_equal(wa, w) + + # Check with integer weights + wa, wb = histogram([1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1]) + assert_array_equal(wa, [4, 5, 0, 1]) + wa, wb = histogram( + [1, 2, 2, 4], bins=4, weights=[4, 3, 2, 1], normed=True) + assert_array_almost_equal(wa, np.array([4, 5, 0, 1]) / 10. / 3. * 4) + + # Check weights with non-uniform bin widths + a, b = histogram( + np.arange(9), [0, 1, 3, 6, 10], + weights=[2, 1, 1, 1, 1, 1, 1, 1, 1], density=True) + assert_almost_equal(a, [.2, .1, .1, .075]) + + def test_exotic_weights(self): + + # Test the use of weights that are not integer or floats, but e.g. + # complex numbers or object types. 
+
+        # Complex weights
+        values = np.array([1.3, 2.5, 2.3])
+        weights = np.array([1, -1, 2]) + 1j * np.array([2, 1, 2])
+
+        # Check with custom bins
+        wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+        assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+        # Check with even bins
+        wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+        assert_array_almost_equal(wa, np.array([1, 1]) + 1j * np.array([2, 3]))
+
+        # Decimal weights
+        from decimal import Decimal
+        values = np.array([1.3, 2.5, 2.3])
+        weights = np.array([Decimal(1), Decimal(2), Decimal(3)])
+
+        # Check with custom bins
+        wa, wb = histogram(values, bins=[0, 2, 3], weights=weights)
+        assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+        # Check with even bins
+        wa, wb = histogram(values, bins=2, range=[1, 3], weights=weights)
+        assert_array_almost_equal(wa, [Decimal(1), Decimal(5)])
+
+    def test_no_side_effects(self):
+        # This is a regression test that ensures that values passed to
+        # ``histogram`` are unchanged.
+        values = np.array([1.3, 2.5, 2.3])
+        np.histogram(values, range=[-10, 10], bins=100)
+        assert_array_almost_equal(values, [1.3, 2.5, 2.3])
+
+    def test_empty(self):
+        a, b = histogram([], bins=([0, 1]))
+        assert_array_equal(a, np.array([0]))
+        assert_array_equal(b, np.array([0, 1]))
+
+    def test_error_binnum_type(self):
+        # Tests that the right error is raised if the bins argument is a float
+        vals = np.linspace(0.0, 1.0, num=100)
+        histogram(vals, 5)
+        assert_raises(TypeError, histogram, vals, 2.4)
+
+    def test_finite_range(self):
+        # Normal ranges should be fine
+        vals = np.linspace(0.0, 1.0, num=100)
+        histogram(vals, range=[0.25, 0.75])
+        assert_raises(ValueError, histogram, vals, range=[np.nan, 0.75])
+        assert_raises(ValueError, histogram, vals, range=[0.25, np.inf])
+
+    def test_bin_edge_cases(self):
+        # Ensure that floating-point computations correctly place edge cases.
+        arr = np.array([337, 404, 739, 806, 1007, 1811, 2012])
+        hist, edges = np.histogram(arr, bins=8296, range=(2, 2280))
+        mask = hist > 0
+        left_edges = edges[:-1][mask]
+        right_edges = edges[1:][mask]
+        for x, left, right in zip(arr, left_edges, right_edges):
+            assert_(x >= left)
+            assert_(x < right)
+
+    def test_last_bin_inclusive_range(self):
+        arr = np.array([0., 0., 0., 1., 2., 3., 3., 4., 5.])
+        hist, edges = np.histogram(arr, bins=30, range=(-0.5, 5))
+        assert_equal(hist[-1], 1)
+
+    def test_unsigned_monotonicity_check(self):
+        # Ensures ValueError is raised if bins not increasing monotonically
+        # when bins contain unsigned values (see #9222)
+        arr = np.array([2])
+        bins = np.array([1, 3, 1], dtype='uint64')
+        with assert_raises(ValueError):
+            hist, edges = np.histogram(arr, bins=bins)
+
+    def test_object_array_of_0d(self):
+        # gh-7864
+        assert_raises(ValueError,
+                      histogram, [np.array([0.4]) for i in range(10)] + [-np.inf])
+        assert_raises(ValueError,
+                      histogram, [np.array([0.4]) for i in range(10)] + [np.inf])
+
+        # these should not crash
+        np.histogram([np.array([0.5]) for i in range(10)] + [.500000000000001])
+        np.histogram([np.array([0.5]) for i in range(10)] + [.5])
+
+    def test_some_nan_values(self):
+        # gh-7503
+        one_nan = np.array([0, 1, np.nan])
+        all_nan = np.array([np.nan, np.nan])
+
+        # the internal comparisons with NaN give warnings
+        sup = suppress_warnings()
+        sup.filter(RuntimeWarning)
+        with sup:
+            # can't infer range with nan
+            assert_raises(ValueError, histogram, one_nan, bins='auto')
+            assert_raises(ValueError, histogram, all_nan, bins='auto')
+
+            # explicit range solves the problem
+            h, b = histogram(one_nan, bins='auto', range=(0, 1))
+            assert_equal(h.sum(), 2)  # nan is not counted
+            h, b = histogram(all_nan, bins='auto', range=(0, 1))
+            assert_equal(h.sum(), 0)  # nan is not counted
+
+            # as does an explicit set of bins
+            h, b = histogram(one_nan, bins=[0, 1])
+            assert_equal(h.sum(), 2)  # nan is not counted
+            h, b = histogram(all_nan, bins=[0, 1])
+            assert_equal(h.sum(), 0)  # nan is not counted
+
+    def test_datetime(self):
+        begin = np.datetime64('2000-01-01', 'D')
+        offsets = np.array([0, 0, 1, 1, 2, 3, 5, 10, 20])
+        bins = np.array([0, 2, 7, 20])
+        dates = begin + offsets
+        date_bins = begin + bins
+
+        td = np.dtype('timedelta64[D]')
+
+        # Results should be the same for integer offsets or datetime values.
+ # For now, only explicit bins are supported, since linspace does not + # work on datetimes or timedeltas + d_count, d_edge = histogram(dates, bins=date_bins) + t_count, t_edge = histogram(offsets.astype(td), bins=bins.astype(td)) + i_count, i_edge = histogram(offsets, bins=bins) + + assert_equal(d_count, i_count) + assert_equal(t_count, i_count) + + assert_equal((d_edge - begin).astype(int), i_edge) + assert_equal(t_edge.astype(int), i_edge) + + assert_equal(d_edge.dtype, dates.dtype) + assert_equal(t_edge.dtype, td) + + def do_precision_lower_bound(self, float_small, float_large): + eps = np.finfo(float_large).eps + + arr = np.array([1.0], float_small) + range = np.array([1.0 + eps, 2.0], float_large) + + # test is looking for behavior when the bounds change between dtypes + if range.astype(float_small)[0] != 1: + return + + # previously crashed + count, x_loc = np.histogram(arr, bins=1, range=range) + assert_equal(count, [1]) + + # gh-10322 means that the type comes from arr - this may change + assert_equal(x_loc.dtype, float_small) + + def do_precision_upper_bound(self, float_small, float_large): + eps = np.finfo(float_large).eps + + arr = np.array([1.0], float_small) + range = np.array([0.0, 1.0 - eps], float_large) + + # test is looking for behavior when the bounds change between dtypes + if range.astype(float_small)[-1] != 1: + return + + # previously crashed + count, x_loc = np.histogram(arr, bins=1, range=range) + assert_equal(count, [1]) + + # gh-10322 means that the type comes from arr - this may change + assert_equal(x_loc.dtype, float_small) + + def do_precision(self, float_small, float_large): + self.do_precision_lower_bound(float_small, float_large) + self.do_precision_upper_bound(float_small, float_large) + + def test_precision(self): + # not looping results in a useful stack trace upon failure + self.do_precision(np.half, np.single) + self.do_precision(np.half, np.double) + self.do_precision(np.half, np.longdouble) + self.do_precision(np.single, np.double) + self.do_precision(np.single, np.longdouble) + self.do_precision(np.double, np.longdouble) + + +class TestHistogramOptimBinNums(object): + """ + Provide test coverage when using provided estimators for optimal number of + bins + """ + + def test_empty(self): + estimator_list = ['fd', 'scott', 'rice', 'sturges', + 'doane', 'sqrt', 'auto'] + # check it can deal with empty data + for estimator in estimator_list: + a, b = histogram([], bins=estimator) + assert_array_equal(a, np.array([0])) + assert_array_equal(b, np.array([0, 1])) + + def test_simple(self): + """ + Straightforward testing with a mixture of linspace data (for + consistency). All test values have been precomputed and the values + shouldn't change + """ + # Some basic sanity checking, with some fixed data. 
+ # Checking for the correct number of bins
+ basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7,
+ 'doane': 8, 'sqrt': 8, 'auto': 7},
+ 500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10,
+ 'doane': 12, 'sqrt': 23, 'auto': 10},
+ 5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14,
+ 'doane': 17, 'sqrt': 71, 'auto': 17}}
+
+ for testlen, expectedResults in basic_test.items():
+ # Create some sort of non-uniform data to test with
+ # (2 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x = np.concatenate((x1, x2))
+ for estimator, numbins in expectedResults.items():
+ a, b = np.histogram(x, estimator)
+ assert_equal(len(a), numbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_small(self):
+ """
+ Smaller datasets have the potential to cause issues with the
+ data-adaptive methods, especially the FD method. All bin numbers have
+ been precalculated.
+ """
+ small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1},
+ 2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2,
+ 'doane': 1, 'sqrt': 2},
+ 3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3,
+ 'doane': 3, 'sqrt': 2}}
+
+ for testlen, expectedResults in small_dat.items():
+ testdat = np.arange(testlen)
+ for estimator, expbins in expectedResults.items():
+ a, b = np.histogram(testdat, estimator)
+ assert_equal(len(a), expbins, err_msg="For the {0} estimator "
+ "with datasize of {1}".format(estimator, testlen))
+
+ def test_incorrect_methods(self):
+ """
+ Check that a ValueError is raised when an unknown string is passed in
+ """
+ check_list = ['mad', 'freeman', 'histograms', 'IQR']
+ for estimator in check_list:
+ assert_raises(ValueError, histogram, [1, 2, 3], estimator)
+
+ def test_novariance(self):
+ """
+ Check that methods handle no variance in data.
+ Primarily for Scott and FD, as the SD and IQR are both 0 in this case.
+ """
+ novar_dataset = np.ones(100)
+ novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 1, 'sturges': 1,
+ 'doane': 1, 'sqrt': 1, 'auto': 1}
+
+ for estimator, numbins in novar_resultdict.items():
+ a, b = np.histogram(novar_dataset, estimator)
+ assert_equal(len(a), numbins, err_msg="{0} estimator, "
+ "No Variance test".format(estimator))
+
+ def test_outlier(self):
+ """
+ Check the FD, Scott and Doane estimators with outliers.
+
+ The FD estimator computes a smaller binwidth, since it is less
+ affected by outliers. Since the range is so (artificially) large,
+ this means more bins, most of which will be empty, but the data of
+ interest usually is unaffected. The Scott estimator is more affected
+ and returns fewer bins, despite most of the variance being in one
+ area of the data. The Doane estimator lies somewhere between the
+ other two.
+ """
+ xcenter = np.linspace(-10, 10, 50)
+ outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))
+
+ outlier_resultdict = {'fd': 21, 'scott': 5, 'doane': 11}
+
+ for estimator, numbins in outlier_resultdict.items():
+ a, b = np.histogram(outlier_dataset, estimator)
+ assert_equal(len(a), numbins)
+
+ def test_simple_range(self):
+ """
+ Straightforward testing with a mixture of linspace data (for
+ consistency). Adding in a 3rd mixture that will then be
+ completely ignored. All test values have been precomputed and
+ they shouldn't change.
+ """
+ # some basic sanity checking, with some fixed data. 
+ # Checking for the correct number of bins
+ basic_test = {
+ 50: {'fd': 8, 'scott': 8, 'rice': 15,
+ 'sturges': 14, 'auto': 14},
+ 500: {'fd': 15, 'scott': 16, 'rice': 32,
+ 'sturges': 20, 'auto': 20},
+ 5000: {'fd': 33, 'scott': 33, 'rice': 69,
+ 'sturges': 27, 'auto': 33}
+ }
+
+ for testlen, expectedResults in basic_test.items():
+ # create some sort of non-uniform data to test with
+ # (3 peak uniform mixture)
+ x1 = np.linspace(-10, -1, testlen // 5 * 2)
+ x2 = np.linspace(1, 10, testlen // 5 * 3)
+ x3 = np.linspace(-100, -50, testlen)
+ x = np.hstack((x1, x2, x3))
+ for estimator, numbins in expectedResults.items():
+ a, b = np.histogram(x, estimator, range=(-20, 20))
+ msg = "For the {0} estimator".format(estimator)
+ msg += " with datasize of {0}".format(testlen)
+ assert_equal(len(a), numbins, err_msg=msg)
+
+ def test_simple_weighted(self):
+ """
+ Check that weighted data raises a TypeError
+ """
+ estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
+ for estimator in estimator_list:
+ assert_raises(TypeError, histogram, [1, 2, 3],
+ estimator, weights=[1, 2, 3])
+
+
+class TestHistogramdd(object):
+
+ def test_simple(self):
+ x = np.array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5],
+ [.5, .5, 1.5], [.5, 1.5, 2.5], [.5, 2.5, 2.5]])
+ H, edges = histogramdd(x, (2, 3, 3),
+ range=[[-1, 1], [0, 3], [0, 3]])
+ answer = np.array([[[0, 1, 0], [0, 0, 1], [1, 0, 0]],
+ [[0, 1, 0], [0, 0, 1], [0, 0, 1]]])
+ assert_array_equal(H, answer)
+
+ # Check normalization
+ ed = [[-2, 0, 2], [0, 1, 2, 3], [0, 1, 2, 3]]
+ H, edges = histogramdd(x, bins=ed, normed=True)
+ assert_(np.all(H == answer / 12.))
+
+ # Check that H has the correct shape.
+ H, edges = histogramdd(x, (2, 3, 4),
+ range=[[-1, 1], [0, 3], [0, 4]],
+ normed=True)
+ answer = np.array([[[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]],
+ [[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0]]])
+ assert_array_almost_equal(H, answer / 6., 4)
+ # Check that a sequence of arrays is accepted and H has the correct
+ # shape.
+ z = [np.squeeze(y) for y in np.split(x, 3, axis=1)]
+ H, edges = histogramdd(
+ z, bins=(4, 3, 2), range=[[-2, 2], [0, 3], [0, 2]])
+ answer = np.array([[[0, 0], [0, 0], [0, 0]],
+ [[0, 1], [0, 0], [1, 0]],
+ [[0, 1], [0, 0], [0, 0]],
+ [[0, 0], [0, 0], [0, 0]]])
+ assert_array_equal(H, answer)
+
+ Z = np.zeros((5, 5, 5))
+ Z[list(range(5)), list(range(5)), list(range(5))] = 1.
+ H, edges = histogramdd([np.arange(5), np.arange(5), np.arange(5)], 5)
+ assert_array_equal(H, Z)
+
+ def test_shape_3d(self):
+ # All possible permutations for bins of different lengths in 3D.
+ bins = ((5, 4, 6), (6, 4, 5), (5, 6, 4), (4, 6, 5), (6, 5, 4),
+ (4, 5, 6))
+ r = np.random.rand(10, 3)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_shape_4d(self):
+ # All possible permutations for bins of different lengths in 4D. 
+ bins = ((7, 4, 5, 6), (4, 5, 7, 6), (5, 6, 4, 7), (7, 6, 5, 4),
+ (5, 7, 6, 4), (4, 6, 7, 5), (6, 5, 7, 4), (7, 5, 4, 6),
+ (7, 4, 6, 5), (6, 4, 7, 5), (6, 7, 5, 4), (4, 6, 5, 7),
+ (4, 7, 5, 6), (5, 4, 6, 7), (5, 7, 4, 6), (6, 7, 4, 5),
+ (6, 5, 4, 7), (4, 7, 6, 5), (4, 5, 6, 7), (7, 6, 4, 5),
+ (5, 4, 7, 6), (5, 6, 7, 4), (6, 4, 5, 7), (7, 5, 6, 4))
+
+ r = np.random.rand(10, 4)
+ for b in bins:
+ H, edges = histogramdd(r, b)
+ assert_(H.shape == b)
+
+ def test_weights(self):
+ v = np.random.rand(100, 2)
+ hist, edges = histogramdd(v)
+ n_hist, edges = histogramdd(v, normed=True)
+ w_hist, edges = histogramdd(v, weights=np.ones(100))
+ assert_array_equal(w_hist, hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100) * 2, normed=True)
+ assert_array_equal(w_hist, n_hist)
+ w_hist, edges = histogramdd(v, weights=np.ones(100, int) * 2)
+ assert_array_equal(w_hist, 2 * hist)
+
+ def test_identical_samples(self):
+ x = np.zeros((10, 2), int)
+ hist, edges = histogramdd(x, bins=2)
+ assert_array_equal(edges[0], np.array([-0.5, 0., 0.5]))
+
+ def test_empty(self):
+ a, b = histogramdd([[], []], bins=([0, 1], [0, 1]))
+ assert_array_max_ulp(a, np.array([[0.]]))
+ a, b = np.histogramdd([[], [], []], bins=2)
+ assert_array_max_ulp(a, np.zeros((2, 2, 2)))
+
+ def test_bins_errors(self):
+ # There are two ways to specify bins. Check for the right errors
+ # when mixing those.
+ x = np.arange(8).reshape(2, 4)
+ assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
+ assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
+ assert_raises(
+ ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
+ assert_raises(
+ ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
+ assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
+
+ def test_inf_edges(self):
+ # Test that using +/-inf bin edges works. See #1788.
+ with np.errstate(invalid='ignore'):
+ x = np.arange(6).reshape(3, 2)
+ expected = np.array([[1, 0], [0, 1], [0, 1]])
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 2, 10]])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, np.array([-1, 2, np.inf])])
+ assert_allclose(h, expected)
+ h, e = np.histogramdd(x, bins=[3, [-np.inf, 3, np.inf]])
+ assert_allclose(h, expected)
+
+ def test_rightmost_binedge(self):
+ # Test an event very close to the rightmost bin edge. See GitHub issue #4266.
+ x = [0.9999999995]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.)
+ x = [1.0000000001]
+ bins = [[0., 0.5, 1.0]]
+ hist, _ = histogramdd(x, bins=bins)
+ assert_(hist[0] == 0.0)
+ assert_(hist[1] == 1.) 
+ x = [1.0001] + bins = [[0., 0.5, 1.0]] + hist, _ = histogramdd(x, bins=bins) + assert_(hist[0] == 0.0) + assert_(hist[1] == 0.0) + + def test_finite_range(self): + vals = np.random.random((100, 3)) + histogramdd(vals, range=[[0.0, 1.0], [0.25, 0.75], [0.25, 0.5]]) + assert_raises(ValueError, histogramdd, vals, + range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]]) + assert_raises(ValueError, histogramdd, vals, + range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]]) + + +if __name__ == "__main__": + run_module_suite() diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py index b5e06dad0..0520ce580 100644 --- a/numpy/lib/tests/test_index_tricks.py +++ b/numpy/lib/tests/test_index_tricks.py @@ -245,7 +245,7 @@ class TestIndexExpression(object): class TestIx_(object): def test_regression_1(self): - # Test empty inputs create ouputs of indexing type, gh-5804 + # Test empty inputs create outputs of indexing type, gh-5804 # Test both lists and arrays for func in (range, np.arange): a, = np.ix_(func(0)) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 75a8e4968..ae40cf73b 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -376,7 +376,7 @@ class TestSaveTxt(object): lines = c.readlines() assert_equal(lines, [b'01 : 2.0\n', b'03 : 4.0\n']) - # Specify delimiter, should be overiden + # Specify delimiter, should be overridden c = BytesIO() np.savetxt(c, a, fmt='%02d : %3.1f', delimiter=',') c.seek(0) @@ -1073,6 +1073,13 @@ class Testfromregex(object): x = np.fromregex(path, regexp, dt, encoding='UTF-8') assert_array_equal(x, a) + def test_compiled_bytes(self): + regexp = re.compile(b'(\\d)') + c = BytesIO(b'123') + dt = [('num', np.float64)] + a = np.array([1, 2, 3], dtype=dt) + x = np.fromregex(c, regexp, dt) + assert_array_equal(x, a) #####-------------------------------------------------------------------------- @@ -1096,7 +1103,7 @@ class TestFromTxt(LoadTxtBase): assert_equal(test, control) def test_array(self): - # Test outputing a standard ndarray + # Test outputting a standard ndarray data = TextIO('1 2\n3 4') control = np.array([[1, 2], [3, 4]], dtype=int) test = np.ndfromtxt(data, dtype=int) @@ -1982,7 +1989,7 @@ M 33 21.99 utf8.encode(encoding) except (UnicodeError, ImportError): raise SkipTest('Skipping test_utf8_file_nodtype_unicode, ' - 'unable to encode utf8 in preferred encoding') + 'unable to encode utf8 in preferred encoding') with temppath() as path: with io.open(path, "wt") as f: @@ -2056,6 +2063,13 @@ M 33 21.99 assert_(isinstance(test, np.recarray)) assert_equal(test, control) + #gh-10394 + data = TextIO('color\n"red"\n"blue"') + test = np.recfromcsv(data, converters={0: lambda x: x.strip(b'\"')}) + control = np.array([('red',), ('blue',)], dtype=[('color', (bytes, 4))]) + assert_equal(test.dtype, control.dtype) + assert_equal(test, control) + def test_max_rows(self): # Test the `max_rows` keyword argument. data = '1 2\n3 4\n5 6\n7 8\n9 10\n' @@ -2226,7 +2240,7 @@ class TestPathUsage(object): @dec.skipif(Path is None, "No pathlib.Path") def test_ndfromtxt(self): - # Test outputing a standard ndarray + # Test outputting a standard ndarray with temppath(suffix='.txt') as path: path = Path(path) with path.open('w') as f: @@ -2292,7 +2306,7 @@ def test_gzip_load(): def test_gzip_loadtxt(): - # Thanks to another windows brokeness, we can't use + # Thanks to another windows brokenness, we can't use # NamedTemporaryFile: a file created from this function cannot be # reopened by another open call. 
So we first put the gzipped string # of the test reference array, write it to a securely opened file, diff --git a/numpy/lib/tests/test_polynomial.py b/numpy/lib/tests/test_polynomial.py index 9a4650825..03915cead 100644 --- a/numpy/lib/tests/test_polynomial.py +++ b/numpy/lib/tests/test_polynomial.py @@ -222,6 +222,14 @@ class TestDocs(object): assert_equal(p == p2, False) assert_equal(p != p2, True) + def test_polydiv(self): + b = np.poly1d([2, 6, 6, 1]) + a = np.poly1d([-1j, (1+2j), -(2+1j), 1]) + q, r = np.polydiv(b, a) + assert_equal(q.coeffs.dtype, np.complex128) + assert_equal(r.coeffs.dtype, np.complex128) + assert_equal(q*a + r, b) + def test_poly_coeffs_immutable(self): """ Coefficients should not be modifiable """ p = np.poly1d([1, 2, 3]) diff --git a/numpy/lib/tests/test_type_check.py b/numpy/lib/tests/test_type_check.py index 8945b61ea..ce8ef2f15 100644 --- a/numpy/lib/tests/test_type_check.py +++ b/numpy/lib/tests/test_type_check.py @@ -359,6 +359,7 @@ class TestNanToNum(object): assert_all(vals[0] < -1e10) and assert_all(np.isfinite(vals[0])) assert_(vals[1] == 0) assert_all(vals[2] > 1e10) and assert_all(np.isfinite(vals[2])) + assert_equal(type(vals), np.ndarray) # perform the same test but in-place with np.errstate(divide='ignore', invalid='ignore'): @@ -369,16 +370,27 @@ class TestNanToNum(object): assert_all(vals[0] < -1e10) and assert_all(np.isfinite(vals[0])) assert_(vals[1] == 0) assert_all(vals[2] > 1e10) and assert_all(np.isfinite(vals[2])) + assert_equal(type(vals), np.ndarray) + + def test_array(self): + vals = nan_to_num([1]) + assert_array_equal(vals, np.array([1], int)) + assert_equal(type(vals), np.ndarray) def test_integer(self): vals = nan_to_num(1) assert_all(vals == 1) - vals = nan_to_num([1]) - assert_array_equal(vals, np.array([1], int)) + assert_equal(type(vals), np.int_) + + def test_float(self): + vals = nan_to_num(1.0) + assert_all(vals == 1.0) + assert_equal(type(vals), np.float_) def test_complex_good(self): vals = nan_to_num(1+1j) assert_all(vals == 1+1j) + assert_equal(type(vals), np.complex_) def test_complex_bad(self): with np.errstate(divide='ignore', invalid='ignore'): @@ -387,6 +399,7 @@ class TestNanToNum(object): vals = nan_to_num(v) # !! This is actually (unexpectedly) zero assert_all(np.isfinite(vals)) + assert_equal(type(vals), np.complex_) def test_complex_bad2(self): with np.errstate(divide='ignore', invalid='ignore'): @@ -394,6 +407,7 @@ class TestNanToNum(object): v += np.array(-1+1.j)/0. vals = nan_to_num(v) assert_all(np.isfinite(vals)) + assert_equal(type(vals), np.complex_) # Fixme #assert_all(vals.imag > 1e10) and assert_all(np.isfinite(vals)) # !! This is actually (unexpectedly) positive diff --git a/numpy/lib/type_check.py b/numpy/lib/type_check.py index 5c7528d4f..1664e6ebb 100644 --- a/numpy/lib/type_check.py +++ b/numpy/lib/type_check.py @@ -215,7 +215,7 @@ def iscomplex(x): if issubclass(ax.dtype.type, _nx.complexfloating): return ax.imag != 0 res = zeros(ax.shape, bool) - return +res # convet to array-scalar if needed + return +res # convert to array-scalar if needed def isreal(x): """ @@ -330,7 +330,7 @@ def _getmaxmin(t): def nan_to_num(x, copy=True): """ - Replace nan with zero and inf with large finite numbers. + Replace NaN with zero and infinity with large finite numbers. 
If `x` is inexact, NaN is replaced by zero, and infinity and -infinity replaced by the respectively largest and most negative finite floating @@ -343,7 +343,7 @@ def nan_to_num(x, copy=True): Parameters ---------- - x : array_like + x : scalar or array_like Input data. copy : bool, optional Whether to create a copy of `x` (True) or to replace values @@ -374,6 +374,12 @@ def nan_to_num(x, copy=True): Examples -------- + >>> np.nan_to_num(np.inf) + 1.7976931348623157e+308 + >>> np.nan_to_num(-np.inf) + -1.7976931348623157e+308 + >>> np.nan_to_num(np.nan) + 0.0 >>> x = np.array([np.inf, -np.inf, np.nan, -128, 128]) >>> np.nan_to_num(x) array([ 1.79769313e+308, -1.79769313e+308, 0.00000000e+000, @@ -386,20 +392,21 @@ def nan_to_num(x, copy=True): """ x = _nx.array(x, subok=True, copy=copy) xtype = x.dtype.type + + isscalar = (x.ndim == 0) + if not issubclass(xtype, _nx.inexact): - return x + return x[()] if isscalar else x iscomplex = issubclass(xtype, _nx.complexfloating) - isscalar = (x.ndim == 0) - x = x[None] if isscalar else x dest = (x.real, x.imag) if iscomplex else (x,) maxf, minf = _getmaxmin(x.real.dtype) for d in dest: _nx.copyto(d, 0.0, where=isnan(d)) _nx.copyto(d, maxf, where=isposinf(d)) _nx.copyto(d, minf, where=isneginf(d)) - return x[0] if isscalar else x + return x[()] if isscalar else x #----------------------------------------------------------------------------- @@ -579,7 +586,7 @@ def common_type(*arrays): an integer array, the minimum precision type that is returned is a 64-bit floating point dtype. - All input arrays except int64 and uint64 can be safely cast to the + All input arrays except int64 and uint64 can be safely cast to the returned dtype without loss of information. Parameters diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py index e18eda0fb..1ecd334af 100644 --- a/numpy/lib/utils.py +++ b/numpy/lib/utils.py @@ -707,7 +707,7 @@ def lookfor(what, module=None, import_modules=True, regenerate=False, """ Do a keyword search on docstrings. - A list of of objects that matched the search is displayed, + A list of objects that matched the search is displayed, sorted by relevance. All given keywords need to be found in the docstring for it to be returned as a result, but the order does not matter. |
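
Note on the precomputed bin counts in TestHistogramOptimBinNums: they follow directly from the textbook bin-count rules. Below is a minimal cross-check sketch; the helper names are illustrative rather than NumPy internals, and the 'auto' line assumes the documented "larger of Sturges and FD" behaviour.

import numpy as np

def sturges_bins(x):
    # Sturges' rule: ceil(log2(n) + 1) bins
    return int(np.ceil(np.log2(x.size) + 1))

def sqrt_bins(x):
    # square-root rule: ceil(sqrt(n)) bins
    return int(np.ceil(np.sqrt(x.size)))

def rice_bins(x):
    # Rice rule: ceil(2 * n**(1/3)) bins
    return int(np.ceil(2 * x.size ** (1.0 / 3)))

def fd_bins(x):
    # Freedman-Diaconis: bin width 2*IQR*n**(-1/3), converted to a count
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    width = 2 * iqr * x.size ** (-1.0 / 3)
    if width == 0:
        return 1
    return int(np.ceil(x.ptp() / width))

# the two-peak fixture from test_simple, with testlen = 50
x = np.concatenate((np.linspace(-10, -1, 20), np.linspace(1, 10, 30)))
print(sturges_bins(x), sqrt_bins(x), rice_bins(x), fd_bins(x))  # 7 8 8 4
print(max(sturges_bins(x), fd_bins(x)))       # 7, the 'auto' entry
print(len(np.histogram(x, bins='auto')[0]))   # 7, matching basic_test

The same arithmetic reproduces the fd/sturges/rice/sqrt/auto columns of the other basic_test rows; Scott and Doane follow analogous width formulas not sketched here.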
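
The nan_to_num hunk above makes scalar inputs round-trip as scalars (via the 0-d `x[()]` indexing) instead of coming back as 0-d arrays. A quick sketch of the behaviour the new tests pin down, assuming the patched version:

import numpy as np

print(np.nan_to_num(np.nan))         # 0.0 -- a NumPy scalar, not a 0-d array
print(type(np.nan_to_num(1)))        # np.int_: non-inexact input passes through
print(np.nan_to_num([np.inf, np.nan]))  # [1.79769313e+308 0.00000000e+000]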
