diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2013-10-04 13:49:33 -0600 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2013-10-04 16:07:38 -0600 |
commit | 9c7e6e30315ab4541775d9a78630e88423c752c1 (patch) | |
tree | b6b4a74d215a577c65f3e1dbfb493beef9176df1 /numpy/lib/nanfunctions.py | |
parent | f665c617065ba59b1c517b8dd44080c83dbfd5bd (diff) | |
download | numpy-9c7e6e30315ab4541775d9a78630e88423c752c1.tar.gz |
BUG: Refactor nanfunctions to behave as agreed on for 1.9.
Deal with subclasses of ndarray, like pandas.Series and matrix.
Subclasses may not define the new keyword keepdims or deal
gracefully with ufuncs in all their forms. This is solved by
throwing the problem onto the np.sum, np.any, etc. functions
that have ugly hacks to deal with the problem.
Settle handling of all-nan slices.
nanmax, nanmin -- Raise warning, return NaN for slice.
nanargmax, nanargmin -- Raise ValueError.
nansum -- Return 0 for slice.
nanmean, nanvar, nanstd -- Raise warning, return NaN for slice.
Make NaN functions work for scalar arguments.
This may seem silly, but it removes a check for special cases.
Update tests
Deal with new all-nan handling.
Test with matrix class as example of subclass without keepdims.
Test with scalar arguments.
Fix nanvar issue reported in #3860.
Closes #3860 #3850
Diffstat (limited to 'numpy/lib/nanfunctions.py')
-rw-r--r-- | numpy/lib/nanfunctions.py | 316 |
1 files changed, 172 insertions, 144 deletions
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index 44cc2b163..5766084ab 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -13,23 +13,18 @@ Functions - `nanvar` -- variance of non-NaN values - `nanstd` -- standard deviation of non-NaN values -Classes -------- -- `NanWarning` -- Warning raised by nanfunctions - """ from __future__ import division, absolute_import, print_function import warnings import numpy as np + __all__ = [ 'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean', - 'nanvar', 'nanstd', 'NanWarning' + 'nanvar', 'nanstd' ] -class NanWarning(RuntimeWarning): pass - def _replace_nan(a, val): """ @@ -103,7 +98,8 @@ def _divide_by_count(a, b, out=None): Compute a/b ignoring invalid results. If `a` is an array the division is done in place. If `a` is a scalar, then its type is preserved in the output. If out is None, then then a is used instead so that the - division is in place. + division is in place. Note that this is only called with `a` an inexact + type. Parameters ---------- @@ -140,37 +136,38 @@ def _divide_by_count(a, b, out=None): def nanmin(a, axis=None, out=None, keepdims=False): """ - Return the minimum of an array or minimum along an axis, ignoring any - NaNs. + Return minimum of an array or minimum along an axis, ignoring any NaNs. + When all-NaN slices are encountered a ``RuntimeWarning`` is raised and + Nan is returned for that slice. Parameters ---------- a : array_like - Array containing numbers whose minimum is desired. If `a` is not - an array, a conversion is attempted. + Array containing numbers whose minimum is desired. If `a` is not an + array, a conversion is attempted. axis : int, optional Axis along which the minimum is computed. The default is to compute the minimum of the flattened array. out : ndarray, optional Alternate output array in which to place the result. The default is ``None``; if provided, it must have the same shape as the - expected output, but the type will be cast if necessary. - See `doc.ufuncs` for details. + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. .. versionadded:: 1.8.0 keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `a`. + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the original `a`. .. versionadded:: 1.8.0 Returns ------- nanmin : ndarray - An array with the same shape as `a`, with the specified axis removed. - If `a` is a 0-d array, or if axis is None, an ndarray scalar is - returned. The same dtype as `a` is returned. + An array with the same shape as `a`, with the specified axis + removed. If `a` is a 0-d array, or if axis is None, an ndarray + scalar is returned. The same dtype as `a` is returned. See Also -------- @@ -193,8 +190,8 @@ def nanmin(a, axis=None, out=None, keepdims=False): ----- Numpy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754). This means that Not a Number is not equivalent to infinity. - Positive infinity is treated as a very large number and negative infinity - is treated as a very small (i.e. negative) number. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. If the input has a integer type the function is equivalent to np.min. @@ -216,32 +213,51 @@ def nanmin(a, axis=None, out=None, keepdims=False): -inf """ - return np.fmin.reduce(a, axis=axis, out=out, keepdims=keepdims) + if not isinstance(a, np.ndarray) or type(a) is np.ndarray: + # Fast, but not safe for subclasses of ndarray + res = np.fmin.reduce(a, axis=axis, out=out, keepdims=keepdims) + if np.isnan(res).any(): + warnings.warn("All-NaN axis encountered", RuntimeWarning) + else: + # Slow, but safe for subclasses of ndarray + a, mask = _replace_nan(a, +np.inf) + res = np.amin(a, axis=axis, out=out, keepdims=keepdims) + if mask is None: + return res + + # Check for all-NaN axis + mask = np.all(mask, axis=axis, keepdims=keepdims) + if np.any(mask): + res = _copyto(res, mask, np.nan) + warnings.warn("All-NaN axis encountered", RuntimeWarning) + return res def nanmax(a, axis=None, out=None, keepdims=False): """ - Return the maximum of an array or maximum along an axis, ignoring any NaNs. + Return the maximum of an array or maximum along an axis, ignoring any + NaNs. When all-NaN slices are encountered a ``RuntimeWarning`` is + raised and NaN is returned for that slice. Parameters ---------- a : array_like - Array containing numbers whose maximum is desired. If `a` is not - an array, a conversion is attempted. + Array containing numbers whose maximum is desired. If `a` is not an + array, a conversion is attempted. axis : int, optional Axis along which the maximum is computed. The default is to compute the maximum of the flattened array. out : ndarray, optional Alternate output array in which to place the result. The default is ``None``; if provided, it must have the same shape as the - expected output, but the type will be cast if necessary. - See `doc.ufuncs` for details. + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. .. versionadded:: 1.8.0 keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `a`. + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the original `a`. .. versionadded:: 1.8.0 @@ -273,8 +289,8 @@ def nanmax(a, axis=None, out=None, keepdims=False): ----- Numpy uses the IEEE Standard for Binary Floating-Point for Arithmetic (IEEE 754). This means that Not a Number is not equivalent to infinity. - Positive infinity is treated as a very large number and negative infinity - is treated as a very small (i.e. negative) number. + Positive infinity is treated as a very large number and negative + infinity is treated as a very small (i.e. negative) number. If the input has a integer type the function is equivalent to np.max. @@ -296,15 +312,31 @@ def nanmax(a, axis=None, out=None, keepdims=False): inf """ - return np.fmax.reduce(a, axis=axis, out=out, keepdims=keepdims) + if not isinstance(a, np.ndarray) or type(a) is np.ndarray: + # Fast, but not safe for subclasses of ndarray + res = np.fmax.reduce(a, axis=axis, out=out, keepdims=keepdims) + if np.isnan(res).any(): + warnings.warn("All-NaN slice encountered", RuntimeWarning) + else: + # Slow, but safe for subclasses of ndarray + a, mask = _replace_nan(a, -np.inf) + res = np.amax(a, axis=axis, out=out, keepdims=keepdims) + if mask is None: + return res + + # Check for all-NaN axis + mask = np.all(mask, axis=axis, keepdims=keepdims) + if np.any(mask): + res = _copyto(res, mask, np.nan) + warnings.warn("All-NaN axis encountered", RuntimeWarning) + return res def nanargmin(a, axis=None): """ Return the indices of the minimum values in the specified axis ignoring - NaNs. For all-NaN slices, the negative number ``np.iinfo('intp').min`` - is returned. It is platform dependent. Warning: the results cannot be - trusted if a slice contains only NaNs and Infs. + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the results + cannot be trusted if a slice contains only NaNs and Infs. Parameters ---------- @@ -336,23 +368,19 @@ def nanargmin(a, axis=None): """ a, mask = _replace_nan(a, np.inf) - if mask is None: - return np.argmin(a, axis) - # May later want to do something special for all nan slices. - mask = mask.all(axis=axis) - ind = np.argmin(a, axis) - if mask.any(): - warnings.warn("All NaN axis detected.", NanWarning) - ind =_copyto(ind, np.iinfo(np.intp).min, mask) - return ind + res = np.argmin(a, axis=axis) + if mask is not None: + mask = np.all(mask, axis=axis) + if np.any(mask): + raise ValueError("All-NaN slice encountered") + return res def nanargmax(a, axis=None): """ Return the indices of the maximum values in the specified axis ignoring - NaNs. For all-NaN slices, the negative number ``np.iinfo('intp').min`` - is returned. It is platform dependent. Warning: the results cannot be - trusted if a slice contains only NaNs and -Infs. + NaNs. For all-NaN slices ``ValueError`` is raised. Warning: the + results cannot be trusted if a slice contains only NaNs and -Infs. Parameters @@ -385,24 +413,21 @@ def nanargmax(a, axis=None): """ a, mask = _replace_nan(a, -np.inf) - if mask is None: - return np.argmax(a, axis) - # May later want to do something special for all nan slices. - mask = mask.all(axis=axis) - ind = np.argmax(a, axis) - if mask.any(): - warnings.warn("All NaN axis detected.", NanWarning) - ind = _copyto(ind, np.iinfo(np.intp).min, mask) - return ind + res = np.argmax(a, axis=axis) + if mask is not None: + mask = np.all(mask, axis=axis) + if np.any(mask): + raise ValueError("All-NaN slice encountered") + return res def nansum(a, axis=None, dtype=None, out=None, keepdims=0): """ - Return the sum of array elements over a given axis treating - Not a Numbers (NaNs) as zero. + Return the sum of array elements over a given axis treating Not a + Numbers (NaNs) as zero. - In Numpy versions <= 1.8 Nan is returned for slices that - are all-NaN or empty. In later versions zero is returned. + In Numpy versions <= 1.8 Nan is returned for slices that are all-NaN or + empty. In later versions zero is returned. Parameters ---------- @@ -410,19 +435,23 @@ def nansum(a, axis=None, dtype=None, out=None, keepdims=0): Array containing numbers whose sum is desired. If `a` is not an array, a conversion is attempted. axis : int, optional - Axis along which the sum is computed. The default is to compute - the sum of the flattened array. + Axis along which the sum is computed. The default is to compute the + sum of the flattened array. dtype : data-type, optional - Type to use in computing the sum. For integer inputs, the default - is the same as `int64`. For inexact inputs, it must be inexact. + The type of the returned array and of the accumulator in which the + elements are summed. By default, the dtype of `a` is used. An + exception is when `a` has an integer type with less precision than + the platform (u)intp. In that case, the default will be either + (u)int32 or (u)int64 depending on whether the platform is 32 or 64 + bits. For inexact inputs, dtype must be inexact. .. versionadded:: 1.8.0 out : ndarray, optional Alternate output array in which to place the result. The default is ``None``. If provided, it must have the same shape as the - expected output, but the type will be cast if necessary. - See `doc.ufuncs` for details. The casting of NaN to integer can - yield unexpected results. + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. The casting of NaN to integer can yield + unexpected results. .. versionadded:: 1.8.0 keepdims : bool, optional @@ -444,17 +473,13 @@ def nansum(a, axis=None, dtype=None, out=None, keepdims=0): Notes ----- - Numpy uses the IEEE Standard for Binary Floating-Point for Arithmetic - (IEEE 754). This means that Not a Number is not equivalent to infinity. - If positive or negative infinity are present the result is positive or - negative infinity. But if both positive and negative infinity are present, - the result is Not A Number (NaN). - - Arithmetic is modular when using integer types (all elements of `a` must - be finite i.e. no elements that are NaNs, positive infinity and negative - infinity because NaNs are floating point types), and no error is raised - on overflow. + If both positive and negative infinity are present, the sum will be Not + A Number (NaN). + Numpy integer arithmetic is modular. If the size of a sum exceeds the + size of an integer accumulator, its value will wrap around and the + result will be incorrect. Specifying ``dtype=double`` can alleviate + that problem. Examples -------- @@ -478,7 +503,7 @@ def nansum(a, axis=None, dtype=None, out=None, keepdims=0): """ a, mask = _replace_nan(a, 0) - return a.sum(axis, dtype, out, keepdims) + return np.sum(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims) def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): @@ -489,7 +514,7 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): the flattened array by default, otherwise over the specified axis. `float64` intermediate and return values are used for integer inputs. - For all-NaN slices, NaN is returned and a `NanWarning` is raised. + For all-NaN slices, NaN is returned and a `RuntimeWarning` is raised. .. versionadded:: 1.8.0 @@ -503,17 +528,17 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): the mean of the flattened array. dtype : data-type, optional Type to use in computing the mean. For integer inputs, the default - is `float64`; for inexact inputs, it is the same as the - input dtype. + is `float64`; for inexact inputs, it is the same as the input + dtype. out : ndarray, optional Alternate output array in which to place the result. The default is ``None``; if provided, it must have the same shape as the - expected output, but the type will be cast if necessary. - See `doc.ufuncs` for details. + expected output, but the type will be cast if necessary. See + `doc.ufuncs` for details. keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `arr`. + If this is set to True, the axes which are reduced are left in the + result as dimensions with size one. With this option, the result + will broadcast correctly against the original `arr`. Returns ------- @@ -533,11 +558,11 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): The arithmetic mean is the sum of the non-NaN elements along the axis divided by the number of non-NaN elements. - Note that for floating-point input, the mean is computed using the - same precision the input has. Depending on the input data, this can - cause the results to be inaccurate, especially for `float32`. - Specifying a higher-precision accumulator using the `dtype` keyword - can alleviate this issue. + Note that for floating-point input, the mean is computed using the same + precision the input has. Depending on the input data, this can cause + the results to be inaccurate, especially for `float32`. Specifying a + higher-precision accumulator using the `dtype` keyword can alleviate + this issue. Examples -------- @@ -552,7 +577,7 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): """ arr, mask = _replace_nan(a, 0) if mask is None: - return np.mean(arr, axis, dtype=dtype, out=out, keepdims=keepdims) + return np.mean(arr, axis=axis, dtype=dtype, out=out, keepdims=keepdims) if dtype is not None: dtype = np.dtype(dtype) @@ -564,28 +589,28 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=False): # The warning context speeds things up. with warnings.catch_warnings(): warnings.simplefilter('ignore') - cnt = np.add.reduce(~mask, axis, dtype=np.intp, keepdims=keepdims) - tot = np.add.reduce(arr, axis, dtype=dtype, out=out, keepdims=keepdims) + cnt = np.sum(~mask, axis=axis, dtype=np.intp, keepdims=keepdims) + tot = np.sum(arr, axis=axis, dtype=dtype, out=out, keepdims=keepdims) avg = _divide_by_count(tot, cnt, out=out) isbad = (cnt == 0) if isbad.any(): - warnings.warn("Mean of empty slice", NanWarning) + warnings.warn("Mean of empty slice", RuntimeWarning) # NaN is the only possible bad value, so no further # action is needed to handle bad results. return avg -def nanvar(a, axis=None, dtype=None, out=None, ddof=0, - keepdims=False): +def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): """ Compute the variance along the specified axis, while ignoring NaNs. - Returns the variance of the array elements, a measure of the spread of a - distribution. The variance is computed for the flattened array by + Returns the variance of the array elements, a measure of the spread of + a distribution. The variance is computed for the flattened array by default, otherwise over the specified axis. - For all-NaN slices, NaN is returned and a `NanWarning` is raised. + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. .. versionadded:: 1.8.0 @@ -638,9 +663,9 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is specified, the divisor ``N - ddof`` is used instead. In standard statistical practice, ``ddof=1`` provides an - unbiased estimator of the variance of a hypothetical infinite population. - ``ddof=0`` provides a maximum likelihood estimate of the variance for - normally distributed variables. + unbiased estimator of the variance of a hypothetical infinite + population. ``ddof=0`` provides a maximum likelihood estimate of the + variance for normally distributed variables. Note that for complex numbers, the absolute value is taken before squaring, so that the result is always real and nonnegative. @@ -664,7 +689,8 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, """ arr, mask = _replace_nan(a, 0) if mask is None: - return np.var(arr, axis, dtype=dtype, out=out, keepdims=keepdims) + return np.var(arr, axis=axis, dtype=dtype, out=out, ddof=ddof, + keepdims=keepdims) if dtype is not None: dtype = np.dtype(dtype) @@ -677,30 +703,29 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, warnings.simplefilter('ignore') # Compute mean - cnt = np.add.reduce(~mask, axis, dtype=np.intp, keepdims=True) - tot = np.add.reduce(arr, axis, dtype=dtype, keepdims=True) - avg = np.divide(tot, cnt, out=tot) + cnt = np.sum(~mask, axis=axis, dtype=np.intp, keepdims=True) + avg = np.sum(arr, axis=axis, dtype=dtype, keepdims=True) + avg = _divide_by_count(avg, cnt) # Compute squared deviation from mean. - x = arr - avg - np.copyto(x, 0, where=mask) + arr -= avg + arr = _copyto(arr, 0, mask) if issubclass(arr.dtype.type, np.complexfloating): - sqr = np.multiply(x, x.conj(), out=x).real + sqr = np.multiply(arr, arr.conj(), out=arr).real else: - sqr = np.multiply(x, x, out=x) - - # adjust cnt. - if not keepdims: - cnt = cnt.squeeze(axis) - cnt -= ddof + sqr = np.multiply(arr, arr, out=arr) # Compute variance. - var = np.add.reduce(sqr, axis, dtype=dtype, out=out, keepdims=keepdims) - var = _divide_by_count(var, cnt) + var = np.sum(sqr, axis=axis, dtype=dtype, out=out, keepdims=keepdims) + if var.ndim < cnt.ndim: + # Subclasses of ndarray may ignore keepdims, so check here. + cnt = cnt.squeeze(axis) + dof = cnt - ddof + var = _divide_by_count(var, dof) - isbad = (cnt <= 0) - if isbad.any(): - warnings.warn("Degrees of freedom <= 0 for slice.", NanWarning) + isbad = (dof <= 0) + if np.any(isbad): + warnings.warn("Degrees of freedom <= 0 for slice.", RuntimeWarning) # NaN, inf, or negative numbers are all possible bad # values, so explicitly replace them with NaN. var = _copyto(var, np.nan, isbad) @@ -712,11 +737,13 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): Compute the standard deviation along the specified axis, while ignoring NaNs. - Returns the standard deviation, a measure of the spread of a distribution, - of the non-NaN array elements. The standard deviation is computed for the - flattened array by default, otherwise over the specified axis. + Returns the standard deviation, a measure of the spread of a + distribution, of the non-NaN array elements. The standard deviation is + computed for the flattened array by default, otherwise over the + specified axis. - For all-NaN slices, NaN is returned and a `NanWarning` is raised. + For all-NaN slices or slices with zero degrees of freedom, NaN is + returned and a `RuntimeWarning` is raised. .. versionadded:: 1.8.0 @@ -729,12 +756,12 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): to compute the standard deviation of the flattened array. dtype : dtype, optional Type to use in computing the standard deviation. For arrays of - integer type the default is float64, for arrays of float types it is - the same as the array type. + integer type the default is float64, for arrays of float types it + is the same as the array type. out : ndarray, optional Alternative output array in which to place the result. It must have - the same shape as the expected output but the type (of the calculated - values) will be cast if necessary. + the same shape as the expected output but the type (of the + calculated values) will be cast if necessary. ddof : int, optional Means Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of non-NaN @@ -761,26 +788,26 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): Notes ----- The standard deviation is the square root of the average of the squared - deviations from the mean, i.e., ``std = sqrt(mean(abs(x - x.mean())**2))``. + deviations from the mean: ``std = sqrt(mean(abs(x - x.mean())**2))``. The average squared deviation is normally calculated as - ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is specified, - the divisor ``N - ddof`` is used instead. In standard statistical - practice, ``ddof=1`` provides an unbiased estimator of the variance - of the infinite population. ``ddof=0`` provides a maximum likelihood - estimate of the variance for normally distributed variables. The - standard deviation computed in this function is the square root of + ``x.sum() / N``, where ``N = len(x)``. If, however, `ddof` is + specified, the divisor ``N - ddof`` is used instead. In standard + statistical practice, ``ddof=1`` provides an unbiased estimator of the + variance of the infinite population. ``ddof=0`` provides a maximum + likelihood estimate of the variance for normally distributed variables. + The standard deviation computed in this function is the square root of the estimated variance, so even with ``ddof=1``, it will not be an unbiased estimate of the standard deviation per se. - Note that, for complex numbers, `std` takes the absolute - value before squaring, so that the result is always real and nonnegative. + Note that, for complex numbers, `std` takes the absolute value before + squaring, so that the result is always real and nonnegative. For floating-point input, the *std* is computed using the same precision the input has. Depending on the input data, this can cause - the results to be inaccurate, especially for float32 (see example below). - Specifying a higher-accuracy accumulator using the `dtype` keyword can - alleviate this issue. + the results to be inaccurate, especially for float32 (see example + below). Specifying a higher-accuracy accumulator using the `dtype` + keyword can alleviate this issue. Examples -------- @@ -793,7 +820,8 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): array([ 0., 0.5]) """ - var = nanvar(a, axis, dtype, out, ddof, keepdims) + var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof, + keepdims=keepdims) if isinstance(var, np.ndarray): std = np.sqrt(var, out=var) else: |