diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2021-11-15 17:31:41 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-15 17:31:41 -0700 |
commit | 376ad691fe4df77e502108d279872f56b30376dc (patch) | |
tree | 11635deb5fb17746e7d18035d747edc7c1217a31 /numpy/lib/function_base.py | |
parent | b75fe5766c3739972e8f5c10a85f320f08e74d26 (diff) | |
parent | 546c47adae1066411ff7a3e3da5d758236ee90cf (diff) | |
download | numpy-376ad691fe4df77e502108d279872f56b30376dc.tar.gz |
Merge pull request #20327 from seberg/rename-interpolation
BUG,DEP: Fixup quantile/percentile and rename interpolation->method
Diffstat (limited to 'numpy/lib/function_base.py')
-rw-r--r-- | numpy/lib/function_base.py | 269 |
1 files changed, 158 insertions, 111 deletions
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 3c9983edf..a215f63d3 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -50,8 +50,8 @@ __all__ = [ 'quantile' ] -# _QuantileInterpolation is a dictionary listing all the supported -# interpolation methods to compute quantile/percentile. +# _QuantileMethods is a dictionary listing all the supported methods to +# compute quantile/percentile. # # Below virtual_index refer to the index of the element where the percentile # would be found in the sorted sample. @@ -61,13 +61,13 @@ __all__ = [ # is made of a integer part (a.k.a 'i' or 'left') and a fractional part # (a.k.a 'g' or 'gamma') # -# Each _QuantileInterpolation have two properties +# Each method in _QuantileMethods has two properties # get_virtual_index : Callable # The function used to compute the virtual_index. # fix_gamma : Callable # A function used for discret methods to force the index to a specific value. -_QuantileInterpolation = dict( - # --- HYNDMAN AND FAN METHODS +_QuantileMethods = dict( + # --- HYNDMAN and FAN METHODS # Discrete methods inverted_cdf=dict( get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles), @@ -3854,7 +3854,7 @@ def _median(a, axis=None, out=None, overwrite_input=False): def _percentile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, - interpolation=None, keepdims=None): + method=None, keepdims=None, *, interpolation=None): return (a, q, out) @@ -3864,8 +3864,10 @@ def percentile(a, axis=None, out=None, overwrite_input=False, - interpolation="linear", - keepdims=False): + method="linear", + keepdims=False, + *, + interpolation=None): """ Compute the q-th percentile of the data along the specified axis. @@ -3893,31 +3895,33 @@ def percentile(a, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : str, optional - This parameter specifies the interpolation method to - use when the desired percentile lies between two data points - There are many different methods, some unique to NumPy. See the - notes for explanation. Options - - * (NPY 1): 'lower' - * (NPY 2): 'higher', - * (NPY 3): 'midpoint' - * (NPY 4): 'nearest' - * (NPY 5): 'linear' - - New options: - - * (H&F 1): 'inverted_cdf' - * (H&F 2): 'averaged_inverted_cdf' - * (H&F 3): 'closest_observation' - * (H&F 4): 'interpolated_inverted_cdf' - * (H&F 5): 'hazen' - * (H&F 6): 'weibull' - * (H&F 7): 'linear' (default) - * (H&F 8): 'median_unbiased' - * (H&F 9): 'normal_unbiased' + method : str, optional + This parameter specifies the method to use for estimating the + percentile. There are many different methods, some unique to NumPy. + See the notes for explanation. The options sorted by their R type + as summarized in the H&F paper [1]_ are: + + 1. 'inverted_cdf' + 2. 'averaged_inverted_cdf' + 3. 'closest_observation' + 4. 'interpolated_inverted_cdf' + 5. 'hazen' + 6. 'weibull' + 7. 'linear' (default) + 8. 'median_unbiased' + 9. 'normal_unbiased' + + The first three methods are discontiuous. NumPy further defines the + following discontinuous variations of the default 'linear' (7.) option: + + * 'lower' + * 'higher', + * 'midpoint' + * 'nearest' .. versionchanged:: 1.22.0 + This argument was previously called "interpolation" and only + offered the "linear" default and last four options. keepdims : bool, optional If this is set to True, the axes which are reduced are left in @@ -3926,6 +3930,11 @@ def percentile(a, .. versionadded:: 1.9.0 + interpolation : str, optional + Deprecated name for the method keyword argument. + + .. deprecated:: 1.22.0 + Returns ------- percentile : scalar or ndarray @@ -3950,16 +3959,16 @@ def percentile(a, Given a vector ``V`` of length ``N``, the q-th percentile of ``V`` is the value ``q/100`` of the way from the minimum to the maximum in a sorted copy of ``V``. The values and distances of the two nearest - neighbors as well as the `interpolation` parameter will determine the + neighbors as well as the `method` parameter will determine the percentile if the normalized ranking does not match the location of ``q`` exactly. This function is the same as the median if ``q=50``, the same as the minimum if ``q=0`` and the same as the maximum if ``q=100``. - This optional `interpolation` parameter specifies the interpolation - method to use when the desired quantile lies between two data points - ``i < j``. If ``g`` is the fractional part of the index surrounded by - ``i`` and alpha and beta are correction constants modifying i and j. + This optional `method` parameter specifies the method to use when the + desired quantile lies between two data points ``i < j``. + If ``g`` is the fractional part of the index surrounded by ``i`` and + alpha and beta are correction constants modifying i and j. Below, 'q' is the quantile value, 'n' is the sample size and alpha and beta are constants. @@ -3970,7 +3979,7 @@ def percentile(a, .. math:: i + g = (q - alpha) / ( n - alpha - beta + 1 ) - The different interpolation methods then work as follows + The different methods then work as follows inverted_cdf: method 1 of H&F [1]_. @@ -4075,7 +4084,7 @@ def percentile(a, array([7., 2.]) >>> assert not np.all(a == b) - The different types of interpolation can be visualized graphically: + The different methods can be visualized graphically: .. plot:: @@ -4085,20 +4094,25 @@ def percentile(a, p = np.linspace(0, 100, 6001) ax = plt.gca() lines = [ - ('linear', None), - ('higher', '--'), - ('lower', '--'), - ('nearest', '-.'), - ('midpoint', '-.'), - ] - for interpolation, style in lines: + ('linear', '-', 'C0'), + ('inverted_cdf', ':', 'C1'), + # Almost the same as `inverted_cdf`: + ('averaged_inverted_cdf', '-.', 'C1'), + ('closest_observation', ':', 'C2'), + ('interpolated_inverted_cdf', '--', 'C1'), + ('hazen', '--', 'C3'), + ('weibull', '-.', 'C4'), + ('median_unbiased', '--', 'C5'), + ('normal_unbiased', '-.', 'C6'), + ] + for method, style, color in lines: ax.plot( - p, np.percentile(a, p, interpolation=interpolation), - label=interpolation, linestyle=style) + p, np.percentile(a, p, method=method), + label=method, linestyle=style, color=color) ax.set( - title='Interpolation methods for list: ' + str(a), + title='Percentiles for different methods and data: ' + str(a), xlabel='Percentile', - ylabel='List item returned', + ylabel='Estimated percentile value', yticks=a) ax.legend() plt.show() @@ -4110,16 +4124,19 @@ def percentile(a, The American Statistician, 50(4), pp. 361-365, 1996 """ + if interpolation is not None: + method = _check_interpolation_as_method( + method, interpolation, "percentile") q = np.true_divide(q, 100) q = asanyarray(q) # undo any decay that the ufunc performed (see gh-13105) if not _quantile_is_valid(q): raise ValueError("Percentiles must be in the range [0, 100]") return _quantile_unchecked( - a, q, axis, out, overwrite_input, interpolation, keepdims) + a, q, axis, out, overwrite_input, method, keepdims) def _quantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, - interpolation=None, keepdims=None): + method=None, keepdims=None, *, interpolation=None): return (a, q, out) @@ -4129,8 +4146,10 @@ def quantile(a, axis=None, out=None, overwrite_input=False, - interpolation="linear", - keepdims=False): + method="linear", + keepdims=False, + *, + interpolation=None): """ Compute the q-th quantile of the data along the specified axis. @@ -4155,37 +4174,44 @@ def quantile(a, intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : str, optional - This parameter specifies the interpolation method to use when the - desired quantile lies between two data points There are many - different methods, some unique to NumPy. See the notes for - explanation. Options: - - * (NPY 1): 'lower' - * (NPY 2): 'higher', - * (NPY 3): 'midpoint' - * (NPY 4): 'nearest' - * (NPY 5): 'linear' - - New options: - - * (H&F 1): 'inverted_cdf' - * (H&F 2): 'averaged_inverted_cdf' - * (H&F 3): 'closest_observation' - * (H&F 4): 'interpolated_inverted_cdf' - * (H&F 5): 'hazen' - * (H&F 6): 'weibull' - * (H&F 7): 'linear' (default) - * (H&F 8): 'median_unbiased' - * (H&F 9): 'normal_unbiased' - - .. versionadded:: 1.22.0 + method : str, optional + This parameter specifies the method to use for estimating the + quantile. There are many different methods, some unique to NumPy. + See the notes for explanation. The options sorted by their R type + as summarized in the H&F paper [1]_ are: + + 1. 'inverted_cdf' + 2. 'averaged_inverted_cdf' + 3. 'closest_observation' + 4. 'interpolated_inverted_cdf' + 5. 'hazen' + 6. 'weibull' + 7. 'linear' (default) + 8. 'median_unbiased' + 9. 'normal_unbiased' + + The first three methods are discontiuous. NumPy further defines the + following discontinuous variations of the default 'linear' (7.) option: + + * 'lower' + * 'higher', + * 'midpoint' + * 'nearest' + + .. versionchanged:: 1.22.0 + This argument was previously called "interpolation" and only + offered the "linear" default and last four options. keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the result will broadcast correctly against the original array `a`. + interpolation : str, optional + Deprecated name for the method keyword argument. + + .. deprecated:: 1.22.0 + Returns ------- quantile : scalar or ndarray @@ -4210,20 +4236,20 @@ def quantile(a, Given a vector ``V`` of length ``N``, the q-th quantile of ``V`` is the value ``q`` of the way from the minimum to the maximum in a sorted copy of ``V``. The values and distances of the two nearest neighbors as well as the - `interpolation` parameter will determine the quantile if the normalized + `method` parameter will determine the quantile if the normalized ranking does not match the location of ``q`` exactly. This function is the same as the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the same as the maximum if ``q=1.0``. - This optional `interpolation` parameter specifies the interpolation method - to use when the desired quantile lies between two data points ``i < j``. If - ``g`` is the fractional part of the index surrounded by ``i`` and alpha - and beta are correction constants modifying i and j. + This optional `method` parameter specifies the method to use when the + desired quantile lies between two data points ``i < j``. + If ``g`` is the fractional part of the index surrounded by ``i`` and + alpha and beta are correction constants modifying i and j. .. math:: i + g = (q - alpha) / ( n - alpha - beta + 1 ) - The different interpolation methods then work as follows + The different methods then work as follows inverted_cdf: method 1 of H&F [1]_. @@ -4326,6 +4352,8 @@ def quantile(a, array([7., 2.]) >>> assert not np.all(a == b) + See also `numpy.percentile` for a visualization of most methods. + References ---------- .. [1] R. J. Hyndman and Y. Fan, @@ -4333,11 +4361,15 @@ def quantile(a, The American Statistician, 50(4), pp. 361-365, 1996 """ + if interpolation is not None: + method = _check_interpolation_as_method( + method, interpolation, "quantile") + q = np.asanyarray(q) if not _quantile_is_valid(q): raise ValueError("Quantiles must be in the range [0, 1]") return _quantile_unchecked( - a, q, axis, out, overwrite_input, interpolation, keepdims) + a, q, axis, out, overwrite_input, method, keepdims) def _quantile_unchecked(a, @@ -4345,7 +4377,7 @@ def _quantile_unchecked(a, axis=None, out=None, overwrite_input=False, - interpolation="linear", + method="linear", keepdims=False): """Assumes that q is in [0, 1], and is an ndarray""" r, k = _ureduce(a, @@ -4354,7 +4386,7 @@ def _quantile_unchecked(a, axis=axis, out=out, overwrite_input=overwrite_input, - interpolation=interpolation) + method=method) if keepdims: return r.reshape(q.shape + k) else: @@ -4373,6 +4405,23 @@ def _quantile_is_valid(q): return True +def _check_interpolation_as_method(method, interpolation, fname): + # Deprecated NumPy 1.22, 2021-11-08 + warnings.warn( + f"the `interpolation=` argument to {fname} was renamed to " + "`method=`, which has additional options.\n" + "Users of the modes 'nearest', 'lower', 'higher', or " + "'midpoint' are encouraged to review the method they. " + "(Deprecated NumPy 1.22)", + DeprecationWarning, stacklevel=4) + if method != "linear": + # sanity check, we assume this basically never happens + raise TypeError( + "You shall not pass both `method` and `interpolation`!\n" + "(`interpolation` is Deprecated in favor of `method`)") + return interpolation + + def _compute_virtual_index(n, quantiles, alpha: float, beta: float): """ Compute the floating point indexes of an array for the linear @@ -4398,9 +4447,7 @@ def _compute_virtual_index(n, quantiles, alpha: float, beta: float): ) - 1 -def _get_gamma(virtual_indexes, - previous_indexes, - interpolation: _QuantileInterpolation): +def _get_gamma(virtual_indexes, previous_indexes, method): """ Compute gamma (a.k.a 'm' or 'weight') for the linear interpolation of quantiles. @@ -4410,7 +4457,7 @@ def _get_gamma(virtual_indexes, sample. previous_indexes : array_like The floor values of virtual_indexes. - interpolation : _QuantileInterpolation + interpolation : dict The interpolation method chosen, which may have a specific rule modifying gamma. @@ -4418,7 +4465,7 @@ def _get_gamma(virtual_indexes, by the interpolation method. """ gamma = np.asanyarray(virtual_indexes - previous_indexes) - gamma = interpolation["fix_gamma"](gamma, virtual_indexes) + gamma = method["fix_gamma"](gamma, virtual_indexes) return np.asanyarray(gamma) @@ -4447,7 +4494,7 @@ def _lerp(a, b, t, out=None): def _get_gamma_mask(shape, default_value, conditioned_value, where): out = np.full(shape, default_value) - out[where] = conditioned_value + np.copyto(out, conditioned_value, where=where, casting="unsafe") return out @@ -4455,11 +4502,14 @@ def _discret_interpolation_to_boundaries(index, gamma_condition_fun): previous = np.floor(index) next = previous + 1 gamma = index - previous - return _get_gamma_mask(shape=index.shape, - default_value=next, - conditioned_value=previous, - where=gamma_condition_fun(gamma, index) - ).astype(np.intp) + res = _get_gamma_mask(shape=index.shape, + default_value=next, + conditioned_value=previous, + where=gamma_condition_fun(gamma, index) + ).astype(np.intp) + # Some methods can lead to out-of-bound integers, clip them: + res[res < 0] = 0 + return res def _closest_observation(n, quantiles): @@ -4480,7 +4530,7 @@ def _quantile_ureduce_func( axis: int = None, out=None, overwrite_input: bool = False, - interpolation="linear", + method="linear", ) -> np.array: if q.ndim > 2: # The code below works fine for nd, but it might not have useful @@ -4502,7 +4552,7 @@ def _quantile_ureduce_func( result = _quantile(arr, quantiles=q, axis=axis, - interpolation=interpolation, + method=method, out=out) return result @@ -4546,7 +4596,7 @@ def _quantile( arr: np.array, quantiles: np.array, axis: int = -1, - interpolation="linear", + method="linear", out=None, ): """ @@ -4556,8 +4606,8 @@ def _quantile( It computes the quantiles of the array for the given axis. A linear interpolation is performed based on the `interpolation`. - By default, the interpolation is "linear" where - alpha == beta == 1 which performs the 7th method of Hyndman&Fan. + By default, the method is "linear" where alpha == beta == 1 which + performs the 7th method of Hyndman&Fan. With "median_unbiased" we get alpha == beta == 1/3 thus the 8th method of Hyndman&Fan. """ @@ -4574,13 +4624,12 @@ def _quantile( # Virtual because it is a floating point value, not an valid index. # The nearest neighbours are used for interpolation try: - interpolation = _QuantileInterpolation[interpolation] + method = _QuantileMethods[method] except KeyError: raise ValueError( - f"{interpolation!r} is not a valid interpolation. Use one of: " - f"{_QuantileInterpolation.keys()}") from None - virtual_indexes = interpolation["get_virtual_index"](values_count, - quantiles) + f"{method!r} is not a valid method. Use one of: " + f"{_QuantileMethods.keys()}") from None + virtual_indexes = method["get_virtual_index"](values_count, quantiles) virtual_indexes = np.asanyarray(virtual_indexes) if np.issubdtype(virtual_indexes.dtype, np.integer): # No interpolation needed, take the points along axis @@ -4614,9 +4663,7 @@ def _quantile( previous = np.take(arr, previous_indexes, axis=DATA_AXIS) next = np.take(arr, next_indexes, axis=DATA_AXIS) # --- Linear interpolation - gamma = _get_gamma(virtual_indexes, - previous_indexes, - interpolation) + gamma = _get_gamma(virtual_indexes, previous_indexes, method) result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) gamma = gamma.reshape(result_shape) result = _lerp(previous, |