From cd7a02a4db7e760b881f3feeb832ffd84fa8645a Mon Sep 17 00:00:00 2001 From: abel Date: Thu, 2 Sep 2021 16:34:42 +0200 Subject: MAINT, ENH [#10736] Add interpolation methods to quantile - Added the missing linear interpolation methods. - Updated the existing unit tests. - Added pytest.mark.xfail for boolean arrays See - https://github.com/numpy/numpy/pull/19857#issuecomment-919258693 - https://github.com/numpy/numpy/issues/19154 --- numpy/lib/function_base.py | 593 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 469 insertions(+), 124 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 20e32a78d..67a34f6e1 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -52,6 +52,89 @@ __all__ = [ ] +_QuantileInterpolation = dict( + # --- HYNDMAN and FAN methods + # Discrete methods + inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles), + fix_gamma=lambda gamma, _: gamma, # should never be called + ), + averaged_inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: (n * quantiles) - 1, + fix_gamma=lambda gamma, _: _get_gamma_mask( + shape=gamma.shape, + default_value=1., + conditioned_value=0.5, + where=gamma == 0), + ), + closest_observation=dict( + get_virtual_index=lambda n, quantiles: _closest_observation(n, + quantiles), + fix_gamma=lambda gamma, _: gamma, # should never be called + ), + # Continuous methods + interpolated_inverted_cdf=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0, 1), + fix_gamma=lambda gamma, _: gamma, + ), + hazen=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0.5, 0.5), + fix_gamma=lambda gamma, _: gamma, + ), + weibull=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 0, 0), + fix_gamma=lambda gamma, _: gamma, + ), + # Default value + linear=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 1, 1), + fix_gamma=lambda gamma, _: gamma, + ), + median_unbiased=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 1 / 3.0, 1 / 3.0), + fix_gamma=lambda gamma, _: gamma, + ), + normal_unbiased=dict( + get_virtual_index=lambda n, quantiles: + _virtual_index_formula(n, quantiles, 3 / 8.0, 3 / 8.0), + fix_gamma=lambda gamma, _: gamma, + ), + # --- OTHER METHODS fixme add deprecated ? + lower=dict( + get_virtual_index=lambda n, quantiles: np.floor( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + ), + higher=dict( + get_virtual_index=lambda n, quantiles: np.ceil( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + ), + midpoint=dict( + get_virtual_index=lambda n, quantiles: 0.5 * ( + np.floor((n - 1) * quantiles) + + np.ceil((n - 1) * quantiles)), + fix_gamma=lambda gamma, index: _get_gamma_mask( + shape=gamma.shape, + default_value=0.5, + conditioned_value=0., + where=index % 1 == 0), + ), + nearest=dict( + get_virtual_index=lambda n, quantiles: np.around( + (n - 1) * quantiles).astype(np.intp), + fix_gamma=lambda gamma, _: gamma, + # should never be called, index dtype is int + )) + + def _rot90_dispatcher(m, k=None, axes=None): return (m,) @@ -3760,8 +3843,13 @@ def _percentile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_percentile_dispatcher) -def percentile(a, q, axis=None, out=None, - overwrite_input=False, interpolation='linear', keepdims=False): +def percentile(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """ Compute the q-th percentile of the data along the specified axis. @@ -3790,20 +3878,75 @@ def percentile(a, q, axis=None, out=None, calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired percentile lies between two data points - ``i < j``: + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * inclusive: + Default method, aliased with "linear". + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. - * 'linear': ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. - * 'lower': ``i``. - * 'higher': ``j``. - * 'nearest': ``i`` or ``j``, whichever is nearest. - * 'midpoint': ``(i + j) / 2``. - - .. versionadded:: 1.9.0 keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -3897,6 +4040,11 @@ def percentile(a, q, axis=None, out=None, ax.legend() plt.show() + References + ---------- + [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). + Sample quantiles in statistical packages. + The American Statistician, 50(4), 361-365. """ q = np.true_divide(q, 100) q = asanyarray(q) # undo any decay that the ufunc performed (see gh-13105) @@ -3912,8 +4060,13 @@ def _quantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @array_function_dispatch(_quantile_dispatcher) -def quantile(a, q, axis=None, out=None, - overwrite_input=False, interpolation='linear', keepdims=False): +def quantile(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """ Compute the q-th quantile of the data along the specified axis. @@ -3938,18 +4091,75 @@ def quantile(a, q, axis=None, out=None, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + interpolation : str + Possible values: 'linear' (default), + 'inverted_cdf', 'averaged_inverted_cdf', + 'closest_observation', 'interpolated_inverted_cdf', + 'hazen', 'weibull', + 'median_unbiased', 'normal_unbiased', + 'lower', 'higher', + 'midpoint', 'nearest'. This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points - ``i < j``: - - * linear: ``i + (j - i) * fraction``, where ``fraction`` - is the fractional part of the index surrounded by ``i`` - and ``j``. - * lower: ``i``. - * higher: ``j``. - * nearest: ``i`` or ``j``, whichever is nearest. - * midpoint: ``(i + j) / 2``. + use when the desired quantile lies between two data points ``i < j``. + g is the fractional part of the index surrounded by ``i``. + alpha and beta are correction constants modifying i and j: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + * inverted_cdf: + method 1 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then take i + * averaged_inverted_cdf: + method 2 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 ; then average between bounds + * closest_observation: + method 3 of H&F. + This method give discontinuous results: + if g > 0 ; then take j + if g = 0 and index is odd ; then take j + if g = 0 and index is even ; then take i + * interpolated_inverted_cdf: + method 4 of H&F. + This method give continuous results using: + alpha = 0 + beta = 1 + * hazen: + method 5 of H&F. + This method give continuous results using: + alpha = 1/2 + beta = 1/2 + * weibull: + method 6 of H&F. + This method give continuous results using: + alpha = 0 + beta = 0 + * linear: + Default method. + method 7 of H&F. + This method give continuous results using: + alpha = 1 + beta = 1 + * median_unbiased: + method 8 of H&F. + This method is probably the best method if the sample distribution + function is unknown (see reference). + This method give continuous results using: + alpha = 1/3 + beta = 1/3 + * normal_unbiased: + method 9 of H&F. + This method is probably the best method if the sample distribution + function is known to be normal. + This method give continuous results using: + alpha = 3/8 + beta = 3/8 + * lower: ``i``. + * higher: ``j``. + * nearest: ``i`` or ``j``, whichever is nearest. + * midpoint: ``(i + j) / 2``. + keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the @@ -4010,6 +4220,12 @@ def quantile(a, q, axis=None, out=None, >>> np.quantile(b, 0.5, axis=1, overwrite_input=True) array([7., 2.]) >>> assert not np.all(a == b) + + References + ---------- + [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). + Sample quantiles in statistical packages. + The American Statistician, 50(4), 361-365. """ q = np.asanyarray(q) if not _quantile_is_valid(q): @@ -4018,10 +4234,19 @@ def quantile(a, q, axis=None, out=None, a, q, axis, out, overwrite_input, interpolation, keepdims) -def _quantile_unchecked(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=False): +def _quantile_unchecked(a, + q, + axis=None, + out=None, + overwrite_input=False, + interpolation="linear", + keepdims=False): """Assumes that q is in [0, 1], and is an ndarray""" - r, k = _ureduce(a, func=_quantile_ureduce_func, q=q, axis=axis, out=out, + r, k = _ureduce(a, + func=_quantiles_ureduce_func, + q=q, + axis=axis, + out=out, overwrite_input=overwrite_input, interpolation=interpolation) if keepdims: @@ -4042,122 +4267,242 @@ def _quantile_is_valid(q): return True -def _lerp(a, b, t, out=None): - """ Linearly interpolate from a to b by a factor of t """ - diff_b_a = subtract(b, a) - # asanyarray is a stop-gap until gh-13105 - lerp_interpolation = asanyarray(add(a, diff_b_a*t, out=out)) - subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t>=0.5) - if lerp_interpolation.ndim == 0 and out is None: - lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays - return lerp_interpolation +def _virtual_index_formula(n: np.array, + quantiles: np.array, + alpha: float, + beta: float) -> np.array: + """ + Compute the floating point indexes of an array for the linear + interpolation of quantiles. + Reference: + Hyndman&Fan paper "Sample Quantiles in Statistical Packages", + DOI: 10.1080/00031305.1996.10473566 + """ + return n * quantiles + ( + alpha + quantiles * (1 - alpha - beta) + ) - 1 -def _quantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, - interpolation='linear', keepdims=False): - a = asarray(a) +def _get_gamma(virtual_indexes: np.array, + previous_indexes: np.array, + interpolation: _QuantileInterpolation) -> np.array: + """ + Compute the gamma (a.k.a 'm' or weight) for the linear interpolation + of quantiles. + When gamma == 0 the left bound will be picked, + When gamma == 1 the right bound will be. + """ + gamma = np.asanyarray(virtual_indexes - previous_indexes) + gamma = interpolation["fix_gamma"](gamma, virtual_indexes) + return np.asanyarray(gamma) - # ufuncs cause 0d array results to decay to scalars (see gh-13105), which - # makes them problematic for __setitem__ and attribute access. As a - # workaround, we call this on the result of every ufunc on a possibly-0d - # array. - not_scalar = np.asanyarray - # prepare a for partitioning - if overwrite_input: - if axis is None: - ap = a.ravel() - else: - ap = a - else: - if axis is None: - ap = a.flatten() - else: - ap = a.copy() +def _linear_interpolation_formula( + left: np.array, right: np.array, gamma: np.array, out: np.array = None +) -> np.array: + """ + Compute the linear interpolation weighted by gamma on each point of + two same shape array. + """ + # Equivalent to gamma * right + (1 - gamma) * left + # see gh-14685 + diff_right_left = subtract(right, left) + result = asanyarray(add(left, diff_right_left * gamma, out=out)) + result = subtract(right, + diff_right_left * (1 - gamma), + out=result, + where=gamma >= 0.5) + return result - if axis is None: - axis = 0 +def _get_gamma_mask(shape, default_value, conditioned_value, where): + out = np.full(shape, default_value) + out[where] = conditioned_value + return out + + +def _discret_interpolation_to_boundaries(index, gamma_condition_fun): + previous = np.floor(index) + next = previous + 1 + gamma = index - previous + return _get_gamma_mask(shape=index.shape, + default_value=next, + conditioned_value=previous, + where=gamma_condition_fun(gamma, index) + ).astype(np.intp) + + +def _closest_observation(n, quantiles): + gamma_fun = lambda gamma, index: (gamma == 0) & (np.floor(index) % 2 == 0) + return _discret_interpolation_to_boundaries((n * quantiles) - 1 - 0.5, + gamma_fun) + + +def _inverted_cdf(n, quantiles): + gamma_fun = lambda gamma, _: (gamma == 0) + return _discret_interpolation_to_boundaries((n * quantiles) - 1, + gamma_fun) + + +def _quantiles_ureduce_func( + a: np.array, + q: np.array, + axis: int = None, + out=None, + overwrite_input: bool = False, + interpolation="linear", +) -> np.array: if q.ndim > 2: # The code below works fine for nd, but it might not have useful # semantics. For now, keep the supported dimensions the same as it was # before. raise ValueError("q must be a scalar or 1d") - - Nx = ap.shape[axis] - indices = not_scalar(q * (Nx - 1)) - # round fractional indices according to interpolation method - if interpolation == 'lower': - indices = floor(indices).astype(intp) - elif interpolation == 'higher': - indices = ceil(indices).astype(intp) - elif interpolation == 'midpoint': - indices = 0.5 * (floor(indices) + ceil(indices)) - elif interpolation == 'nearest': - indices = around(indices).astype(intp) - elif interpolation == 'linear': - pass # keep index as fraction and interpolate + if overwrite_input: + if axis is None: + axis = 0 + arr = a.ravel() + else: + arr = a else: - raise ValueError( - "interpolation can only be 'linear', 'lower' 'higher', " - "'midpoint', or 'nearest'") + if axis is None: + axis = 0 + arr = a.flatten() + else: + arr = a.copy() + result = _quantile(arr, + quantiles=q, + axis=axis, + interpolation=interpolation, + out=out) + if result.ndim == 0 and out is None: + result = result[()] # unpack 0d arrays + elif result.size == 1 and out is None and q.ndim == 0: + result = result[0] + return result - # The dimensions of `q` are prepended to the output shape, so we need the - # axis being sampled from `ap` to be first. - ap = np.moveaxis(ap, axis, 0) - del axis - if np.issubdtype(indices.dtype, np.integer): - # take the points along axis +def _get_indexes(arr, virtual_indexes, valid_values_count): + """ + Get the valid indexes of arr neighbouring virtual_indexes. + Note + This is a companion function to linear interpolation of + Quantiles - if np.issubdtype(a.dtype, np.inexact): + Returns + ------- + (previous_indexes, next_indexes): Tuple + A Tuple of virtual_indexes neighbouring indexes + """ + previous_indexes = np.asanyarray(np.floor(virtual_indexes)) + next_indexes = np.asanyarray(previous_indexes + 1) + indexes_above_bounds = virtual_indexes >= valid_values_count - 1 + # When indexes is above max index, take the max value of the array + if indexes_above_bounds.any(): + previous_indexes[indexes_above_bounds] = -1 + next_indexes[indexes_above_bounds] = -1 + # When indexes is below min index, take the min value of the array + indexes_below_bounds = virtual_indexes < 0 + if indexes_below_bounds.any(): + previous_indexes[indexes_below_bounds] = 0 + next_indexes[indexes_below_bounds] = 0 + if np.issubdtype(arr.dtype, np.inexact): + # After the sort, slices having NaNs will have for last element a NaN + virtual_indexes_nans = np.isnan(virtual_indexes) + if virtual_indexes_nans.any(): + previous_indexes[virtual_indexes_nans] = -1 + next_indexes[virtual_indexes_nans] = -1 + previous_indexes = previous_indexes.astype(np.intp) + next_indexes = next_indexes.astype(np.intp) + return previous_indexes, next_indexes + + +def _quantile( + arr: np.array, + quantiles: np.array, + axis: int = -1, + interpolation="linear", + out=None, +): + """ + Private function that doesn't support extended axis or keepdims. + These methods are extended to this function using _ureduce + See nanpercentile for parameter usage + It computes the quantiles of the array for the given axis. + A linear interpolation is performed based on the `interpolation`. + + By default, the interpolation is "linear" where + alpha == beta == 1 which performs the 7th method of Hyndman&Fan. + With "median_unbiased" we get alpha == beta == 1/3 + thus the 8th method of Hyndman&Fan. + """ + # --- Setup + arr = np.asanyarray(arr) + values_count = arr.shape[axis] + # The dimensions of `q` are prepended to the output shape, so we need the + # axis being sampled from `arr` to be last. + DATA_AXIS = 0 + arr = np.moveaxis(arr, axis, destination=DATA_AXIS) + # --- Computation of indexes + # Index where to find the value in the sorted array. + # Virtual because it is a floating point value, not an valid index. + # The nearest neighbours are used for interpolation + try: + interpolation = _QuantileInterpolation[interpolation] + except KeyError: + raise ValueError( + f"{interpolation!r} is not a valid interpolation. Use one of: " + f"{_QuantileInterpolation.keys()}") + virtual_indexes = interpolation["get_virtual_index"](values_count, + quantiles) + virtual_indexes = np.asanyarray(virtual_indexes) + if np.issubdtype(virtual_indexes.dtype, np.integer): + # No interpolation needed, take the points along axis + if np.issubdtype(arr.dtype, np.inexact): # may contain nan, which would sort to the end - ap.partition(concatenate((indices.ravel(), [-1])), axis=0) - n = np.isnan(ap[-1]) + arr.partition(concatenate((virtual_indexes.ravel(), [-1])), axis=0) + slices_having_nans = np.isnan(arr[-1]) else: # cannot contain nan - ap.partition(indices.ravel(), axis=0) - n = np.array(False, dtype=bool) - - r = take(ap, indices, axis=0, out=out) - + arr.partition(virtual_indexes.ravel(), axis=0) + slices_having_nans = np.array(False, dtype=bool) + result = np.asanyarray(take(arr, virtual_indexes, axis=0, out=out)) else: - # weight the points above and below the indices - - indices_below = not_scalar(floor(indices)).astype(intp) - indices_above = not_scalar(indices_below + 1) - indices_above[indices_above > Nx - 1] = Nx - 1 - - if np.issubdtype(a.dtype, np.inexact): - # may contain nan, which would sort to the end - ap.partition(concatenate(( - indices_below.ravel(), indices_above.ravel(), [-1] - )), axis=0) - n = np.isnan(ap[-1]) + previous_indexes, next_indexes = _get_indexes(arr, + virtual_indexes, + values_count) + # --- Sorting + arr.partition( + np.unique(np.concatenate(([0, -1], + previous_indexes.ravel(), + next_indexes.ravel(), + ))), + axis=DATA_AXIS) + if np.issubdtype(arr.dtype, np.inexact): + slices_having_nans = np.isnan( + take(arr, indices=-1, axis=DATA_AXIS) + ) else: - # cannot contain nan - ap.partition(concatenate(( - indices_below.ravel(), indices_above.ravel() - )), axis=0) - n = np.array(False, dtype=bool) - - weights_shape = indices.shape + (1,) * (ap.ndim - 1) - weights_above = not_scalar(indices - indices_below).reshape(weights_shape) - - x_below = take(ap, indices_below, axis=0) - x_above = take(ap, indices_above, axis=0) - - r = _lerp(x_below, x_above, weights_above, out=out) - - # if any slice contained a nan, then all results on that slice are also nan - if np.any(n): - if r.ndim == 0 and out is None: + slices_having_nans = None + # --- Get values from indexes + previous = np.take(arr, previous_indexes, axis=DATA_AXIS) + next = np.take(arr, next_indexes, axis=DATA_AXIS) + # --- Linear interpolation + gamma = _get_gamma(virtual_indexes, + previous_indexes, + interpolation) + result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) + gamma = gamma.reshape(result_shape) + result = _linear_interpolation_formula(previous, + next, + gamma, + out=out) + if np.any(slices_having_nans): + if result.ndim == 0 and out is None: # can't write to a scalar - r = a.dtype.type(np.nan) + result = np.array(np.nan, dtype=arr.dtype) else: - r[..., n] = a.dtype.type(np.nan) - - return r + result[..., slices_having_nans] = np.nan + return result def _trapz_dispatcher(y, x=None, dx=None, axis=None): -- cgit v1.2.1 From 303c12cfe7ad1b8b6ed5417c126857b29355b1fb Mon Sep 17 00:00:00 2001 From: Charles Harris Date: Fri, 8 Oct 2021 17:09:51 -0600 Subject: DOC: fix docstrings. Hopefully fix the docstrings of percentile, nanpercentile, quantile, and nanquantile so that CircleCI passes. --- numpy/lib/function_base.py | 424 +++++++++++++++++++++++++++------------------ 1 file changed, 255 insertions(+), 169 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 67a34f6e1..dbaba87f9 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3877,75 +3877,31 @@ def percentile(a, If True, then allow the input array `a` to be modified by intermediate calculations, to save memory. In this case, the contents of the input `a` after this function completes is undefined. - - interpolation : str - Possible values: 'linear' (default), - 'inverted_cdf', 'averaged_inverted_cdf', - 'closest_observation', 'interpolated_inverted_cdf', - 'hazen', 'weibull', - 'median_unbiased', 'normal_unbiased', - 'lower', 'higher', - 'midpoint', 'nearest'. - This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points ``i < j``. - g is the fractional part of the index surrounded by ``i``. - alpha and beta are correction constants modifying i and j: - i + g = (q - alpha) / ( n - alpha - beta + 1 ) - * inverted_cdf: - method 1 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 ; then take i - * averaged_inverted_cdf: - method 2 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 ; then average between bounds - * closest_observation: - method 3 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 and index is odd ; then take j - if g = 0 and index is even ; then take i - * interpolated_inverted_cdf: - method 4 of H&F. - This method give continuous results using: - alpha = 0 - beta = 1 - * hazen: - method 5 of H&F. - This method give continuous results using: - alpha = 1/2 - beta = 1/2 - * weibull: - method 6 of H&F. - This method give continuous results using: - alpha = 0 - beta = 0 - * inclusive: - Default method, aliased with "linear". - method 7 of H&F. - This method give continuous results using: - alpha = 1 - beta = 1 - * median_unbiased: - method 8 of H&F. - This method is probably the best method if the sample distribution - function is unknown (see reference). - This method give continuous results using: - alpha = 1/3 - beta = 1/3 - * normal_unbiased: - method 9 of H&F. - This method is probably the best method if the sample distribution - function is known to be normal. - This method give continuous results using: - alpha = 3/8 - beta = 3/8 - * lower: ``i``. - * higher: ``j``. - * nearest: ``i`` or ``j``, whichever is nearest. - * midpoint: ``(i + j) / 2``. + interpolation : str, optional + This parameter specifies the interpolation method to + use when the desired quantile lies between two data points + There are many different methods, some unique to NumPy. See the + notes for explanation. Options + + * (NPY 1): 'lower' + * (NPY 2): 'higher', + * (NPY 3): 'midpoint' + * (NPY 4): 'nearest' + * (NPY 5): 'linear', aliased with 'inclusive' (default) + + New options: + + * (H&F 1): 'inverted_cdf' + * (H&F 2): 'averaged_inverted_cdf' + * (H&F 3): 'closest_observation' + * (H&F 4): 'interpolated_inverted_cdf' + * (H&F 5): 'hazen' + * (H&F 6): 'weibull' + * (H&F 7): 'inclusive', aliased with 'linear' (default) + * (H&F 8): 'median_unbiased' + * (H&F 9): 'normal_unbiased' + + .. versionadded;: 1.22.0 keepdims : bool, optional If this is set to True, the axes which are reduced are left in @@ -3971,18 +3927,104 @@ def percentile(a, mean median : equivalent to ``percentile(..., 50)`` nanpercentile - quantile : equivalent to percentile, except with q in the range [0, 1]. + quantile : equivalent to percentile, except q in the range [0, 1]. Notes ----- - Given a vector ``V`` of length ``N``, the q-th percentile of - ``V`` is the value ``q/100`` of the way from the minimum to the - maximum in a sorted copy of ``V``. The values and distances of - the two nearest neighbors as well as the `interpolation` parameter - will determine the percentile if the normalized ranking does not - match the location of ``q`` exactly. This function is the same as - the median if ``q=50``, the same as the minimum if ``q=0`` and the - same as the maximum if ``q=100``. + Given a vector ``V`` of length ``N``, the q-th percentile of ``V`` is + the value ``q/100`` of the way from the minimum to the maximum in a + sorted copy of ``V``. The values and distances of the two nearest + neighbors as well as the `interpolation` parameter will determine the + percentile if the normalized ranking does not match the location of + ``q`` exactly. This function is the same as the median if ``q=50``, the + same as the minimum if ``q=0`` and the same as the maximum if + ``q=100``. + + This optional `interpolation` parameter specifies the interpolation + method to use when the desired quantile lies between two data points + ``i < j``. If ``g`` is the fractional part of the index surrounded by + ``i`` and alpha and beta are correction constants modifying i and j. + + .. math:: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + + The different interpolation methods then work as follows + + inverted_cdf: + method 1 of H&F [1]_. + This method gives discontinuous results: + * if g > 0 ; then take j + * if g = 0 ; then take i + + averaged_inverted_cdf: + method 2 of H&F [1]_. + This method give discontinuous results: + * if g > 0 ; then take j + * if g = 0 ; then average between bounds + + closest_observation: + method 3 of H&F [1]_. + This method give discontinuous results: + * if g > 0 ; then take j + * if g = 0 and index is odd ; then take j + * if g = 0 and index is even ; then take i + + interpolated_inverted_cdf: + method 4 of H&F [1]_. + This method give continuous results using: + * alpha = 0 + * beta = 1 + + hazen: + method 5 of H&F [1]_. + This method give continuous results using: + * alpha = 1/2 + * beta = 1/2 + + weibull: + method 6 of H&F [1]_. + This method give continuous results using: + * alpha = 0 + * beta = 0 + + inclusive: + Default method, aliased with "linear". + method 7 of H&F [1]_. + This method give continuous results using: + * alpha = 1 + * beta = 1 + + median_unbiased: + method 8 of H&F [1]_. + This method is probably the best method if the sample + distribution function is unknown (see reference). + This method give continuous results using: + * alpha = 1/3 + * beta = 1/3 + + normal_unbiased: + method 9 of H&F [1]_. + This method is probably the best method if the sample + distribution function is known to be normal. + This method give continuous results using: + * alpha = 3/8 + * beta = 3/8 + + lower: + NumPy method kept for backwards compatibility. + Takes ``i`` as the interpolation point. + + higher: + NumPy method kept for backwards compatibility. + Takes ``j`` as the interpolation point. + + nearest: + NumPy method kept for backwards compatibility. + Takes ``i`` or ``j``, whichever is nearest. + + midpoint: + NumPy method kept for backwards compatibility. + Uses ``(i + j) / 2``. Examples -------- @@ -4042,9 +4084,10 @@ def percentile(a, References ---------- - [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). - Sample quantiles in statistical packages. - The American Statistician, 50(4), 361-365. + .. [1] R. J. Hyndman and Y. Fan, + "Sample quantiles in statistical packages," + The American Statistician, 50(4), pp. 361-365, 1996 + """ q = np.true_divide(q, 100) q = asanyarray(q) # undo any decay that the ufunc performed (see gh-13105) @@ -4080,85 +4123,42 @@ def quantile(a, Quantile or sequence of quantiles to compute, which must be between 0 and 1 inclusive. axis : {int, tuple of int, None}, optional - Axis or axes along which the quantiles are computed. The - default is to compute the quantile(s) along a flattened - version of the array. + Axis or axes along which the quantiles are computed. The default is + to compute the quantile(s) along a flattened version of the array. out : ndarray, optional - Alternative output array in which to place the result. It must - have the same shape and buffer length as the expected output, - but the type (of the output) will be cast if necessary. + Alternative output array in which to place the result. It must have + the same shape and buffer length as the expected output, but the + type (of the output) will be cast if necessary. overwrite_input : bool, optional - If True, then allow the input array `a` to be modified by intermediate - calculations, to save memory. In this case, the contents of the input - `a` after this function completes is undefined. - interpolation : str - Possible values: 'linear' (default), - 'inverted_cdf', 'averaged_inverted_cdf', - 'closest_observation', 'interpolated_inverted_cdf', - 'hazen', 'weibull', - 'median_unbiased', 'normal_unbiased', - 'lower', 'higher', - 'midpoint', 'nearest'. - This optional parameter specifies the interpolation method to - use when the desired quantile lies between two data points ``i < j``. - g is the fractional part of the index surrounded by ``i``. - alpha and beta are correction constants modifying i and j: - i + g = (q - alpha) / ( n - alpha - beta + 1 ) - * inverted_cdf: - method 1 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 ; then take i - * averaged_inverted_cdf: - method 2 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 ; then average between bounds - * closest_observation: - method 3 of H&F. - This method give discontinuous results: - if g > 0 ; then take j - if g = 0 and index is odd ; then take j - if g = 0 and index is even ; then take i - * interpolated_inverted_cdf: - method 4 of H&F. - This method give continuous results using: - alpha = 0 - beta = 1 - * hazen: - method 5 of H&F. - This method give continuous results using: - alpha = 1/2 - beta = 1/2 - * weibull: - method 6 of H&F. - This method give continuous results using: - alpha = 0 - beta = 0 - * linear: - Default method. - method 7 of H&F. - This method give continuous results using: - alpha = 1 - beta = 1 - * median_unbiased: - method 8 of H&F. - This method is probably the best method if the sample distribution - function is unknown (see reference). - This method give continuous results using: - alpha = 1/3 - beta = 1/3 - * normal_unbiased: - method 9 of H&F. - This method is probably the best method if the sample distribution - function is known to be normal. - This method give continuous results using: - alpha = 3/8 - beta = 3/8 - * lower: ``i``. - * higher: ``j``. - * nearest: ``i`` or ``j``, whichever is nearest. - * midpoint: ``(i + j) / 2``. + If True, then allow the input array `a` to be modified by + intermediate calculations, to save memory. In this case, the + contents of the input `a` after this function completes is + undefined. + interpolation : str, optional + This parameter specifies the interpolation method to use when the + desired quantile lies between two data points There are many + different methods, some unique to NumPy. See the notes for + explanation. Options: + + * (NPY 1): 'lower' + * (NPY 2): 'higher', + * (NPY 3): 'midpoint' + * (NPY 4): 'nearest' + * (NPY 5): 'linear', aliased with 'inclusive' (default) + + New options: + + * (H&F 1): 'inverted_cdf' + * (H&F 2): 'averaged_inverted_cdf' + * (H&F 3): 'closest_observation' + * (H&F 4): 'interpolated_inverted_cdf' + * (H&F 5): 'hazen' + * (H&F 6): 'weibull' + * (H&F 7): 'inclusive', aliased with 'linear' (default) + * (H&F 8): 'median_unbiased' + * (H&F 9): 'normal_unbiased' + + .. versionadded:: 1.22.0 keepdims : bool, optional If this is set to True, the axes which are reduced are left in @@ -4186,14 +4186,99 @@ def quantile(a, Notes ----- - Given a vector ``V`` of length ``N``, the q-th quantile of - ``V`` is the value ``q`` of the way from the minimum to the - maximum in a sorted copy of ``V``. The values and distances of - the two nearest neighbors as well as the `interpolation` parameter - will determine the quantile if the normalized ranking does not - match the location of ``q`` exactly. This function is the same as - the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the - same as the maximum if ``q=1.0``. + Given a vector ``V`` of length ``N``, the q-th quantile of ``V`` is the + value ``q`` of the way from the minimum to the maximum in a sorted copy of + ``V``. The values and distances of the two nearest neighbors as well as the + `interpolation` parameter will determine the quantile if the normalized + ranking does not match the location of ``q`` exactly. This function is the + same as the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and + the same as the maximum if ``q=1.0``. + + This optional `interpolation` parameter specifies the interpolation method + to use when the desired quantile lies between two data points ``i < j``. If + ``g`` is the fractional part of the index surrounded by ``i`` and alpha + and beta are correction constants modifying i and j. + + .. math:: + i + g = (q - alpha) / ( n - alpha - beta + 1 ) + + The different interpolation methods then work as follows + + inverted_cdf: + method 1 of H&F [1]_. + This method gives discontinuous results: + * if g > 0 ; then take j + * if g = 0 ; then take i + + averaged_inverted_cdf: + method 2 of H&F [1]_. + This method give discontinuous results: + * if g > 0 ; then take j + * if g = 0 ; then average between bounds + + closest_observation: + method 3 of H&F [1]_. + This method give discontinuous results: + * if g > 0 ; then take j + * if g = 0 and index is odd ; then take j + * if g = 0 and index is even ; then take i + + interpolated_inverted_cdf: + method 4 of H&F [1]_. + This method give continuous results using: + * alpha = 0 + * beta = 1 + + hazen: + method 5 of H&F [1]_. + This method give continuous results using: + * alpha = 1/2 + * beta = 1/2 + + weibull: + method 6 of H&F [1]_. + This method give continuous results using: + * alpha = 0 + * beta = 0 + + inclusive: + Default method, aliased with "linear". + method 7 of H&F [1]_. + This method give continuous results using: + * alpha = 1 + * beta = 1 + + median_unbiased: + method 8 of H&F [1]_. + This method is probably the best method if the sample + distribution function is unknown (see reference). + This method give continuous results using: + * alpha = 1/3 + * beta = 1/3 + + normal_unbiased: + method 9 of H&F [1]_. + This method is probably the best method if the sample + distribution function is known to be normal. + This method give continuous results using: + * alpha = 3/8 + * beta = 3/8 + + lower: + NumPy method kept for backwards compatibility. + Takes ``i`` as the interpolation point. + + higher: + NumPy method kept for backwards compatibility. + Takes ``j`` as the interpolation point. + + nearest: + NumPy method kept for backwards compatibility. + Takes ``i`` or ``j``, whichever is nearest. + + midpoint: + NumPy method kept for backwards compatibility. + Uses ``(i + j) / 2``. Examples -------- @@ -4223,9 +4308,10 @@ def quantile(a, References ---------- - [Hyndman&Fan] Hyndman, R. J., & Fan, Y. (1996). - Sample quantiles in statistical packages. - The American Statistician, 50(4), 361-365. + .. [1] R. J. Hyndman and Y. Fan, + "Sample quantiles in statistical packages," + The American Statistician, 50(4), pp. 361-365, 1996 + """ q = np.asanyarray(q) if not _quantile_is_valid(q): -- cgit v1.2.1 From ab19ed256bf9b20340c92cebcfd6158241122c88 Mon Sep 17 00:00:00 2001 From: abel Date: Tue, 19 Oct 2021 12:10:16 +0200 Subject: Fix _lerp - some changes were unrelated to the PR and have been reverted, including, renaming and moving the logic around. - Also renamed _quantile_ureduce_func to its original name --- numpy/lib/function_base.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index dbaba87f9..353490fc2 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4329,7 +4329,7 @@ def _quantile_unchecked(a, keepdims=False): """Assumes that q is in [0, 1], and is an ndarray""" r, k = _ureduce(a, - func=_quantiles_ureduce_func, + func=_quantile_ureduce_func, q=q, axis=axis, out=out, @@ -4383,22 +4383,22 @@ def _get_gamma(virtual_indexes: np.array, return np.asanyarray(gamma) -def _linear_interpolation_formula( - left: np.array, right: np.array, gamma: np.array, out: np.array = None -) -> np.array: +def _lerp(a, b, t, out=None): """ Compute the linear interpolation weighted by gamma on each point of two same shape array. """ - # Equivalent to gamma * right + (1 - gamma) * left - # see gh-14685 - diff_right_left = subtract(right, left) - result = asanyarray(add(left, diff_right_left * gamma, out=out)) - result = subtract(right, - diff_right_left * (1 - gamma), - out=result, - where=gamma >= 0.5) - return result + # Equivalent to gamma * right + (1 - gamma) * left, see gh-14685 + diff_b_a = subtract(b, a) + # asanyarray is a stop-gap until gh-13105 + lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out)) + lerp_interpolation = subtract(b, + diff_b_a * (1 - t), + out=lerp_interpolation, + where=t >= 0.5) + if lerp_interpolation.ndim == 0 and out is None: + lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays + return lerp_interpolation def _get_gamma_mask(shape, default_value, conditioned_value, where): @@ -4430,7 +4430,7 @@ def _inverted_cdf(n, quantiles): gamma_fun) -def _quantiles_ureduce_func( +def _quantile_ureduce_func( a: np.array, q: np.array, axis: int = None, @@ -4460,10 +4460,6 @@ def _quantiles_ureduce_func( axis=axis, interpolation=interpolation, out=out) - if result.ndim == 0 and out is None: - result = result[()] # unpack 0d arrays - elif result.size == 1 and out is None and q.ndim == 0: - result = result[0] return result @@ -4551,7 +4547,7 @@ def _quantile( # cannot contain nan arr.partition(virtual_indexes.ravel(), axis=0) slices_having_nans = np.array(False, dtype=bool) - result = np.asanyarray(take(arr, virtual_indexes, axis=0, out=out)) + result = take(arr, virtual_indexes, axis=0, out=out) else: previous_indexes, next_indexes = _get_indexes(arr, virtual_indexes, @@ -4578,10 +4574,10 @@ def _quantile( interpolation) result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) gamma = gamma.reshape(result_shape) - result = _linear_interpolation_formula(previous, - next, - gamma, - out=out) + result = _lerp(previous, + next, + gamma, + out=out) if np.any(slices_having_nans): if result.ndim == 0 and out is None: # can't write to a scalar -- cgit v1.2.1 From d5e275b2bf65c1848203f6bff7606623793daeed Mon Sep 17 00:00:00 2001 From: abel Date: Tue, 19 Oct 2021 15:22:05 +0200 Subject: DOC: Improve quantile documentation Also removed unused imports --- numpy/lib/function_base.py | 87 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 20 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 353490fc2..a5ccc1189 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -9,8 +9,7 @@ import numpy.core.numeric as _nx from numpy.core import transpose from numpy.core.numeric import ( ones, zeros_like, arange, concatenate, array, asarray, asanyarray, empty, - ndarray, around, floor, ceil, take, dot, where, intp, - integer, isscalar, absolute + ndarray, take, dot, where, intp, integer, isscalar, absolute ) from numpy.core.umath import ( pi, add, arctan2, frompyfunc, cos, less_equal, sqrt, sin, @@ -51,7 +50,22 @@ __all__ = [ 'quantile' ] - +# _QuantileInterpolation is a dictionary listing all the supported +# interpolation methods to compute quantile/percentile. +# +# Below virtual_index refer to the index of the element where the percentile +# would be found in the sorted sample. +# When the sample contains exactly the percentile wanted, the virtual_index is +# an integer to the index of this element. +# When the percentile wanted is in between two elements, the virtual_index +# is made of a integer part (a.k.a 'i' or 'left') and a fractional part +# (a.k.a 'g' or 'gamma') +# +# Each _QuantileInterpolation have two properties +# get_virtual_index : Callable +# The function used to compute the virtual_index. +# fix_gamma : Callable +# A function used for discret methods to force the index to a specific value. _QuantileInterpolation = dict( # --- HYNDMAN and FAN methods # Discrete methods @@ -75,33 +89,33 @@ _QuantileInterpolation = dict( # Continuous methods interpolated_inverted_cdf=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 0, 1), + _compute_virtual_index(n, quantiles, 0, 1), fix_gamma=lambda gamma, _: gamma, ), hazen=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 0.5, 0.5), + _compute_virtual_index(n, quantiles, 0.5, 0.5), fix_gamma=lambda gamma, _: gamma, ), weibull=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 0, 0), + _compute_virtual_index(n, quantiles, 0, 0), fix_gamma=lambda gamma, _: gamma, ), # Default value linear=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 1, 1), + _compute_virtual_index(n, quantiles, 1, 1), fix_gamma=lambda gamma, _: gamma, ), median_unbiased=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 1 / 3.0, 1 / 3.0), + _compute_virtual_index(n, quantiles, 1 / 3.0, 1 / 3.0), fix_gamma=lambda gamma, _: gamma, ), normal_unbiased=dict( get_virtual_index=lambda n, quantiles: - _virtual_index_formula(n, quantiles, 3 / 8.0, 3 / 8.0), + _compute_virtual_index(n, quantiles, 3 / 8.0, 3 / 8.0), fix_gamma=lambda gamma, _: gamma, ), # --- OTHER METHODS fixme add deprecated ? @@ -3945,6 +3959,12 @@ def percentile(a, ``i < j``. If ``g`` is the fractional part of the index surrounded by ``i`` and alpha and beta are correction constants modifying i and j. + Below, 'q' is the quantile value, 'n' is the samle size and + alpha and beta are constants. + The following formula gives an interpolation "i + g" of where the quantile + would be in the sorted sample. + With 'i' being the floor and 'g' the fractional part of the result. + .. math:: i + g = (q - alpha) / ( n - alpha - beta + 1 ) @@ -4353,13 +4373,22 @@ def _quantile_is_valid(q): return True -def _virtual_index_formula(n: np.array, - quantiles: np.array, - alpha: float, - beta: float) -> np.array: +def _compute_virtual_index(n, quantiles, alpha: float, beta: float): """ Compute the floating point indexes of an array for the linear interpolation of quantiles. + n : array_like + The sample sizes. + quantiles : array_like + The quantiles values. + alpha : float + A constant used to correct the index computed. + beta : float + A constant used to correct the index computed. + + alpha and beta values depend on the chosen method + (see quantile documentation) + Reference: Hyndman&Fan paper "Sample Quantiles in Statistical Packages", DOI: 10.1080/00031305.1996.10473566 @@ -4369,14 +4398,24 @@ def _virtual_index_formula(n: np.array, ) - 1 -def _get_gamma(virtual_indexes: np.array, - previous_indexes: np.array, - interpolation: _QuantileInterpolation) -> np.array: +def _get_gamma(virtual_indexes, + previous_indexes, + interpolation: _QuantileInterpolation): """ - Compute the gamma (a.k.a 'm' or weight) for the linear interpolation + Compute gamma (a.k.a 'm' or 'weight') for the linear interpolation of quantiles. - When gamma == 0 the left bound will be picked, - When gamma == 1 the right bound will be. + + virtual_indexes : array_like + The indexes where the percentile is supposed to be found in the sorted + sample. + previous_indexes : array_like + The floor values of virtual_indexes. + interpolation : _QuantileInterpolation + The interpolation method chosen, which may have a specific rule + modifying gamma. + + gamma is usually the fractional part of virtual_indexes but can be modified + by the interpolation method. """ gamma = np.asanyarray(virtual_indexes - previous_indexes) gamma = interpolation["fix_gamma"](gamma, virtual_indexes) @@ -4387,8 +4426,16 @@ def _lerp(a, b, t, out=None): """ Compute the linear interpolation weighted by gamma on each point of two same shape array. + + a : array_like + Left bound. + b : array_like + Right bound. + t : array_like + The interpolation weight. + out : array_like + Output array. """ - # Equivalent to gamma * right + (1 - gamma) * left, see gh-14685 diff_b_a = subtract(b, a) # asanyarray is a stop-gap until gh-13105 lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out)) -- cgit v1.2.1 From 2a5422da7cb6759d75477738cc192fee3ca2a19c Mon Sep 17 00:00:00 2001 From: abel Date: Tue, 19 Oct 2021 17:55:06 +0200 Subject: Fix issue with nan scalar Also added unit test for it. --- numpy/lib/function_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index a5ccc1189..a16781f53 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3959,7 +3959,7 @@ def percentile(a, ``i < j``. If ``g`` is the fractional part of the index surrounded by ``i`` and alpha and beta are correction constants modifying i and j. - Below, 'q' is the quantile value, 'n' is the samle size and + Below, 'q' is the quantile value, 'n' is the sample size and alpha and beta are constants. The following formula gives an interpolation "i + g" of where the quantile would be in the sorted sample. @@ -4628,7 +4628,7 @@ def _quantile( if np.any(slices_having_nans): if result.ndim == 0 and out is None: # can't write to a scalar - result = np.array(np.nan, dtype=arr.dtype) + result = arr.dtype.type(np.nan) else: result[..., slices_having_nans] = np.nan return result -- cgit v1.2.1 From 8413b5abf27221fb2bea070871c7cd8f8da5519c Mon Sep 17 00:00:00 2001 From: abel Date: Thu, 21 Oct 2021 09:59:53 +0200 Subject: MAINT: Clean following PR comments --- numpy/lib/function_base.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index a16781f53..c33626df4 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3915,7 +3915,7 @@ def percentile(a, * (H&F 8): 'median_unbiased' * (H&F 9): 'normal_unbiased' - .. versionadded;: 1.22.0 + .. versionchanged:: 1.22.0 keepdims : bool, optional If this is set to True, the axes which are reduced are left in @@ -4439,10 +4439,7 @@ def _lerp(a, b, t, out=None): diff_b_a = subtract(b, a) # asanyarray is a stop-gap until gh-13105 lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out)) - lerp_interpolation = subtract(b, - diff_b_a * (1 - t), - out=lerp_interpolation, - where=t >= 0.5) + subtract(b, diff_b_a * (1 - t), out=lerp_interpolation, where=t >= 0.5) if lerp_interpolation.ndim == 0 and out is None: lerp_interpolation = lerp_interpolation[()] # unpack 0d arrays return lerp_interpolation @@ -4580,7 +4577,7 @@ def _quantile( except KeyError: raise ValueError( f"{interpolation!r} is not a valid interpolation. Use one of: " - f"{_QuantileInterpolation.keys()}") + f"{_QuantileInterpolation.keys()}") from None virtual_indexes = interpolation["get_virtual_index"](values_count, quantiles) virtual_indexes = np.asanyarray(virtual_indexes) -- cgit v1.2.1 From f7911c67176c1d370be27726e87195699e4b581e Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 4 Nov 2021 13:26:37 -0500 Subject: DOC: Refer to the quantile/percentile notes for nan versions --- numpy/lib/function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index c33626df4..07d8a4269 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3893,7 +3893,7 @@ def percentile(a, `a` after this function completes is undefined. interpolation : str, optional This parameter specifies the interpolation method to - use when the desired quantile lies between two data points + use when the desired percentile lies between two data points There are many different methods, some unique to NumPy. See the notes for explanation. Options -- cgit v1.2.1 From f9c2573898899ee3809dfd4ebba113f6de2d528b Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Fri, 5 Nov 2021 00:05:04 +0100 Subject: PERF: Speedup np.quantile. Avoiding the unnecessary calls to moveaxis() speed up `np.quantile(x, .5)` (`x = np.random.rand(1000)`) by ~10% (although there's a lot of variability) for me. --- numpy/lib/function_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 07d8a4269..4d57acdb2 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4567,7 +4567,8 @@ def _quantile( # The dimensions of `q` are prepended to the output shape, so we need the # axis being sampled from `arr` to be last. DATA_AXIS = 0 - arr = np.moveaxis(arr, axis, destination=DATA_AXIS) + if axis != DATA_AXIS: # But moveaxis is slow, so only call it if axis!=0. + arr = np.moveaxis(arr, axis, destination=DATA_AXIS) # --- Computation of indexes # Index where to find the value in the sorted array. # Virtual because it is a floating point value, not an valid index. -- cgit v1.2.1 From 035d853e32d6e60a40a6a845699723238a01431b Mon Sep 17 00:00:00 2001 From: abel Date: Mon, 8 Nov 2021 17:35:19 +0100 Subject: DOC: Remove non-existent alias --- numpy/lib/function_base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 4d57acdb2..86125168a 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -3901,7 +3901,7 @@ def percentile(a, * (NPY 2): 'higher', * (NPY 3): 'midpoint' * (NPY 4): 'nearest' - * (NPY 5): 'linear', aliased with 'inclusive' (default) + * (NPY 5): 'linear' New options: @@ -3911,7 +3911,7 @@ def percentile(a, * (H&F 4): 'interpolated_inverted_cdf' * (H&F 5): 'hazen' * (H&F 6): 'weibull' - * (H&F 7): 'inclusive', aliased with 'linear' (default) + * (H&F 7): 'linear' (default) * (H&F 8): 'median_unbiased' * (H&F 9): 'normal_unbiased' @@ -4007,8 +4007,7 @@ def percentile(a, * alpha = 0 * beta = 0 - inclusive: - Default method, aliased with "linear". + linear: method 7 of H&F [1]_. This method give continuous results using: * alpha = 1 @@ -4164,7 +4163,7 @@ def quantile(a, * (NPY 2): 'higher', * (NPY 3): 'midpoint' * (NPY 4): 'nearest' - * (NPY 5): 'linear', aliased with 'inclusive' (default) + * (NPY 5): 'linear' New options: @@ -4174,7 +4173,7 @@ def quantile(a, * (H&F 4): 'interpolated_inverted_cdf' * (H&F 5): 'hazen' * (H&F 6): 'weibull' - * (H&F 7): 'inclusive', aliased with 'linear' (default) + * (H&F 7): 'linear' (default) * (H&F 8): 'median_unbiased' * (H&F 9): 'normal_unbiased' @@ -4261,8 +4260,7 @@ def quantile(a, * alpha = 0 * beta = 0 - inclusive: - Default method, aliased with "linear". + linear: method 7 of H&F [1]_. This method give continuous results using: * alpha = 1 -- cgit v1.2.1 From 6cd68755c6fd0686dd57a9daec43d8aa09d15c3e Mon Sep 17 00:00:00 2001 From: abel Date: Tue, 9 Nov 2021 10:35:12 +0100 Subject: MTH: Update quantile default lerp method For method 7 of H&F, using `(n - 1) * quantiles` instead of the usual method gives a more accurate result. --- numpy/lib/function_base.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'numpy/lib/function_base.py') diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 86125168a..3c9983edf 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -67,7 +67,7 @@ __all__ = [ # fix_gamma : Callable # A function used for discret methods to force the index to a specific value. _QuantileInterpolation = dict( - # --- HYNDMAN and FAN methods + # --- HYNDMAN AND FAN METHODS # Discrete methods inverted_cdf=dict( get_virtual_index=lambda n, quantiles: _inverted_cdf(n, quantiles), @@ -102,10 +102,12 @@ _QuantileInterpolation = dict( _compute_virtual_index(n, quantiles, 0, 0), fix_gamma=lambda gamma, _: gamma, ), - # Default value + # Default method. + # To avoid some rounding issues, `(n-1) * quantiles` is preferred to + # `_compute_virtual_index(n, quantiles, 1, 1)`. + # They are mathematically equivalent. linear=dict( - get_virtual_index=lambda n, quantiles: - _compute_virtual_index(n, quantiles, 1, 1), + get_virtual_index=lambda n, quantiles: (n - 1) * quantiles, fix_gamma=lambda gamma, _: gamma, ), median_unbiased=dict( @@ -118,7 +120,7 @@ _QuantileInterpolation = dict( _compute_virtual_index(n, quantiles, 3 / 8.0, 3 / 8.0), fix_gamma=lambda gamma, _: gamma, ), - # --- OTHER METHODS fixme add deprecated ? + # --- OTHER METHODS lower=dict( get_virtual_index=lambda n, quantiles: np.floor( (n - 1) * quantiles).astype(np.intp), -- cgit v1.2.1