diff options
| author | scoder <stefan_ml@behnel.de> | 2023-05-04 09:29:53 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-05-04 09:29:53 +0200 |
| commit | 442c8f48d3146ec32c7d5387310e171276cf10ac (patch) | |
| tree | d8911d1a64e384b7955d3fc09a07edd218a9f1ee /numpy/lib/arraysetops.py | |
| parent | 3e4a6cba2da27bbe2a6e12c163238e503c9f6a07 (diff) | |
| parent | 9163e933df91b516b6f0c7a9ba8ad1750e642f37 (diff) | |
| download | numpy-442c8f48d3146ec32c7d5387310e171276cf10ac.tar.gz | |
Merge branch 'main' into cython3_noexcept
Diffstat (limited to 'numpy/lib/arraysetops.py')
| -rw-r--r-- | numpy/lib/arraysetops.py | 243 |
1 files changed, 207 insertions, 36 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index 6cca1738b..300bbda26 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -1,28 +1,17 @@ """ Set operations for arrays based on sorting. -:Contains: - unique, - isin, - ediff1d, - intersect1d, - setxor1d, - in1d, - union1d, - setdiff1d - -:Notes: +Notes +----- For floating point arrays, inaccurate results may appear due to usual round-off and floating point comparison issues. Speed could be gained in some operations by an implementation of -sort(), that can provide directly the permutation vectors, avoiding -thus calls to argsort(). +`numpy.sort`, that can provide directly the permutation vectors, thus avoiding +calls to `numpy.argsort`. -To do: Optionally return indices analogously to unique for all functions. - -:Author: Robert Cimrman +Original author: Robert Cimrman """ import functools @@ -104,7 +93,7 @@ def ediff1d(ary, to_end=None, to_begin=None): else: to_begin = np.asanyarray(to_begin) if not np.can_cast(to_begin, dtype_req, casting="same_kind"): - raise TypeError("dtype of `to_end` must be compatible " + raise TypeError("dtype of `to_begin` must be compatible " "with input `ary` under the `same_kind` rule.") to_begin = to_begin.ravel() @@ -142,13 +131,13 @@ def _unpack_tuple(x): def _unique_dispatcher(ar, return_index=None, return_inverse=None, - return_counts=None, axis=None): + return_counts=None, axis=None, *, equal_nan=None): return (ar,) @array_function_dispatch(_unique_dispatcher) def unique(ar, return_index=False, return_inverse=False, - return_counts=False, axis=None): + return_counts=False, axis=None, *, equal_nan=True): """ Find the unique elements of an array. @@ -173,9 +162,6 @@ def unique(ar, return_index=False, return_inverse=False, return_counts : bool, optional If True, also return the number of times each unique item appears in `ar`. - - .. versionadded:: 1.9.0 - axis : int or None, optional The axis to operate on. If None, `ar` will be flattened. If an integer, the subarrays indexed by the given axis will be flattened and treated @@ -186,6 +172,11 @@ def unique(ar, return_index=False, return_inverse=False, .. versionadded:: 1.13.0 + equal_nan : bool, optional + If True, collapses multiple NaN values in the return array into one. + + .. versionadded:: 1.24 + Returns ------- unique : ndarray @@ -220,6 +211,16 @@ def unique(ar, return_index=False, return_inverse=False, flattened subarrays are sorted in lexicographic order starting with the first element. + .. versionchanged: NumPy 1.21 + If nan values are in the input array, a single nan is put + to the end of the sorted unique values. + + Also for complex arrays all NaN values are considered equivalent + (no matter whether the NaN is in the real or imaginary part). + As the representant for the returned array the smallest one in the + lexicographical order is chosen - see np.sort for how the lexicographical + order is defined for complex arrays. + Examples -------- >>> np.unique([1, 1, 2, 2, 3, 3]) @@ -270,7 +271,8 @@ def unique(ar, return_index=False, return_inverse=False, """ ar = np.asanyarray(ar) if axis is None: - ret = _unique1d(ar, return_index, return_inverse, return_counts) + ret = _unique1d(ar, return_index, return_inverse, return_counts, + equal_nan=equal_nan) return _unpack_tuple(ret) # axis was specified and not None @@ -278,7 +280,7 @@ def unique(ar, return_index=False, return_inverse=False, ar = np.moveaxis(ar, axis, 0) except np.AxisError: # this removes the "axis1" or "axis2" prefix from the error message - raise np.AxisError(axis, ar.ndim) + raise np.AxisError(axis, ar.ndim) from None # Must reshape to a contiguous 2D array for this to work... orig_shape, orig_dtype = ar.shape, ar.dtype @@ -300,10 +302,10 @@ def unique(ar, return_index=False, return_inverse=False, # array with shape `(len(ar),)`. Because `dtype` in this case has # itemsize 0, the total size of the result is still 0 bytes. consolidated = np.empty(len(ar), dtype=dtype) - except TypeError: + except TypeError as e: # There's no good way to do this for object arrays, etc... msg = 'The axis argument to unique is not supported for dtype {dt}' - raise TypeError(msg.format(dt=ar.dtype)) + raise TypeError(msg.format(dt=ar.dtype)) from e def reshape_uniq(uniq): n = len(uniq) @@ -313,13 +315,13 @@ def unique(ar, return_index=False, return_inverse=False, return uniq output = _unique1d(consolidated, return_index, - return_inverse, return_counts) + return_inverse, return_counts, equal_nan=equal_nan) output = (reshape_uniq(output[0]),) + output[1:] return _unpack_tuple(output) def _unique1d(ar, return_index=False, return_inverse=False, - return_counts=False): + return_counts=False, *, equal_nan=True): """ Find the unique elements of an array, ignoring shape. """ @@ -335,7 +337,19 @@ def _unique1d(ar, return_index=False, return_inverse=False, aux = ar mask = np.empty(aux.shape, dtype=np.bool_) mask[:1] = True - mask[1:] = aux[1:] != aux[:-1] + if (equal_nan and aux.shape[0] > 0 and aux.dtype.kind in "cfmM" and + np.isnan(aux[-1])): + if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent + aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') + else: + aux_firstnan = np.searchsorted(aux, aux[-1], side='left') + if aux_firstnan > 0: + mask[1:aux_firstnan] = ( + aux[1:aux_firstnan] != aux[:aux_firstnan - 1]) + mask[aux_firstnan] = True + mask[aux_firstnan + 1:] = False + else: + mask[1:] = aux[1:] != aux[:-1] ret = (aux[mask],) if return_index: @@ -369,7 +383,9 @@ def intersect1d(ar1, ar2, assume_unique=False, return_indices=False): Input arrays. Will be flattened if not already 1D. assume_unique : bool If True, the input arrays are both assumed to be unique, which - can speed up the calculation. Default is False. + can speed up the calculation. If True but ``ar1`` or ``ar2`` are not + unique, incorrect results and out-of-bounds indices could result. + Default is False. return_indices : bool If True, the indices which correspond to the intersection of the two arrays are returned. The first instance of a value is used if there are @@ -500,12 +516,13 @@ def setxor1d(ar1, ar2, assume_unique=False): return aux[flag[1:] & flag[:-1]] -def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None): +def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None, *, + kind=None): return (ar1, ar2) @array_function_dispatch(_in1d_dispatcher) -def in1d(ar1, ar2, assume_unique=False, invert=False): +def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None): """ Test whether each element of a 1-D array is also present in a second array. @@ -528,6 +545,26 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): False where an element of `ar1` is in `ar2` and True otherwise). Default is False. ``np.in1d(a, b, invert=True)`` is equivalent to (but is faster than) ``np.invert(in1d(a, b))``. + kind : {None, 'sort', 'table'}, optional + The algorithm to use. This will not affect the final result, + but will affect the speed and memory use. The default, None, + will select automatically based on memory considerations. + + * If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. + * If 'table', will use a lookup table approach similar + to a counting sort. This is only available for boolean and + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. `assume_unique` + has no effect when the 'table' option is used. + * If None, will automatically choose 'table' if + the required memory allocation is less than or equal to + 6 times the sum of the sizes of `ar1` and `ar2`, + otherwise will use 'sort'. This is done to not use + a large amount of memory by default, even though + 'table' may be faster in most cases. If 'table' is chosen, + `assume_unique` will have no effect. .. versionadded:: 1.8.0 @@ -553,6 +590,13 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): ``asarray(ar2)`` is an object array rather than the expected array of contained values. + Using ``kind='table'`` tends to be faster than `kind='sort'` if the + following relationship is true: + ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``, + but may use greater memory. The default value for `kind` will + be automatically selected based only on memory usage, so one may + manually set ``kind='table'`` if memory constraints can be relaxed. + .. versionadded:: 1.4.0 Examples @@ -574,6 +618,103 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): ar1 = np.asarray(ar1).ravel() ar2 = np.asarray(ar2).ravel() + # Ensure that iteration through object arrays yields size-1 arrays + if ar2.dtype == object: + ar2 = ar2.reshape(-1, 1) + + if kind not in {None, 'sort', 'table'}: + raise ValueError( + f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.") + + # Can use the table method if all arrays are integers or boolean: + is_int_arrays = all(ar.dtype.kind in ("u", "i", "b") for ar in (ar1, ar2)) + use_table_method = is_int_arrays and kind in {None, 'table'} + + if use_table_method: + if ar2.size == 0: + if invert: + return np.ones_like(ar1, dtype=bool) + else: + return np.zeros_like(ar1, dtype=bool) + + # Convert booleans to uint8 so we can use the fast integer algorithm + if ar1.dtype == bool: + ar1 = ar1.astype(np.uint8) + if ar2.dtype == bool: + ar2 = ar2.astype(np.uint8) + + ar2_min = np.min(ar2) + ar2_max = np.max(ar2) + + ar2_range = int(ar2_max) - int(ar2_min) + + # Constraints on whether we can actually use the table method: + # 1. Assert memory usage is not too large + below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size) + # 2. Check overflows for (ar2 - ar2_min); dtype=ar2.dtype + range_safe_from_overflow = ar2_range <= np.iinfo(ar2.dtype).max + # 3. Check overflows for (ar1 - ar2_min); dtype=ar1.dtype + if ar1.size > 0: + ar1_min = np.min(ar1) + ar1_max = np.max(ar1) + + # After masking, the range of ar1 is guaranteed to be + # within the range of ar2: + ar1_upper = min(int(ar1_max), int(ar2_max)) + ar1_lower = max(int(ar1_min), int(ar2_min)) + + range_safe_from_overflow &= all(( + ar1_upper - int(ar2_min) <= np.iinfo(ar1.dtype).max, + ar1_lower - int(ar2_min) >= np.iinfo(ar1.dtype).min + )) + + # Optimal performance is for approximately + # log10(size) > (log10(range) - 2.27) / 0.927. + # However, here we set the requirement that by default + # the intermediate array can only be 6x + # the combined memory allocation of the original + # arrays. See discussion on + # https://github.com/numpy/numpy/pull/12065. + + if ( + range_safe_from_overflow and + (below_memory_constraint or kind == 'table') + ): + + if invert: + outgoing_array = np.ones_like(ar1, dtype=bool) + else: + outgoing_array = np.zeros_like(ar1, dtype=bool) + + # Make elements 1 where the integer exists in ar2 + if invert: + isin_helper_ar = np.ones(ar2_range + 1, dtype=bool) + isin_helper_ar[ar2 - ar2_min] = 0 + else: + isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool) + isin_helper_ar[ar2 - ar2_min] = 1 + + # Mask out elements we know won't work + basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min) + outgoing_array[basic_mask] = isin_helper_ar[ar1[basic_mask] - + ar2_min] + + return outgoing_array + elif kind == 'table': # not range_safe_from_overflow + raise RuntimeError( + "You have specified kind='table', " + "but the range of values in `ar2` or `ar1` exceed the " + "maximum integer of the datatype. " + "Please set `kind` to None or 'sort'." + ) + elif kind == 'table': + raise ValueError( + "The 'table' method is only " + "supported for boolean or integer arrays. " + "Please select 'sort' or None for kind." + ) + + # Check if one of the arrays may contain arbitrary objects contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject @@ -617,14 +758,16 @@ def in1d(ar1, ar2, assume_unique=False, invert=False): return ret[rev_idx] -def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None): +def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None, + *, kind=None): return (element, test_elements) @array_function_dispatch(_isin_dispatcher) -def isin(element, test_elements, assume_unique=False, invert=False): +def isin(element, test_elements, assume_unique=False, invert=False, *, + kind=None): """ - Calculates `element in test_elements`, broadcasting over `element` only. + Calculates ``element in test_elements``, broadcasting over `element` only. Returns a boolean array of the same shape as `element` that is True where an element of `element` is in `test_elements` and False otherwise. @@ -644,6 +787,27 @@ def isin(element, test_elements, assume_unique=False, invert=False): calculating `element not in test_elements`. Default is False. ``np.isin(a, b, invert=True)`` is equivalent to (but faster than) ``np.invert(np.isin(a, b))``. + kind : {None, 'sort', 'table'}, optional + The algorithm to use. This will not affect the final result, + but will affect the speed and memory use. The default, None, + will select automatically based on memory considerations. + + * If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. + * If 'table', will use a lookup table approach similar + to a counting sort. This is only available for boolean and + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. `assume_unique` + has no effect when the 'table' option is used. + * If None, will automatically choose 'table' if + the required memory allocation is less than or equal to + 6 times the sum of the sizes of `ar1` and `ar2`, + otherwise will use 'sort'. This is done to not use + a large amount of memory by default, even though + 'table' may be faster in most cases. If 'table' is chosen, + `assume_unique` will have no effect. + Returns ------- @@ -671,6 +835,13 @@ def isin(element, test_elements, assume_unique=False, invert=False): of the `array` constructor's way of handling non-sequence collections. Converting the set to a list usually gives the desired behavior. + Using ``kind='table'`` tends to be faster than `kind='sort'` if the + following relationship is true: + ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``, + but may use greater memory. The default value for `kind` will + be automatically selected based only on memory usage, so one may + manually set ``kind='table'`` if memory constraints can be relaxed. + .. versionadded:: 1.13.0 Examples @@ -717,7 +888,7 @@ def isin(element, test_elements, assume_unique=False, invert=False): """ element = np.asarray(element) return in1d(element, test_elements, assume_unique=assume_unique, - invert=invert).reshape(element.shape) + invert=invert, kind=kind).reshape(element.shape) def _union1d_dispatcher(ar1, ar2): |
