author    | Eric Wieser <wieser.eric@gmail.com> | 2018-07-31 00:41:28 -0700
committer | GitHub <noreply@github.com> | 2018-07-31 00:41:28 -0700
commit    | 7f4579279a6a6aa07df664b901afa36ab3fc5ce0 (patch)
tree      | 3524c05c661f4948eabf066b46b5ad3aaf6ad617 /numpy/lib/arraysetops.py
parent    | 24960daf3e326591047eb099af840da6e95d0910 (diff)
parent    | 9bb569c4e0e1cf08128179d157bdab10c8706a97 (diff)
download  | numpy-7f4579279a6a6aa07df664b901afa36ab3fc5ce0.tar.gz
Merge branch 'master' into ix_-preserve-type
Diffstat (limited to 'numpy/lib/arraysetops.py')
-rw-r--r-- | numpy/lib/arraysetops.py | 325
1 file changed, 243 insertions, 82 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index fae3e3cbc..5880ea154 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -1,9 +1,10 @@
 """
-Set operations for 1D numeric arrays based on sorting.
+Set operations for arrays based on sorting.
 
 :Contains:
-  ediff1d,
   unique,
+  isin,
+  ediff1d,
   intersect1d,
   setxor1d,
   in1d,
@@ -31,7 +32,7 @@ import numpy as np
 
 __all__ = [
     'ediff1d', 'intersect1d', 'setxor1d', 'union1d', 'setdiff1d', 'unique',
-    'in1d'
+    'in1d', 'isin'
     ]
 
 
@@ -109,16 +110,25 @@ def ediff1d(ary, to_end=None, to_begin=None):
     return result
 
 
+def _unpack_tuple(x):
+    """ Unpacks one-element tuples for use as return values """
+    if len(x) == 1:
+        return x[0]
+    else:
+        return x
+
+
 def unique(ar, return_index=False, return_inverse=False,
            return_counts=False, axis=None):
     """
     Find the unique elements of an array.
 
     Returns the sorted unique elements of an array. There are three optional
-    outputs in addition to the unique elements: the indices of the input array
-    that give the unique values, the indices of the unique array that
-    reconstruct the input array, and the number of times each unique value
-    comes up in the input array.
+    outputs in addition to the unique elements:
+
+    * the indices of the input array that give the unique values
+    * the indices of the unique array that reconstruct the input array
+    * the number of times each unique value comes up in the input array
 
     Parameters
     ----------
@@ -134,16 +144,18 @@ def unique(ar, return_index=False, return_inverse=False,
     return_counts : bool, optional
         If True, also return the number of times each unique item appears
        in `ar`.
+
         .. versionadded:: 1.9.0
-    axis : int or None, optional
-        The axis to operate on. If None, `ar` will be flattened beforehand.
-        Otherwise, duplicate items will be removed along the provided axis,
-        with all the other axes belonging to the each of the unique elements.
-        Object arrays or structured arrays that contain objects are not
-        supported if the `axis` kwarg is used.
-        .. versionadded:: 1.13.0
+    axis : int or None, optional
+        The axis to operate on. If None, `ar` will be flattened. If an integer,
+        the subarrays indexed by the given axis will be flattened and treated
+        as the elements of a 1-D array with the dimension of the given axis,
+        see the notes for more details. Object arrays or structured arrays
+        that contain objects are not supported if the `axis` kwarg is used. The
+        default is None.
+        .. versionadded:: 1.13.0
 
     Returns
     -------
@@ -158,6 +170,7 @@ def unique(ar, return_index=False, return_inverse=False,
     unique_counts : ndarray, optional
         The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.
+
         .. versionadded:: 1.9.0
 
     See Also
@@ -165,6 +178,17 @@ def unique(ar, return_index=False, return_inverse=False,
     numpy.lib.arraysetops : Module with a number of other functions for
                             performing set operations on arrays.
 
+    Notes
+    -----
+    When an axis is specified the subarrays indexed by the axis are sorted.
+    This is done by making the specified axis the first dimension of the array
+    and then flattening the subarrays in C order. The flattened subarrays are
+    then viewed as a structured type with each element given a label, with the
+    effect that we end up with a 1-D array of structured types that can be
+    treated in the same way as any other 1-D array. The result is that the
+    flattened subarrays are sorted in lexicographic order starting with the
+    first element.
+
     Examples
     --------
     >>> np.unique([1, 1, 2, 2, 3, 3])
     array([1, 2, 3])
@@ -206,24 +230,21 @@ def unique(ar, return_index=False, return_inverse=False,
     """
     ar = np.asanyarray(ar)
     if axis is None:
-        return _unique1d(ar, return_index, return_inverse, return_counts)
-    if not (-ar.ndim <= axis < ar.ndim):
-        raise ValueError('Invalid axis kwarg specified for unique')
+        ret = _unique1d(ar, return_index, return_inverse, return_counts)
+        return _unpack_tuple(ret)
+
+    # axis was specified and not None
+    try:
+        ar = np.swapaxes(ar, axis, 0)
+    except np.AxisError:
+        # this removes the "axis1" or "axis2" prefix from the error message
+        raise np.AxisError(axis, ar.ndim)
 
-    ar = np.swapaxes(ar, axis, 0)
-    orig_shape, orig_dtype = ar.shape, ar.dtype
     # Must reshape to a contiguous 2D array for this to work...
+    orig_shape, orig_dtype = ar.shape, ar.dtype
     ar = ar.reshape(orig_shape[0], -1)
     ar = np.ascontiguousarray(ar)
-
-    if ar.dtype.char in (np.typecodes['AllInteger'] +
-                         np.typecodes['Datetime'] + 'S'):
-        # Optimization: Creating a view of your data with a np.void data type of
-        # size the number of bytes in a full row. Handles any type where items
-        # have a unique binary representation, i.e. 0 is only 0, not +0 and -0.
-        dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1]))
-    else:
-        dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
+    dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
 
     try:
         consolidated = ar.view(dtype)
@@ -240,11 +261,9 @@ def unique(ar, return_index=False, return_inverse=False,
 
     output = _unique1d(consolidated, return_index,
                        return_inverse, return_counts)
-    if not (return_index or return_inverse or return_counts):
-        return reshape_uniq(output)
-    else:
-        uniq = reshape_uniq(output[0])
-        return (uniq,) + output[1:]
+    output = (reshape_uniq(output[0]),) + output[1:]
+    return _unpack_tuple(output)
+
 
 def _unique1d(ar, return_index=False, return_inverse=False,
               return_counts=False):
@@ -254,20 +273,6 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     ar = np.asanyarray(ar).flatten()
 
     optional_indices = return_index or return_inverse
-    optional_returns = optional_indices or return_counts
-
-    if ar.size == 0:
-        if not optional_returns:
-            ret = ar
-        else:
-            ret = (ar,)
-            if return_index:
-                ret += (np.empty(0, np.bool),)
-            if return_inverse:
-                ret += (np.empty(0, np.bool),)
-            if return_counts:
-                ret += (np.empty(0, np.intp),)
-        return ret
 
     if optional_indices:
         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
@@ -275,25 +280,25 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     else:
         ar.sort()
         aux = ar
-    flag = np.concatenate(([True], aux[1:] != aux[:-1]))
-
-    if not optional_returns:
-        ret = aux[flag]
-    else:
-        ret = (aux[flag],)
-        if return_index:
-            ret += (perm[flag],)
-        if return_inverse:
-            iflag = np.cumsum(flag) - 1
-            inv_idx = np.empty(ar.shape, dtype=np.intp)
-            inv_idx[perm] = iflag
-            ret += (inv_idx,)
-        if return_counts:
-            idx = np.concatenate(np.nonzero(flag) + ([ar.size],))
-            ret += (np.diff(idx),)
+    mask = np.empty(aux.shape, dtype=np.bool_)
+    mask[:1] = True
+    mask[1:] = aux[1:] != aux[:-1]
+
+    ret = (aux[mask],)
+    if return_index:
+        ret += (perm[mask],)
+    if return_inverse:
+        imask = np.cumsum(mask) - 1
+        inv_idx = np.empty(mask.shape, dtype=np.intp)
+        inv_idx[perm] = imask
+        ret += (inv_idx,)
+    if return_counts:
+        idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
+        ret += (np.diff(idx),)
     return ret
 
 
-def intersect1d(ar1, ar2, assume_unique=False):
+
+def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
     """
     Find the intersection of two arrays.
 
@@ -302,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False):
     Parameters
     ----------
     ar1, ar2 : array_like
-        Input arrays.
+        Input arrays. Will be flattened if not already 1D.
     assume_unique : bool
         If True, the input arrays are both assumed to be unique, which
         can speed up the calculation.  Default is False.
-
+    return_indices : bool
+        If True, the indices which correspond to the intersection of the
+        two arrays are returned. The first instance of a value is used
+        if there are multiple. Default is False.
+
+        .. versionadded:: 1.15.0
+
     Returns
     -------
     intersect1d : ndarray
         Sorted 1D array of common and unique elements.
+    comm1 : ndarray
+        The indices of the first occurrences of the common values in `ar1`.
+        Only provided if `return_indices` is True.
+    comm2 : ndarray
+        The indices of the first occurrences of the common values in `ar2`.
+        Only provided if `return_indices` is True.
+
 
     See Also
     --------
@@ -327,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False):
     >>> from functools import reduce
     >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([3])
+
+    To return the indices of the values common to the input arrays
+    along with the intersected values:
+    >>> x = np.array([1, 1, 2, 3, 4])
+    >>> y = np.array([2, 1, 4, 6])
+    >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
+    >>> x_ind, y_ind
+    (array([0, 2, 4]), array([1, 0, 2]))
+    >>> xy, x[x_ind], y[y_ind]
+    (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
+
     """
     if not assume_unique:
-        # Might be faster than unique( intersect1d( ar1, ar2 ) )?
-        ar1 = unique(ar1)
-        ar2 = unique(ar2)
+        if return_indices:
+            ar1, ind1 = unique(ar1, return_index=True)
+            ar2, ind2 = unique(ar2, return_index=True)
+        else:
+            ar1 = unique(ar1)
+            ar2 = unique(ar2)
+    else:
+        ar1 = ar1.ravel()
+        ar2 = ar2.ravel()
+
     aux = np.concatenate((ar1, ar2))
-    aux.sort()
-    return aux[:-1][aux[1:] == aux[:-1]]
+    if return_indices:
+        aux_sort_indices = np.argsort(aux, kind='mergesort')
+        aux = aux[aux_sort_indices]
+    else:
+        aux.sort()
+
+    mask = aux[1:] == aux[:-1]
+    int1d = aux[:-1][mask]
+
+    if return_indices:
+        ar1_indices = aux_sort_indices[:-1][mask]
+        ar2_indices = aux_sort_indices[1:][mask] - ar1.size
+        if not assume_unique:
+            ar1_indices = ind1[ar1_indices]
+            ar2_indices = ind2[ar2_indices]
+
+        return int1d, ar1_indices, ar2_indices
+    else:
+        return int1d
 
 def setxor1d(ar1, ar2, assume_unique=False):
     """
@@ -374,11 +427,9 @@ def setxor1d(ar1, ar2, assume_unique=False):
         return aux
 
     aux.sort()
-#    flag = ediff1d( aux, to_end = 1, to_begin = 1 ) == 0
     flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
-#    flag2 = ediff1d( flag ) == 0
-    flag2 = flag[1:] == flag[:-1]
-    return aux[flag2]
+    return aux[flag[1:] & flag[:-1]]
+
 
 def in1d(ar1, ar2, assume_unique=False, invert=False):
     """
@@ -387,6 +438,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     Returns a boolean array the same length as `ar1` that is True
     where an element of `ar1` is in `ar2` and False otherwise.
 
+    We recommend using :func:`isin` instead of `in1d` for new code.
+
     Parameters
     ----------
     ar1 : (M,) array_like
@@ -411,6 +464,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
 
     See Also
     --------
+    isin                  : Version of this function that preserves the
+                            shape of ar1.
     numpy.lib.arraysetops : Module with a number of other functions for
                             performing set operations on arrays.
 
@@ -432,12 +487,12 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     >>> states = [0, 2]
     >>> mask = np.in1d(test, states)
     >>> mask
-    array([ True, False, True, False, True], dtype=bool)
+    array([ True, False, True, False, True])
     >>> test[mask]
     array([0, 2, 0])
     >>> mask = np.in1d(test, states, invert=True)
     >>> mask
-    array([False, True, False, True, False], dtype=bool)
+    array([False, True, False, True, False])
     >>> test[mask]
     array([1, 5])
     """
@@ -445,14 +500,20 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     ar1 = np.asarray(ar1).ravel()
     ar2 = np.asarray(ar2).ravel()
 
-    # This code is significantly faster when the condition is satisfied.
-    if len(ar2) < 10 * len(ar1) ** 0.145:
+    # Check if one of the arrays may contain arbitrary objects
+    contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
+
+    # This code is run when
+    # a) the first condition is true, making the code significantly faster
+    # b) the second condition is true (i.e. `ar1` or `ar2` may contain
+    #    arbitrary objects), since then sorting is not guaranteed to work
+    if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
         if invert:
-            mask = np.ones(len(ar1), dtype=np.bool)
+            mask = np.ones(len(ar1), dtype=bool)
            for a in ar2:
                mask &= (ar1 != a)
        else:
-            mask = np.zeros(len(ar1), dtype=np.bool)
+            mask = np.zeros(len(ar1), dtype=bool)
            for a in ar2:
                mask |= (ar1 == a)
        return mask
@@ -481,6 +542,105 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     else:
         return ret[rev_idx]
 
+
+def isin(element, test_elements, assume_unique=False, invert=False):
+    """
+    Calculates `element in test_elements`, broadcasting over `element` only.
+    Returns a boolean array of the same shape as `element` that is True
+    where an element of `element` is in `test_elements` and False otherwise.
+
+    Parameters
+    ----------
+    element : array_like
+        Input array.
+    test_elements : array_like
+        The values against which to test each value of `element`.
+        This argument is flattened if it is an array or array_like.
+        See notes for behavior with non-array-like parameters.
+    assume_unique : bool, optional
+        If True, the input arrays are both assumed to be unique, which
+        can speed up the calculation. Default is False.
+    invert : bool, optional
+        If True, the values in the returned array are inverted, as if
+        calculating `element not in test_elements`. Default is False.
+        ``np.isin(a, b, invert=True)`` is equivalent to (but faster
+        than) ``np.invert(np.isin(a, b))``.
+
+    Returns
+    -------
+    isin : ndarray, bool
+        Has the same shape as `element`. The values `element[isin]`
+        are in `test_elements`.
+
+    See Also
+    --------
+    in1d                  : Flattened version of this function.
+    numpy.lib.arraysetops : Module with a number of other functions for
+                            performing set operations on arrays.
+
+    Notes
+    -----
+
+    `isin` is an element-wise function version of the python keyword `in`.
+    ``isin(a, b)`` is roughly equivalent to
+    ``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
+
+    `element` and `test_elements` are converted to arrays if they are not
+    already. If `test_elements` is a set (or other non-sequence collection)
+    it will be converted to an object array with one element, rather than an
+    array of the values contained in `test_elements`. This is a consequence
+    of the `array` constructor's way of handling non-sequence collections.
+    Converting the set to a list usually gives the desired behavior.
+
+    .. versionadded:: 1.13.0
+
+    Examples
+    --------
+    >>> element = 2*np.arange(4).reshape((2, 2))
+    >>> element
+    array([[0, 2],
+           [4, 6]])
+    >>> test_elements = [1, 2, 4, 8]
+    >>> mask = np.isin(element, test_elements)
+    >>> mask
+    array([[ False,  True],
+           [ True, False]])
+    >>> element[mask]
+    array([2, 4])
+
+    The indices of the matched values can be obtained with `nonzero`:
+
+    >>> np.nonzero(mask)
+    (array([0, 1]), array([1, 0]))
+
+    The test can also be inverted:
+
+    >>> mask = np.isin(element, test_elements, invert=True)
+    >>> mask
+    array([[ True, False],
+           [ False,  True]])
+    >>> element[mask]
+    array([0, 6])
+
+    Because of how `array` handles sets, the following does not
+    work as expected:
+
+    >>> test_set = {1, 2, 4, 8}
+    >>> np.isin(element, test_set)
+    array([[ False, False],
+           [ False, False]])
+
+    Casting the set to a list gives the expected result:
+
+    >>> np.isin(element, list(test_set))
+    array([[ False,  True],
+           [ True, False]])
+    """
+    element = np.asarray(element)
+    return in1d(element, test_elements, assume_unique=assume_unique,
+                invert=invert).reshape(element.shape)
+
+
 def union1d(ar1, ar2):
     """
     Find the union of two arrays.
@@ -514,7 +674,7 @@ def union1d(ar1, ar2):
     >>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([1, 2, 3, 4, 6])
     """
-    return unique(np.concatenate((ar1, ar2)))
+    return unique(np.concatenate((ar1, ar2), axis=None))
 
 
 def setdiff1d(ar1, ar2, assume_unique=False):
     """
@@ -556,3 +716,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
     ar1 = unique(ar1)
     ar2 = unique(ar2)
     return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
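
Editor's note, not part of the commit above: a minimal doctest-style sketch of the row-wise behaviour described by the Notes section added to `unique` in this patch. It assumes `import numpy as np` and numpy >= 1.13 (when the `axis` keyword was introduced). Duplicate rows collapse into one, and the result comes back sorted lexicographically, first column first:

>>> import numpy as np
>>> a = np.array([[2, 1],
...               [1, 9],
...               [2, 1],
...               [1, 0]])
>>> np.unique(a, axis=0)
array([[1, 0],
       [1, 9],
       [2, 1]])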
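Editor's note, my own illustration rather than anything from the patch: the `contains_object` branch added to `in1d` routes object arrays to the comparison loop, which only uses element-wise `==`/`!=`, so membership tests keep working even when the elements cannot be ordered against one another (sorting the mixed array below raises TypeError on Python 3). The example values are hypothetical:

>>> ar1 = np.array([1, "a", None, 3.5], dtype=object)
>>> ar2 = np.array(["a", 3.5], dtype=object)
>>> np.in1d(ar1, ar2)
array([False,  True, False,  True])
>>> np.isin(ar1.reshape(2, 2), ar2)
array([[False,  True],
       [False,  True]])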
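Editor's note, also not from the patch: as I read the one-line change to `union1d`, concatenating with `axis=None` flattens both inputs before the unique values are taken, so inputs of different dimensionality can now be passed directly:

>>> a = np.array([[1, 2], [3, 4]])
>>> b = np.array([2, 5])
>>> np.union1d(a, b)
array([1, 2, 3, 4, 5])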