summaryrefslogtreecommitdiff
path: root/numpy/lib/arraysetops.py
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/lib/arraysetops.py')
-rw-r--r--numpy/lib/arraysetops.py325
1 files changed, 243 insertions, 82 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index fae3e3cbc..5880ea154 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -1,9 +1,10 @@
"""
-Set operations for 1D numeric arrays based on sorting.
+Set operations for arrays based on sorting.
:Contains:
- ediff1d,
unique,
+ isin,
+ ediff1d,
intersect1d,
setxor1d,
in1d,
@@ -31,7 +32,7 @@ import numpy as np
__all__ = [
'ediff1d', 'intersect1d', 'setxor1d', 'union1d', 'setdiff1d', 'unique',
- 'in1d'
+ 'in1d', 'isin'
]
@@ -109,16 +110,25 @@ def ediff1d(ary, to_end=None, to_begin=None):
return result
+def _unpack_tuple(x):
+ """ Unpacks one-element tuples for use as return values """
+ if len(x) == 1:
+ return x[0]
+ else:
+ return x
+
+
def unique(ar, return_index=False, return_inverse=False,
return_counts=False, axis=None):
"""
Find the unique elements of an array.
Returns the sorted unique elements of an array. There are three optional
- outputs in addition to the unique elements: the indices of the input array
- that give the unique values, the indices of the unique array that
- reconstruct the input array, and the number of times each unique value
- comes up in the input array.
+ outputs in addition to the unique elements:
+
+ * the indices of the input array that give the unique values
+ * the indices of the unique array that reconstruct the input array
+ * the number of times each unique value comes up in the input array
Parameters
----------
@@ -134,16 +144,18 @@ def unique(ar, return_index=False, return_inverse=False,
return_counts : bool, optional
If True, also return the number of times each unique item appears
in `ar`.
+
.. versionadded:: 1.9.0
- axis : int or None, optional
- The axis to operate on. If None, `ar` will be flattened beforehand.
- Otherwise, duplicate items will be removed along the provided axis,
- with all the other axes belonging to the each of the unique elements.
- Object arrays or structured arrays that contain objects are not
- supported if the `axis` kwarg is used.
- .. versionadded:: 1.13.0
+ axis : int or None, optional
+ The axis to operate on. If None, `ar` will be flattened. If an integer,
+ the subarrays indexed by the given axis will be flattened and treated
+ as the elements of a 1-D array with the dimension of the given axis,
+ see the notes for more details. Object arrays or structured arrays
+ that contain objects are not supported if the `axis` kwarg is used. The
+ default is None.
+ .. versionadded:: 1.13.0
Returns
-------
@@ -158,6 +170,7 @@ def unique(ar, return_index=False, return_inverse=False,
unique_counts : ndarray, optional
The number of times each of the unique values comes up in the
original array. Only provided if `return_counts` is True.
+
.. versionadded:: 1.9.0
See Also
@@ -165,6 +178,17 @@ def unique(ar, return_index=False, return_inverse=False,
numpy.lib.arraysetops : Module with a number of other functions for
performing set operations on arrays.
+ Notes
+ -----
+ When an axis is specified the subarrays indexed by the axis are sorted.
+ This is done by making the specified axis the first dimension of the array
+ and then flattening the subarrays in C order. The flattened subarrays are
+ then viewed as a structured type with each element given a label, with the
+ effect that we end up with a 1-D array of structured types that can be
+ treated in the same way as any other 1-D array. The result is that the
+ flattened subarrays are sorted in lexicographic order starting with the
+ first element.
+
Examples
--------
>>> np.unique([1, 1, 2, 2, 3, 3])
@@ -206,24 +230,21 @@ def unique(ar, return_index=False, return_inverse=False,
"""
ar = np.asanyarray(ar)
if axis is None:
- return _unique1d(ar, return_index, return_inverse, return_counts)
- if not (-ar.ndim <= axis < ar.ndim):
- raise ValueError('Invalid axis kwarg specified for unique')
+ ret = _unique1d(ar, return_index, return_inverse, return_counts)
+ return _unpack_tuple(ret)
+
+ # axis was specified and not None
+ try:
+ ar = np.swapaxes(ar, axis, 0)
+ except np.AxisError:
+ # this removes the "axis1" or "axis2" prefix from the error message
+ raise np.AxisError(axis, ar.ndim)
- ar = np.swapaxes(ar, axis, 0)
- orig_shape, orig_dtype = ar.shape, ar.dtype
# Must reshape to a contiguous 2D array for this to work...
+ orig_shape, orig_dtype = ar.shape, ar.dtype
ar = ar.reshape(orig_shape[0], -1)
ar = np.ascontiguousarray(ar)
-
- if ar.dtype.char in (np.typecodes['AllInteger'] +
- np.typecodes['Datetime'] + 'S'):
- # Optimization: Creating a view of your data with a np.void data type of
- # size the number of bytes in a full row. Handles any type where items
- # have a unique binary representation, i.e. 0 is only 0, not +0 and -0.
- dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1]))
- else:
- dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
+ dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
try:
consolidated = ar.view(dtype)
@@ -240,11 +261,9 @@ def unique(ar, return_index=False, return_inverse=False,
output = _unique1d(consolidated, return_index,
return_inverse, return_counts)
- if not (return_index or return_inverse or return_counts):
- return reshape_uniq(output)
- else:
- uniq = reshape_uniq(output[0])
- return (uniq,) + output[1:]
+ output = (reshape_uniq(output[0]),) + output[1:]
+ return _unpack_tuple(output)
+
def _unique1d(ar, return_index=False, return_inverse=False,
return_counts=False):
@@ -254,20 +273,6 @@ def _unique1d(ar, return_index=False, return_inverse=False,
ar = np.asanyarray(ar).flatten()
optional_indices = return_index or return_inverse
- optional_returns = optional_indices or return_counts
-
- if ar.size == 0:
- if not optional_returns:
- ret = ar
- else:
- ret = (ar,)
- if return_index:
- ret += (np.empty(0, np.bool),)
- if return_inverse:
- ret += (np.empty(0, np.bool),)
- if return_counts:
- ret += (np.empty(0, np.intp),)
- return ret
if optional_indices:
perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
@@ -275,25 +280,25 @@ def _unique1d(ar, return_index=False, return_inverse=False,
else:
ar.sort()
aux = ar
- flag = np.concatenate(([True], aux[1:] != aux[:-1]))
-
- if not optional_returns:
- ret = aux[flag]
- else:
- ret = (aux[flag],)
- if return_index:
- ret += (perm[flag],)
- if return_inverse:
- iflag = np.cumsum(flag) - 1
- inv_idx = np.empty(ar.shape, dtype=np.intp)
- inv_idx[perm] = iflag
- ret += (inv_idx,)
- if return_counts:
- idx = np.concatenate(np.nonzero(flag) + ([ar.size],))
- ret += (np.diff(idx),)
+ mask = np.empty(aux.shape, dtype=np.bool_)
+ mask[:1] = True
+ mask[1:] = aux[1:] != aux[:-1]
+
+ ret = (aux[mask],)
+ if return_index:
+ ret += (perm[mask],)
+ if return_inverse:
+ imask = np.cumsum(mask) - 1
+ inv_idx = np.empty(mask.shape, dtype=np.intp)
+ inv_idx[perm] = imask
+ ret += (inv_idx,)
+ if return_counts:
+ idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
+ ret += (np.diff(idx),)
return ret
-def intersect1d(ar1, ar2, assume_unique=False):
+
+def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
"""
Find the intersection of two arrays.
@@ -302,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False):
Parameters
----------
ar1, ar2 : array_like
- Input arrays.
+ Input arrays. Will be flattened if not already 1D.
assume_unique : bool
If True, the input arrays are both assumed to be unique, which
can speed up the calculation. Default is False.
-
+ return_indices : bool
+ If True, the indices which correspond to the intersection of the
+ two arrays are returned. The first instance of a value is used
+ if there are multiple. Default is False.
+
+ .. versionadded:: 1.15.0
+
Returns
-------
intersect1d : ndarray
Sorted 1D array of common and unique elements.
+ comm1 : ndarray
+ The indices of the first occurrences of the common values in `ar1`.
+ Only provided if `return_indices` is True.
+ comm2 : ndarray
+ The indices of the first occurrences of the common values in `ar2`.
+ Only provided if `return_indices` is True.
+
See Also
--------
@@ -327,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False):
>>> from functools import reduce
>>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
array([3])
+
+ To return the indices of the values common to the input arrays
+ along with the intersected values:
+ >>> x = np.array([1, 1, 2, 3, 4])
+ >>> y = np.array([2, 1, 4, 6])
+ >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
+ >>> x_ind, y_ind
+ (array([0, 2, 4]), array([1, 0, 2]))
+ >>> xy, x[x_ind], y[y_ind]
+ (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
+
"""
if not assume_unique:
- # Might be faster than unique( intersect1d( ar1, ar2 ) )?
- ar1 = unique(ar1)
- ar2 = unique(ar2)
+ if return_indices:
+ ar1, ind1 = unique(ar1, return_index=True)
+ ar2, ind2 = unique(ar2, return_index=True)
+ else:
+ ar1 = unique(ar1)
+ ar2 = unique(ar2)
+ else:
+ ar1 = ar1.ravel()
+ ar2 = ar2.ravel()
+
aux = np.concatenate((ar1, ar2))
- aux.sort()
- return aux[:-1][aux[1:] == aux[:-1]]
+ if return_indices:
+ aux_sort_indices = np.argsort(aux, kind='mergesort')
+ aux = aux[aux_sort_indices]
+ else:
+ aux.sort()
+
+ mask = aux[1:] == aux[:-1]
+ int1d = aux[:-1][mask]
+
+ if return_indices:
+ ar1_indices = aux_sort_indices[:-1][mask]
+ ar2_indices = aux_sort_indices[1:][mask] - ar1.size
+ if not assume_unique:
+ ar1_indices = ind1[ar1_indices]
+ ar2_indices = ind2[ar2_indices]
+
+ return int1d, ar1_indices, ar2_indices
+ else:
+ return int1d
def setxor1d(ar1, ar2, assume_unique=False):
"""
@@ -374,11 +427,9 @@ def setxor1d(ar1, ar2, assume_unique=False):
return aux
aux.sort()
-# flag = ediff1d( aux, to_end = 1, to_begin = 1 ) == 0
flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
-# flag2 = ediff1d( flag ) == 0
- flag2 = flag[1:] == flag[:-1]
- return aux[flag2]
+ return aux[flag[1:] & flag[:-1]]
+
def in1d(ar1, ar2, assume_unique=False, invert=False):
"""
@@ -387,6 +438,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
Returns a boolean array the same length as `ar1` that is True
where an element of `ar1` is in `ar2` and False otherwise.
+ We recommend using :func:`isin` instead of `in1d` for new code.
+
Parameters
----------
ar1 : (M,) array_like
@@ -411,6 +464,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
See Also
--------
+ isin : Version of this function that preserves the
+ shape of ar1.
numpy.lib.arraysetops : Module with a number of other functions for
performing set operations on arrays.
@@ -432,12 +487,12 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
>>> states = [0, 2]
>>> mask = np.in1d(test, states)
>>> mask
- array([ True, False, True, False, True], dtype=bool)
+ array([ True, False, True, False, True])
>>> test[mask]
array([0, 2, 0])
>>> mask = np.in1d(test, states, invert=True)
>>> mask
- array([False, True, False, True, False], dtype=bool)
+ array([False, True, False, True, False])
>>> test[mask]
array([1, 5])
"""
@@ -445,14 +500,20 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
ar1 = np.asarray(ar1).ravel()
ar2 = np.asarray(ar2).ravel()
- # This code is significantly faster when the condition is satisfied.
- if len(ar2) < 10 * len(ar1) ** 0.145:
+ # Check if one of the arrays may contain arbitrary objects
+ contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
+
+ # This code is run when
+ # a) the first condition is true, making the code significantly faster
+ # b) the second condition is true (i.e. `ar1` or `ar2` may contain
+ # arbitrary objects), since then sorting is not guaranteed to work
+ if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
if invert:
- mask = np.ones(len(ar1), dtype=np.bool)
+ mask = np.ones(len(ar1), dtype=bool)
for a in ar2:
mask &= (ar1 != a)
else:
- mask = np.zeros(len(ar1), dtype=np.bool)
+ mask = np.zeros(len(ar1), dtype=bool)
for a in ar2:
mask |= (ar1 == a)
return mask
@@ -481,6 +542,105 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
else:
return ret[rev_idx]
+
+def isin(element, test_elements, assume_unique=False, invert=False):
+ """
+ Calculates `element in test_elements`, broadcasting over `element` only.
+ Returns a boolean array of the same shape as `element` that is True
+ where an element of `element` is in `test_elements` and False otherwise.
+
+ Parameters
+ ----------
+ element : array_like
+ Input array.
+ test_elements : array_like
+ The values against which to test each value of `element`.
+ This argument is flattened if it is an array or array_like.
+ See notes for behavior with non-array-like parameters.
+ assume_unique : bool, optional
+ If True, the input arrays are both assumed to be unique, which
+ can speed up the calculation. Default is False.
+ invert : bool, optional
+ If True, the values in the returned array are inverted, as if
+ calculating `element not in test_elements`. Default is False.
+ ``np.isin(a, b, invert=True)`` is equivalent to (but faster
+ than) ``np.invert(np.isin(a, b))``.
+
+ Returns
+ -------
+ isin : ndarray, bool
+ Has the same shape as `element`. The values `element[isin]`
+ are in `test_elements`.
+
+ See Also
+ --------
+ in1d : Flattened version of this function.
+ numpy.lib.arraysetops : Module with a number of other functions for
+ performing set operations on arrays.
+
+ Notes
+ -----
+
+ `isin` is an element-wise function version of the python keyword `in`.
+ ``isin(a, b)`` is roughly equivalent to
+ ``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
+
+ `element` and `test_elements` are converted to arrays if they are not
+ already. If `test_elements` is a set (or other non-sequence collection)
+ it will be converted to an object array with one element, rather than an
+ array of the values contained in `test_elements`. This is a consequence
+ of the `array` constructor's way of handling non-sequence collections.
+ Converting the set to a list usually gives the desired behavior.
+
+ .. versionadded:: 1.13.0
+
+ Examples
+ --------
+ >>> element = 2*np.arange(4).reshape((2, 2))
+ >>> element
+ array([[0, 2],
+ [4, 6]])
+ >>> test_elements = [1, 2, 4, 8]
+ >>> mask = np.isin(element, test_elements)
+ >>> mask
+ array([[ False, True],
+ [ True, False]])
+ >>> element[mask]
+ array([2, 4])
+
+ The indices of the matched values can be obtained with `nonzero`:
+
+ >>> np.nonzero(mask)
+ (array([0, 1]), array([1, 0]))
+
+ The test can also be inverted:
+
+ >>> mask = np.isin(element, test_elements, invert=True)
+ >>> mask
+ array([[ True, False],
+ [ False, True]])
+ >>> element[mask]
+ array([0, 6])
+
+ Because of how `array` handles sets, the following does not
+ work as expected:
+
+ >>> test_set = {1, 2, 4, 8}
+ >>> np.isin(element, test_set)
+ array([[ False, False],
+ [ False, False]])
+
+ Casting the set to a list gives the expected result:
+
+ >>> np.isin(element, list(test_set))
+ array([[ False, True],
+ [ True, False]])
+ """
+ element = np.asarray(element)
+ return in1d(element, test_elements, assume_unique=assume_unique,
+ invert=invert).reshape(element.shape)
+
+
def union1d(ar1, ar2):
"""
Find the union of two arrays.
@@ -514,7 +674,7 @@ def union1d(ar1, ar2):
>>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
array([1, 2, 3, 4, 6])
"""
- return unique(np.concatenate((ar1, ar2)))
+ return unique(np.concatenate((ar1, ar2), axis=None))
def setdiff1d(ar1, ar2, assume_unique=False):
"""
@@ -556,3 +716,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
ar1 = unique(ar1)
ar2 = unique(ar2)
return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+