author    | Eric Wieser <wieser.eric@gmail.com> | 2018-07-31 00:41:28 -0700
committer | GitHub <noreply@github.com> | 2018-07-31 00:41:28 -0700
commit    | 7f4579279a6a6aa07df664b901afa36ab3fc5ce0 (patch)
tree      | 3524c05c661f4948eabf066b46b5ad3aaf6ad617 /numpy/lib/arraysetops.py
parent    | 24960daf3e326591047eb099af840da6e95d0910 (diff)
parent    | 9bb569c4e0e1cf08128179d157bdab10c8706a97 (diff)
download  | numpy-7f4579279a6a6aa07df664b901afa36ab3fc5ce0.tar.gz
Merge branch 'master' into ix_-preserve-type
Diffstat (limited to 'numpy/lib/arraysetops.py')
-rw-r--r-- | numpy/lib/arraysetops.py | 325
1 file changed, 243 insertions, 82 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index fae3e3cbc..5880ea154 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -1,9 +1,10 @@
 """
-Set operations for 1D numeric arrays based on sorting.
+Set operations for arrays based on sorting.
 
 :Contains:
-  ediff1d,
   unique,
+  isin,
+  ediff1d,
   intersect1d,
   setxor1d,
   in1d,
@@ -31,7 +32,7 @@ import numpy as np
 
 __all__ = [
     'ediff1d', 'intersect1d', 'setxor1d', 'union1d', 'setdiff1d', 'unique',
-    'in1d'
+    'in1d', 'isin'
     ]
 
 
@@ -109,16 +110,25 @@ def ediff1d(ary, to_end=None, to_begin=None):
     return result
 
 
+def _unpack_tuple(x):
+    """ Unpacks one-element tuples for use as return values """
+    if len(x) == 1:
+        return x[0]
+    else:
+        return x
+
+
 def unique(ar, return_index=False, return_inverse=False,
            return_counts=False, axis=None):
     """
     Find the unique elements of an array.
 
     Returns the sorted unique elements of an array. There are three optional
-    outputs in addition to the unique elements: the indices of the input array
-    that give the unique values, the indices of the unique array that
-    reconstruct the input array, and the number of times each unique value
-    comes up in the input array.
+    outputs in addition to the unique elements:
+
+    * the indices of the input array that give the unique values
+    * the indices of the unique array that reconstruct the input array
+    * the number of times each unique value comes up in the input array
 
     Parameters
     ----------
@@ -134,16 +144,18 @@ def unique(ar, return_index=False, return_inverse=False,
     return_counts : bool, optional
         If True, also return the number of times each unique item appears
        in `ar`.
+
         .. versionadded:: 1.9.0
-    axis : int or None, optional
-        The axis to operate on. If None, `ar` will be flattened beforehand.
-        Otherwise, duplicate items will be removed along the provided axis,
-        with all the other axes belonging to the each of the unique elements.
-        Object arrays or structured arrays that contain objects are not
-        supported if the `axis` kwarg is used.
-        .. versionadded:: 1.13.0
+    axis : int or None, optional
+        The axis to operate on. If None, `ar` will be flattened. If an integer,
+        the subarrays indexed by the given axis will be flattened and treated
+        as the elements of a 1-D array with the dimension of the given axis,
+        see the notes for more details. Object arrays or structured arrays
+        that contain objects are not supported if the `axis` kwarg is used. The
+        default is None.
+        .. versionadded:: 1.13.0
 
     Returns
     -------
@@ -158,6 +170,7 @@ def unique(ar, return_index=False, return_inverse=False,
     unique_counts : ndarray, optional
         The number of times each of the unique values comes up in the
        original array. Only provided if `return_counts` is True.
+
         .. versionadded:: 1.9.0
 
     See Also
@@ -165,6 +178,17 @@ def unique(ar, return_index=False, return_inverse=False,
     numpy.lib.arraysetops : Module with a number of other functions for
                             performing set operations on arrays.
 
+    Notes
+    -----
+    When an axis is specified the subarrays indexed by the axis are sorted.
+    This is done by making the specified axis the first dimension of the array
+    and then flattening the subarrays in C order. The flattened subarrays are
+    then viewed as a structured type with each element given a label, with the
+    effect that we end up with a 1-D array of structured types that can be
+    treated in the same way as any other 1-D array. The result is that the
+    flattened subarrays are sorted in lexicographic order starting with the
+    first element.
+
     Examples
     --------
     >>> np.unique([1, 1, 2, 2, 3, 3])
     array([1, 2, 3])
@@ -206,24 +230,21 @@ def unique(ar, return_index=False, return_inverse=False,
     """
     ar = np.asanyarray(ar)
     if axis is None:
-        return _unique1d(ar, return_index, return_inverse, return_counts)
-    if not (-ar.ndim <= axis < ar.ndim):
-        raise ValueError('Invalid axis kwarg specified for unique')
+        ret = _unique1d(ar, return_index, return_inverse, return_counts)
+        return _unpack_tuple(ret)
+
+    # axis was specified and not None
+    try:
+        ar = np.swapaxes(ar, axis, 0)
+    except np.AxisError:
+        # this removes the "axis1" or "axis2" prefix from the error message
+        raise np.AxisError(axis, ar.ndim)
 
-    ar = np.swapaxes(ar, axis, 0)
-    orig_shape, orig_dtype = ar.shape, ar.dtype
     # Must reshape to a contiguous 2D array for this to work...
+    orig_shape, orig_dtype = ar.shape, ar.dtype
     ar = ar.reshape(orig_shape[0], -1)
     ar = np.ascontiguousarray(ar)
-
-    if ar.dtype.char in (np.typecodes['AllInteger'] +
-                         np.typecodes['Datetime'] + 'S'):
-        # Optimization: Creating a view of your data with a np.void data type of
-        # size the number of bytes in a full row. Handles any type where items
-        # have a unique binary representation, i.e. 0 is only 0, not +0 and -0.
-        dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1]))
-    else:
-        dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
+    dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])]
 
     try:
         consolidated = ar.view(dtype)
@@ -240,11 +261,9 @@ def unique(ar, return_index=False, return_inverse=False,
 
     output = _unique1d(consolidated, return_index,
                        return_inverse, return_counts)
-    if not (return_index or return_inverse or return_counts):
-        return reshape_uniq(output)
-    else:
-        uniq = reshape_uniq(output[0])
-        return (uniq,) + output[1:]
+    output = (reshape_uniq(output[0]),) + output[1:]
+    return _unpack_tuple(output)
+
 
 def _unique1d(ar, return_index=False, return_inverse=False,
               return_counts=False):
@@ -254,20 +273,6 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     ar = np.asanyarray(ar).flatten()
 
     optional_indices = return_index or return_inverse
-    optional_returns = optional_indices or return_counts
-
-    if ar.size == 0:
-        if not optional_returns:
-            ret = ar
-        else:
-            ret = (ar,)
-            if return_index:
-                ret += (np.empty(0, np.bool),)
-            if return_inverse:
-                ret += (np.empty(0, np.bool),)
-            if return_counts:
-                ret += (np.empty(0, np.intp),)
-        return ret
 
     if optional_indices:
         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
@@ -275,25 +280,25 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     else:
         ar.sort()
         aux = ar
-    flag = np.concatenate(([True], aux[1:] != aux[:-1]))
-
-    if not optional_returns:
-        ret = aux[flag]
-    else:
-        ret = (aux[flag],)
-        if return_index:
-            ret += (perm[flag],)
-        if return_inverse:
-            iflag = np.cumsum(flag) - 1
-            inv_idx = np.empty(ar.shape, dtype=np.intp)
-            inv_idx[perm] = iflag
-            ret += (inv_idx,)
-        if return_counts:
-            idx = np.concatenate(np.nonzero(flag) + ([ar.size],))
-            ret += (np.diff(idx),)
+    mask = np.empty(aux.shape, dtype=np.bool_)
+    mask[:1] = True
+    mask[1:] = aux[1:] != aux[:-1]
+
+    ret = (aux[mask],)
+    if return_index:
+        ret += (perm[mask],)
+    if return_inverse:
+        imask = np.cumsum(mask) - 1
+        inv_idx = np.empty(mask.shape, dtype=np.intp)
+        inv_idx[perm] = imask
+        ret += (inv_idx,)
+    if return_counts:
+        idx = np.concatenate(np.nonzero(mask) + ([mask.size],))
+        ret += (np.diff(idx),)
     return ret
 
 
-def intersect1d(ar1, ar2, assume_unique=False):
+
+def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
     """
     Find the intersection of two arrays.
 
@@ -302,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False):
     Parameters
     ----------
     ar1, ar2 : array_like
-        Input arrays.
+        Input arrays. Will be flattened if not already 1D.
     assume_unique : bool
         If True, the input arrays are both assumed to be unique, which
         can speed up the calculation.  Default is False.
-
+    return_indices : bool
+        If True, the indices which correspond to the intersection of the
+        two arrays are returned. The first instance of a value is used
+        if there are multiple. Default is False.
+
+        .. versionadded:: 1.15.0
+
     Returns
     -------
     intersect1d : ndarray
         Sorted 1D array of common and unique elements.
+    comm1 : ndarray
+        The indices of the first occurrences of the common values in `ar1`.
+        Only provided if `return_indices` is True.
+    comm2 : ndarray
+        The indices of the first occurrences of the common values in `ar2`.
+        Only provided if `return_indices` is True.
+
 
     See Also
     --------
@@ -327,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False):
     >>> from functools import reduce
     >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([3])
+
+    To return the indices of the values common to the input arrays
+    along with the intersected values:
+    >>> x = np.array([1, 1, 2, 3, 4])
+    >>> y = np.array([2, 1, 4, 6])
+    >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
+    >>> x_ind, y_ind
+    (array([0, 2, 4]), array([1, 0, 2]))
+    >>> xy, x[x_ind], y[y_ind]
+    (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
+
     """
     if not assume_unique:
-        # Might be faster than unique( intersect1d( ar1, ar2 ) )?
-        ar1 = unique(ar1)
-        ar2 = unique(ar2)
+        if return_indices:
+            ar1, ind1 = unique(ar1, return_index=True)
+            ar2, ind2 = unique(ar2, return_index=True)
+        else:
+            ar1 = unique(ar1)
+            ar2 = unique(ar2)
+    else:
+        ar1 = ar1.ravel()
+        ar2 = ar2.ravel()
+
     aux = np.concatenate((ar1, ar2))
-    aux.sort()
-    return aux[:-1][aux[1:] == aux[:-1]]
+    if return_indices:
+        aux_sort_indices = np.argsort(aux, kind='mergesort')
+        aux = aux[aux_sort_indices]
+    else:
+        aux.sort()
+
+    mask = aux[1:] == aux[:-1]
+    int1d = aux[:-1][mask]
+
+    if return_indices:
+        ar1_indices = aux_sort_indices[:-1][mask]
+        ar2_indices = aux_sort_indices[1:][mask] - ar1.size
+        if not assume_unique:
+            ar1_indices = ind1[ar1_indices]
+            ar2_indices = ind2[ar2_indices]
+
+        return int1d, ar1_indices, ar2_indices
+    else:
+        return int1d
 
 def setxor1d(ar1, ar2, assume_unique=False):
     """
@@ -374,11 +427,9 @@ def setxor1d(ar1, ar2, assume_unique=False):
         return aux
 
     aux.sort()
-#    flag = ediff1d( aux, to_end = 1, to_begin = 1 ) == 0
     flag = np.concatenate(([True], aux[1:] != aux[:-1], [True]))
-#    flag2 = ediff1d( flag ) == 0
-    flag2 = flag[1:] == flag[:-1]
-    return aux[flag2]
+    return aux[flag[1:] & flag[:-1]]
+
 
 def in1d(ar1, ar2, assume_unique=False, invert=False):
     """
@@ -387,6 +438,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     Returns a boolean array the same length as `ar1` that is True
     where an element of `ar1` is in `ar2` and False otherwise.
 
+    We recommend using :func:`isin` instead of `in1d` for new code.
+
     Parameters
     ----------
     ar1 : (M,) array_like
@@ -411,6 +464,8 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
 
     See Also
     --------
+    isin                  : Version of this function that preserves the
+                            shape of ar1.
     numpy.lib.arraysetops : Module with a number of other functions for
                             performing set operations on arrays.
 
@@ -432,12 +487,12 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     >>> states = [0, 2]
     >>> mask = np.in1d(test, states)
     >>> mask
-    array([ True, False, True, False, True], dtype=bool)
+    array([ True, False, True, False, True])
     >>> test[mask]
     array([0, 2, 0])
     >>> mask = np.in1d(test, states, invert=True)
     >>> mask
-    array([False, True, False, True, False], dtype=bool)
+    array([False, True, False, True, False])
     >>> test[mask]
     array([1, 5])
     """
@@ -445,14 +500,20 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     ar1 = np.asarray(ar1).ravel()
     ar2 = np.asarray(ar2).ravel()
 
-    # This code is significantly faster when the condition is satisfied.
-    if len(ar2) < 10 * len(ar1) ** 0.145:
+    # Check if one of the arrays may contain arbitrary objects
+    contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
+
+    # This code is run when
+    # a) the first condition is true, making the code significantly faster
+    # b) the second condition is true (i.e. `ar1` or `ar2` may contain
+    #    arbitrary objects), since then sorting is not guaranteed to work
+    if len(ar2) < 10 * len(ar1) ** 0.145 or contains_object:
         if invert:
-            mask = np.ones(len(ar1), dtype=np.bool)
+            mask = np.ones(len(ar1), dtype=bool)
            for a in ar2:
                mask &= (ar1 != a)
        else:
-            mask = np.zeros(len(ar1), dtype=np.bool)
+            mask = np.zeros(len(ar1), dtype=bool)
            for a in ar2:
                mask |= (ar1 == a)
        return mask
@@ -481,6 +542,105 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
     else:
         return ret[rev_idx]
 
+
+def isin(element, test_elements, assume_unique=False, invert=False):
+    """
+    Calculates `element in test_elements`, broadcasting over `element` only.
+    Returns a boolean array of the same shape as `element` that is True
+    where an element of `element` is in `test_elements` and False otherwise.
+
+    Parameters
+    ----------
+    element : array_like
+        Input array.
+    test_elements : array_like
+        The values against which to test each value of `element`.
+        This argument is flattened if it is an array or array_like.
+        See notes for behavior with non-array-like parameters.
+    assume_unique : bool, optional
+        If True, the input arrays are both assumed to be unique, which
+        can speed up the calculation. Default is False.
+    invert : bool, optional
+        If True, the values in the returned array are inverted, as if
+        calculating `element not in test_elements`. Default is False.
+        ``np.isin(a, b, invert=True)`` is equivalent to (but faster
+        than) ``np.invert(np.isin(a, b))``.
+
+    Returns
+    -------
+    isin : ndarray, bool
+        Has the same shape as `element`. The values `element[isin]`
+        are in `test_elements`.
+
+    See Also
+    --------
+    in1d                  : Flattened version of this function.
+    numpy.lib.arraysetops : Module with a number of other functions for
+                            performing set operations on arrays.
+
+    Notes
+    -----
+
+    `isin` is an element-wise function version of the python keyword `in`.
+    ``isin(a, b)`` is roughly equivalent to
+    ``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.
+
+    `element` and `test_elements` are converted to arrays if they are not
+    already. If `test_elements` is a set (or other non-sequence collection)
+    it will be converted to an object array with one element, rather than an
+    array of the values contained in `test_elements`. This is a consequence
+    of the `array` constructor's way of handling non-sequence collections.
+    Converting the set to a list usually gives the desired behavior.
+
+    .. versionadded:: 1.13.0
+
+    Examples
+    --------
+    >>> element = 2*np.arange(4).reshape((2, 2))
+    >>> element
+    array([[0, 2],
+           [4, 6]])
+    >>> test_elements = [1, 2, 4, 8]
+    >>> mask = np.isin(element, test_elements)
+    >>> mask
+    array([[ False,  True],
+           [ True, False]])
+    >>> element[mask]
+    array([2, 4])
+
+    The indices of the matched values can be obtained with `nonzero`:
+
+    >>> np.nonzero(mask)
+    (array([0, 1]), array([1, 0]))
+
+    The test can also be inverted:
+
+    >>> mask = np.isin(element, test_elements, invert=True)
+    >>> mask
+    array([[ True, False],
+           [ False,  True]])
+    >>> element[mask]
+    array([0, 6])
+
+    Because of how `array` handles sets, the following does not
+    work as expected:
+
+    >>> test_set = {1, 2, 4, 8}
+    >>> np.isin(element, test_set)
+    array([[ False, False],
+           [ False, False]])
+
+    Casting the set to a list gives the expected result:
+
+    >>> np.isin(element, list(test_set))
+    array([[ False,  True],
+           [ True, False]])
+    """
+    element = np.asarray(element)
+    return in1d(element, test_elements, assume_unique=assume_unique,
+                invert=invert).reshape(element.shape)
+
+
 def union1d(ar1, ar2):
     """
     Find the union of two arrays.
@@ -514,7 +674,7 @@ def union1d(ar1, ar2):
     >>> reduce(np.union1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([1, 2, 3, 4, 6])
     """
-    return unique(np.concatenate((ar1, ar2)))
+    return unique(np.concatenate((ar1, ar2), axis=None))
 
 
 def setdiff1d(ar1, ar2, assume_unique=False):
     """
@@ -556,3 +716,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
     ar1 = unique(ar1)
     ar2 = unique(ar2)
     return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
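
Editor's note, not part of the commit above: a minimal doctest-style sketch of the row-wise behaviour described by the Notes section added to `unique` in this patch. It assumes `import numpy as np` and numpy >= 1.13 (when the `axis` keyword was introduced). Duplicate rows collapse into one, and the result comes back sorted lexicographically, first column first:

>>> import numpy as np
>>> a = np.array([[2, 1],
...               [1, 9],
...               [2, 1],
...               [1, 0]])
>>> np.unique(a, axis=0)
array([[1, 0],
       [1, 9],
       [2, 1]])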
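Editor's note, my own illustration rather than anything from the patch: the `contains_object` branch added to `in1d` routes object arrays to the comparison loop, which only uses element-wise `==`/`!=`, so membership tests keep working even when the elements cannot be ordered against one another (sorting the mixed array below raises TypeError on Python 3). The example values are hypothetical:

>>> ar1 = np.array([1, "a", None, 3.5], dtype=object)
>>> ar2 = np.array(["a", 3.5], dtype=object)
>>> np.in1d(ar1, ar2)
array([False,  True, False,  True])
>>> np.isin(ar1.reshape(2, 2), ar2)
array([[False,  True],
       [False,  True]])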
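Editor's note, also not from the patch: as I read the one-line change to `union1d`, concatenating with `axis=None` flattens both inputs before the unique values are taken, so inputs of different dimensionality can now be passed directly:

>>> a = np.array([[1, 2], [3, 4]])
>>> b = np.array([2, 5])
>>> np.union1d(a, b)
array([1, 2, 3, 4, 5])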