summaryrefslogtreecommitdiff
path: root/numpy/lib
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2022-06-23 07:15:27 -0700
committerGitHub <noreply@github.com>2022-06-23 07:15:27 -0700
commit91e753c1c52ad4cf9849d8a99aaaed2767ed9dec (patch)
tree4106263405065ed9b69729f5033150e159b5c313 /numpy/lib
parentb7216ca844dc8446219140859ee0b235b1814a16 (diff)
parent1d3bdd15f6c12874e6d659a87aed21d58ebd272a (diff)
downloadnumpy-91e753c1c52ad4cf9849d8a99aaaed2767ed9dec.tar.gz
Merge pull request #12065 from MilesCranmer/master
MAINT: Optimize np.isin and np.in1d for integer arrays
Diffstat (limited to 'numpy/lib')
-rw-r--r--numpy/lib/arraysetops.py138
-rw-r--r--numpy/lib/tests/test_arraysetops.py137
2 files changed, 244 insertions, 31 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index d42ab2675..9a44d7e44 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -516,12 +516,13 @@ def setxor1d(ar1, ar2, assume_unique=False):
return aux[flag[1:] & flag[:-1]]
-def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None):
+def _in1d_dispatcher(ar1, ar2, assume_unique=None, invert=None, *,
+ kind=None):
return (ar1, ar2)
@array_function_dispatch(_in1d_dispatcher)
-def in1d(ar1, ar2, assume_unique=False, invert=False):
+def in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None):
"""
Test whether each element of a 1-D array is also present in a second array.
@@ -544,6 +545,26 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
False where an element of `ar1` is in `ar2` and True otherwise).
Default is False. ``np.in1d(a, b, invert=True)`` is equivalent
to (but is faster than) ``np.invert(in1d(a, b))``.
+ kind : {None, 'sort', 'table'}, optional
+ The algorithm to use. This will not affect the final result,
+ but will affect the speed and memory use. The default, None,
+ will select automatically based on memory considerations.
+
+ * If 'sort', will use a mergesort-based approach. This will have
+ a memory usage of roughly 6 times the sum of the sizes of
+ `ar1` and `ar2`, not accounting for size of dtypes.
+ * If 'table', will use a lookup table approach similar
+ to a counting sort. This is only available for boolean and
+ integer arrays. This will have a memory usage of the
+ size of `ar1` plus the max-min value of `ar2`. `assume_unique`
+ has no effect when the 'table' option is used.
+ * If None, will automatically choose 'table' if
+ the required memory allocation is less than or equal to
+ 6 times the sum of the sizes of `ar1` and `ar2`,
+ otherwise will use 'sort'. This is done to not use
+ a large amount of memory by default, even though
+ 'table' may be faster in most cases. If 'table' is chosen,
+ `assume_unique` will have no effect.
.. versionadded:: 1.8.0
@@ -569,6 +590,13 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
``asarray(ar2)`` is an object array rather than the expected array of
contained values.
+ Using ``kind='table'`` tends to be faster than `kind='sort'` if the
+ following relationship is true:
+ ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,
+ but may use greater memory. The default value for `kind` will
+ be automatically selected based only on memory usage, so one may
+ manually set ``kind='table'`` if memory constraints can be relaxed.
+
.. versionadded:: 1.4.0
Examples
@@ -593,6 +621,76 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
# Ensure that iteration through object arrays yields size-1 arrays
if ar2.dtype == object:
ar2 = ar2.reshape(-1, 1)
+ # Convert booleans to uint8 so we can use the fast integer algorithm
+ if ar1.dtype == bool:
+ ar1 = ar1 + np.uint8(0)
+ if ar2.dtype == bool:
+ ar2 = ar2 + np.uint8(0)
+
+ # Check if we can use a fast integer algorithm:
+ integer_arrays = (np.issubdtype(ar1.dtype, np.integer) and
+ np.issubdtype(ar2.dtype, np.integer))
+
+ if kind not in {None, 'sort', 'table'}:
+ raise ValueError(
+ f"Invalid kind: '{kind}'. Please use None, 'sort' or 'table'.")
+
+ if integer_arrays and kind in {None, 'table'}:
+ ar2_min = np.min(ar2)
+ ar2_max = np.max(ar2)
+
+ ar2_range = int(ar2_max) - int(ar2_min)
+
+ # Constraints on whether we can actually use the table method:
+ range_safe_from_overflow = ar2_range < np.iinfo(ar2.dtype).max
+ below_memory_constraint = ar2_range <= 6 * (ar1.size + ar2.size)
+
+ # Optimal performance is for approximately
+ # log10(size) > (log10(range) - 2.27) / 0.927.
+ # However, here we set the requirement that by default
+ # the intermediate array can only be 6x
+ # the combined memory allocation of the original
+ # arrays. See discussion on
+ # https://github.com/numpy/numpy/pull/12065.
+
+ if (
+ range_safe_from_overflow and
+ (below_memory_constraint or kind == 'table')
+ ):
+
+ if invert:
+ outgoing_array = np.ones_like(ar1, dtype=bool)
+ else:
+ outgoing_array = np.zeros_like(ar1, dtype=bool)
+
+ # Make elements 1 where the integer exists in ar2
+ if invert:
+ isin_helper_ar = np.ones(ar2_range + 1, dtype=bool)
+ isin_helper_ar[ar2 - ar2_min] = 0
+ else:
+ isin_helper_ar = np.zeros(ar2_range + 1, dtype=bool)
+ isin_helper_ar[ar2 - ar2_min] = 1
+
+ # Mask out elements we know won't work
+ basic_mask = (ar1 <= ar2_max) & (ar1 >= ar2_min)
+ outgoing_array[basic_mask] = isin_helper_ar[ar1[basic_mask] -
+ ar2_min]
+
+ return outgoing_array
+ elif kind == 'table': # not range_safe_from_overflow
+ raise RuntimeError(
+ "You have specified kind='table', "
+ "but the range of values in `ar2` exceeds the "
+ "maximum integer of the datatype. "
+ "Please set `kind` to None or 'sort'."
+ )
+ elif kind == 'table':
+ raise ValueError(
+ "The 'table' method is only "
+ "supported for boolean or integer arrays. "
+ "Please select 'sort' or None for kind."
+ )
+
# Check if one of the arrays may contain arbitrary objects
contains_object = ar1.dtype.hasobject or ar2.dtype.hasobject
@@ -637,12 +735,14 @@ def in1d(ar1, ar2, assume_unique=False, invert=False):
return ret[rev_idx]
-def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None):
+def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None,
+ *, kind=None):
return (element, test_elements)
@array_function_dispatch(_isin_dispatcher)
-def isin(element, test_elements, assume_unique=False, invert=False):
+def isin(element, test_elements, assume_unique=False, invert=False, *,
+ kind=None):
"""
Calculates ``element in test_elements``, broadcasting over `element` only.
Returns a boolean array of the same shape as `element` that is True
@@ -664,6 +764,27 @@ def isin(element, test_elements, assume_unique=False, invert=False):
calculating `element not in test_elements`. Default is False.
``np.isin(a, b, invert=True)`` is equivalent to (but faster
than) ``np.invert(np.isin(a, b))``.
+ kind : {None, 'sort', 'table'}, optional
+ The algorithm to use. This will not affect the final result,
+ but will affect the speed and memory use. The default, None,
+ will select automatically based on memory considerations.
+
+ * If 'sort', will use a mergesort-based approach. This will have
+ a memory usage of roughly 6 times the sum of the sizes of
+ `ar1` and `ar2`, not accounting for size of dtypes.
+ * If 'table', will use a lookup table approach similar
+ to a counting sort. This is only available for boolean and
+ integer arrays. This will have a memory usage of the
+ size of `ar1` plus the max-min value of `ar2`. `assume_unique`
+ has no effect when the 'table' option is used.
+ * If None, will automatically choose 'table' if
+ the required memory allocation is less than or equal to
+ 6 times the sum of the sizes of `ar1` and `ar2`,
+ otherwise will use 'sort'. This is done to not use
+ a large amount of memory by default, even though
+ 'table' may be faster in most cases. If 'table' is chosen,
+ `assume_unique` will have no effect.
+
Returns
-------
@@ -691,6 +812,13 @@ def isin(element, test_elements, assume_unique=False, invert=False):
of the `array` constructor's way of handling non-sequence collections.
Converting the set to a list usually gives the desired behavior.
+ Using ``kind='table'`` tends to be faster than `kind='sort'` if the
+ following relationship is true:
+ ``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,
+ but may use greater memory. The default value for `kind` will
+ be automatically selected based only on memory usage, so one may
+ manually set ``kind='table'`` if memory constraints can be relaxed.
+
.. versionadded:: 1.13.0
Examples
@@ -737,7 +865,7 @@ def isin(element, test_elements, assume_unique=False, invert=False):
"""
element = np.asarray(element)
return in1d(element, test_elements, assume_unique=assume_unique,
- invert=invert).reshape(element.shape)
+ invert=invert, kind=kind).reshape(element.shape)
def _union1d_dispatcher(ar1, ar2):
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index e64634b69..d91d36282 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -195,7 +195,8 @@ class TestSetOps:
assert_equal(actual, expected)
assert actual.dtype == expected.dtype
- def test_isin(self):
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_isin(self, kind):
# the tests for in1d cover most of isin's behavior
# if in1d is removed, would need to change those tests to test
# isin instead.
@@ -205,7 +206,7 @@ class TestSetOps:
isin_slow = np.vectorize(_isin_slow, otypes=[bool], excluded={1})
def assert_isin_equal(a, b):
- x = isin(a, b)
+ x = isin(a, b, kind=kind)
y = isin_slow(a, b)
assert_array_equal(x, y)
@@ -231,12 +232,14 @@ class TestSetOps:
assert_isin_equal(5, 6)
# empty array-like:
- x = []
- assert_isin_equal(x, b)
- assert_isin_equal(a, x)
- assert_isin_equal(x, x)
-
- def test_in1d(self):
+ if kind in {None, "sort"}:
+ x = []
+ assert_isin_equal(x, b)
+ assert_isin_equal(a, x)
+ assert_isin_equal(x, x)
+
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_in1d(self, kind):
# we use two different sizes for the b array here to test the
# two different paths in in1d().
for mult in (1, 10):
@@ -244,57 +247,58 @@ class TestSetOps:
a = [5, 7, 1, 2]
b = [2, 4, 3, 1, 5] * mult
ec = np.array([True, False, True, True])
- c = in1d(a, b, assume_unique=True)
+ c = in1d(a, b, assume_unique=True, kind=kind)
assert_array_equal(c, ec)
a[0] = 8
ec = np.array([False, False, True, True])
- c = in1d(a, b, assume_unique=True)
+ c = in1d(a, b, assume_unique=True, kind=kind)
assert_array_equal(c, ec)
a[0], a[3] = 4, 8
ec = np.array([True, False, True, False])
- c = in1d(a, b, assume_unique=True)
+ c = in1d(a, b, assume_unique=True, kind=kind)
assert_array_equal(c, ec)
a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5])
b = [2, 3, 4] * mult
ec = [False, True, False, True, True, True, True, True, True,
False, True, False, False, False]
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
b = b + [5, 5, 4] * mult
ec = [True, True, True, True, True, True, True, True, True, True,
True, False, True, True]
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
a = np.array([5, 7, 1, 2])
b = np.array([2, 4, 3, 1, 5] * mult)
ec = np.array([True, False, True, True])
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
a = np.array([5, 7, 1, 1, 2])
b = np.array([2, 4, 3, 3, 1, 5] * mult)
ec = np.array([True, False, True, True, True])
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
a = np.array([5, 5])
b = np.array([2, 2] * mult)
ec = np.array([False, False])
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
a = np.array([5])
b = np.array([2])
ec = np.array([False])
- c = in1d(a, b)
+ c = in1d(a, b, kind=kind)
assert_array_equal(c, ec)
- assert_array_equal(in1d([], []), [])
+ if kind in {None, "sort"}:
+ assert_array_equal(in1d([], [], kind=kind), [])
def test_in1d_char_array(self):
a = np.array(['a', 'b', 'c', 'd', 'e', 'c', 'e', 'b'])
@@ -305,16 +309,29 @@ class TestSetOps:
assert_array_equal(c, ec)
- def test_in1d_invert(self):
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_in1d_invert(self, kind):
"Test in1d's invert parameter"
# We use two different sizes for the b array here to test the
# two different paths in in1d().
for mult in (1, 10):
a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5])
b = [2, 3, 4] * mult
- assert_array_equal(np.invert(in1d(a, b)), in1d(a, b, invert=True))
-
- def test_in1d_ravel(self):
+ assert_array_equal(np.invert(in1d(a, b, kind=kind)),
+ in1d(a, b, invert=True, kind=kind))
+
+ # float:
+ if kind in {None, "sort"}:
+ for mult in (1, 10):
+ a = np.array([5, 4, 5, 3, 4, 4, 3, 4, 3, 5, 2, 1, 5, 5],
+ dtype=np.float32)
+ b = [2, 3, 4] * mult
+ b = np.array(b, dtype=np.float32)
+ assert_array_equal(np.invert(in1d(a, b, kind=kind)),
+ in1d(a, b, invert=True, kind=kind))
+
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_in1d_ravel(self, kind):
# Test that in1d ravels its input arrays. This is not documented
# behavior however. The test is to ensure consistentency.
a = np.arange(6).reshape(2, 3)
@@ -322,10 +339,44 @@ class TestSetOps:
long_b = np.arange(3, 63).reshape(30, 2)
ec = np.array([False, False, False, True, True, True])
- assert_array_equal(in1d(a, b, assume_unique=True), ec)
- assert_array_equal(in1d(a, b, assume_unique=False), ec)
- assert_array_equal(in1d(a, long_b, assume_unique=True), ec)
- assert_array_equal(in1d(a, long_b, assume_unique=False), ec)
+ assert_array_equal(in1d(a, b, assume_unique=True, kind=kind),
+ ec)
+ assert_array_equal(in1d(a, b, assume_unique=False,
+ kind=kind),
+ ec)
+ assert_array_equal(in1d(a, long_b, assume_unique=True,
+ kind=kind),
+ ec)
+ assert_array_equal(in1d(a, long_b, assume_unique=False,
+ kind=kind),
+ ec)
+
+ def test_in1d_hit_alternate_algorithm(self):
+ """Hit the standard isin code with integers"""
+ # Need extreme range to hit standard code
+ # This hits it without the use of kind='table'
+ a = np.array([5, 4, 5, 3, 4, 4, 1e9], dtype=np.int64)
+ b = np.array([2, 3, 4, 1e9], dtype=np.int64)
+ expected = np.array([0, 1, 0, 1, 1, 1, 1], dtype=bool)
+ assert_array_equal(expected, in1d(a, b))
+ assert_array_equal(np.invert(expected), in1d(a, b, invert=True))
+
+ a = np.array([5, 7, 1, 2], dtype=np.int64)
+ b = np.array([2, 4, 3, 1, 5, 1e9], dtype=np.int64)
+ ec = np.array([True, False, True, True])
+ c = in1d(a, b, assume_unique=True)
+ assert_array_equal(c, ec)
+
+ @pytest.mark.parametrize("kind", [None, "sort", "table"])
+ def test_in1d_boolean(self, kind):
+ """Test that in1d works for boolean input"""
+ a = np.array([True, False])
+ b = np.array([False, False, False])
+ expected = np.array([False, True])
+ assert_array_equal(expected,
+ in1d(a, b, kind=kind))
+ assert_array_equal(np.invert(expected),
+ in1d(a, b, invert=True, kind=kind))
def test_in1d_first_array_is_object(self):
ar1 = [None]
@@ -391,6 +442,40 @@ class TestSetOps:
result = np.in1d(ar1, ar2, invert=True)
assert_array_equal(result, np.invert(expected))
+ def test_in1d_errors(self):
+ """Test that in1d raises expected errors."""
+
+ # Error 1: `kind` is not one of 'sort' 'table' or None.
+ ar1 = np.array([1, 2, 3, 4, 5])
+ ar2 = np.array([2, 4, 6, 8, 10])
+ assert_raises(ValueError, in1d, ar1, ar2, kind='quicksort')
+
+ # Error 2: `kind="table"` does not work for non-integral arrays.
+ obj_ar1 = np.array([1, 'a', 3, 'b', 5], dtype=object)
+ obj_ar2 = np.array([1, 'a', 3, 'b', 5], dtype=object)
+ assert_raises(ValueError, in1d, obj_ar1, obj_ar2, kind='table')
+
+ for dtype in [np.int32, np.int64]:
+ ar1 = np.array([-1, 2, 3, 4, 5], dtype=dtype)
+ # The range of this array will overflow:
+ overflow_ar2 = np.array([-1, np.iinfo(dtype).max], dtype=dtype)
+
+ # Error 3: `kind="table"` will trigger a runtime error
+ # if there is an integer overflow expected when computing the
+ # range of ar2
+ assert_raises(
+ RuntimeError,
+ in1d, ar1, overflow_ar2, kind='table'
+ )
+
+ # Non-error: `kind=None` will *not* trigger a runtime error
+ # if there is an integer overflow, it will switch to
+ # the `sort` algorithm.
+ result = np.in1d(ar1, overflow_ar2, kind=None)
+ assert_array_equal(result, [True] + [False] * 4)
+ result = np.in1d(ar1, overflow_ar2, kind='sort')
+ assert_array_equal(result, [True] + [False] * 4)
+
def test_union1d(self):
a = np.array([5, 4, 7, 1, 2])
b = np.array([2, 4, 3, 3, 2, 1, 5])