author    MilesCranmer <miles.cranmer@gmail.com>  2022-06-17 11:08:06 -0400
committer MilesCranmer <miles.cranmer@gmail.com>  2022-06-17 11:08:06 -0400
commit    a8677bb0cf046d2490228c12c282ef05fe08d81b (patch)
tree      91ad90547a242570175373c8b27eaceeacbd72e0 /numpy/lib/arraysetops.py
parent    7a1ee13ee28083c484a42a657067570773bcddbe (diff)
MAINT: Switch to old in1d for large memory usage
Diffstat (limited to 'numpy/lib/arraysetops.py')
 numpy/lib/arraysetops.py | 31 +++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index cc6fe6259..bc5877c4b 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -559,11 +559,12 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'):
           to be the faster method if the following formula is true:
           `log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927`,
           but may use greater memory.
-        - If 'auto', will automatically choose the method which is
-          expected to perform the fastest, using the above
-          formula. For larger sizes or smaller range,
-          'dictionary' is chosen. For larger range or smaller
-          sizes, 'sort' is chosen.
+        - If 'auto', will automatically choose 'dictionary' if
+          the required memory allocation is less than or equal to
+          6 times the sum of the sizes of `ar1` and `ar2`,
+          otherwise will use 'sort'. This is done to not use
+          a large amount of memory by default, even though
+          'dictionary' may be faster in most cases.
 
         .. versionadded:: 1.8.0
 
@@ -631,6 +632,7 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'):
     if integer_arrays and method in {'auto', 'dictionary'}:
         ar2_min = np.min(ar2)
         ar2_max = np.max(ar2)
+        ar1_size = ar1.size
         ar2_size = ar2.size
 
         # Check for integer overflow
@@ -640,17 +642,20 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'):
 
                 # Optimal performance is for approximately
                 # log10(size) > (log10(range) - 2.27) / 0.927.
-                # See discussion on
-                # https://github.com/numpy/numpy/pull/12065
-                optimal_parameters = (
-                    np.log10(ar2_size) >
-                    ((np.log10(ar2_range + 1.0) - 2.27) / 0.927)
-                )
+                # However, here we set the requirement that
+                # the intermediate array can only be 6x
+                # the combined memory allocation of the original
+                # arrays.
+                # (see discussion on
+                # https://github.com/numpy/numpy/pull/12065)
+                below_memory_constraint = (
+                    ar2_range <= 6 * (ar1_size + ar2_size)
+                )
             except FloatingPointError:
-                optimal_parameters = False
+                below_memory_constraint = False
 
         # Use the fast integer algorithm
-        if optimal_parameters or method == 'dictionary':
+        if below_memory_constraint or method == 'dictionary':
 
             if invert:
                 outgoing_array = np.ones_like(ar1, dtype=bool)
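
For context, here is a minimal sketch of why the 'dictionary' method's intermediate allocation scales with the value range of `ar2`, and of the 6x memory constraint this commit uses to gate it under method='auto'. This is not NumPy's actual implementation; the helper name `in1d_table_sketch` is invented for illustration, and only the `ar2_range <= 6 * (ar1.size + ar2.size)` rule is taken from the patch itself.

import numpy as np

def in1d_table_sketch(ar1, ar2, invert=False):
    # Rough sketch of the 'dictionary' (table lookup) idea for integer
    # arrays. A boolean table is allocated spanning the value range of
    # ar2, so its memory use scales with max(ar2) - min(ar2) rather than
    # with the sizes of the input arrays.
    ar1 = np.asarray(ar1)
    ar2 = np.asarray(ar2)
    ar2_min = int(np.min(ar2))
    ar2_max = int(np.max(ar2))
    ar2_range = ar2_max - ar2_min

    # The constraint introduced by this patch: only take this path when
    # the intermediate table is at most ~6x the combined input sizes.
    if ar2_range > 6 * (ar1.size + ar2.size):
        raise ValueError("range too large; use the sort-based method instead")

    # Mark which values in [ar2_min, ar2_max] occur in ar2.
    table = np.full(ar2_range + 1, invert, dtype=bool)
    table[ar2 - ar2_min] = not invert

    # Values of ar1 outside ar2's range can never be contained in ar2.
    result = np.full(ar1.shape, invert, dtype=bool)
    in_range = (ar1 >= ar2_min) & (ar1 <= ar2_max)
    result[in_range] = table[ar1[in_range] - ar2_min]
    return result

ar1 = np.array([0, 3, 7, 42])
ar2 = np.array([1, 3, 5, 7])
print(in1d_table_sketch(ar1, ar2))   # [False  True  True False]
print(np.in1d(ar1, ar2))             # same result from NumPy itself

Here ar2_range is only 6 and the inputs hold 8 elements in total, so the table easily satisfies ar2_range <= 6 * (ar1.size + ar2.size). For a sparse ar2 spanning a very large value range, 'auto' now falls back to the sort-based method instead of allocating a huge table, which is the point of this change.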