diff options
author | MilesCranmer <miles.cranmer@gmail.com> | 2022-06-17 11:08:06 -0400 |
---|---|---|
committer | MilesCranmer <miles.cranmer@gmail.com> | 2022-06-17 11:08:06 -0400 |
commit | a8677bb0cf046d2490228c12c282ef05fe08d81b (patch) | |
tree | 91ad90547a242570175373c8b27eaceeacbd72e0 /numpy/lib/arraysetops.py | |
parent | 7a1ee13ee28083c484a42a657067570773bcddbe (diff) | |
download | numpy-a8677bb0cf046d2490228c12c282ef05fe08d81b.tar.gz |
MAINT: Switch to old in1d for large memory usage
Diffstat (limited to 'numpy/lib/arraysetops.py')
-rw-r--r-- | numpy/lib/arraysetops.py | 31 |
1 files changed, 18 insertions, 13 deletions
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index cc6fe6259..bc5877c4b 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -559,11 +559,12 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'): to be the faster method if the following formula is true: `log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927`, but may use greater memory. - - If 'auto', will automatically choose the method which is - expected to perform the fastest, using the above - formula. For larger sizes or smaller range, - 'dictionary' is chosen. For larger range or smaller - sizes, 'sort' is chosen. + - If 'auto', will automatically choose 'dictionary' if + the required memory allocation is less than or equal to + 6 times the sum of the sizes of `ar1` and `ar2`, + otherwise will use 'sort'. This is done to not use + a large amount of memory by default, even though + 'dictionary' may be faster in most cases. .. versionadded:: 1.8.0 @@ -631,6 +632,7 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'): if integer_arrays and method in {'auto', 'dictionary'}: ar2_min = np.min(ar2) ar2_max = np.max(ar2) + ar1_size = ar1.size ar2_size = ar2.size # Check for integer overflow @@ -640,17 +642,20 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'): # Optimal performance is for approximately # log10(size) > (log10(range) - 2.27) / 0.927. - # See discussion on - # https://github.com/numpy/numpy/pull/12065 - optimal_parameters = ( - np.log10(ar2_size) > - ((np.log10(ar2_range + 1.0) - 2.27) / 0.927) - ) + # However, here we set the requirement that + # the intermediate array can only be 6x + # the combined memory allocation of the original + # arrays. + # (see discussion on + # https://github.com/numpy/numpy/pull/12065) + below_memory_constraint = ( + ar2_range <= 6 * (ar1_size + ar2_size) + ) except FloatingPointError: - optimal_parameters = False + below_memory_constraint = False # Use the fast integer algorithm - if optimal_parameters or method == 'dictionary': + if below_memory_constraint or method == 'dictionary': if invert: outgoing_array = np.ones_like(ar1, dtype=bool) |