From 7cb937c37f2cb33c096055b3783a006f71c938df Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 10 Jun 2022 20:06:32 -0400 Subject: DOC: Describe memory considerations in in1d/isin --- numpy/lib/arraysetops.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'numpy/lib/arraysetops.py') diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index 4a07d6e9e..cc6fe6259 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -549,15 +549,21 @@ def in1d(ar1, ar2, assume_unique=False, invert=False, method='auto'): The algorithm to use. This will not affect the final result, but will affect the speed. Default is 'auto'. - - If 'sort', will use a sort-based approach. + - If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. - If 'dictionary', will use a key-dictionary approach similar to a counting sort. This is only available for boolean and - integer arrays. + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. This tends + to be the faster method if the following formula is true: + `log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927`, + but may use greater memory. - If 'auto', will automatically choose the method which is - expected to perform the fastest, which depends - on the size and range of `ar2`. For larger sizes or - smaller range, 'dictionary' is chosen. - For larger range or smaller sizes, 'sort' is chosen. + expected to perform the fastest, using the above + formula. For larger sizes or smaller range, + 'dictionary' is chosen. For larger range or smaller + sizes, 'sort' is chosen. .. versionadded:: 1.8.0 @@ -749,13 +755,19 @@ def isin(element, test_elements, assume_unique=False, invert=False, The algorithm to use. This will not affect the final result, but will affect the speed. Default is 'auto'. - - If 'sort', will use a sort-based approach. + - If 'sort', will use a mergesort-based approach. This will have + a memory usage of roughly 6 times the sum of the sizes of + `ar1` and `ar2`, not accounting for size of dtypes. - If 'dictionary', will use a key-dictionary approach similar to a counting sort. This is only available for boolean and - integer arrays. + integer arrays. This will have a memory usage of the + size of `ar1` plus the max-min value of `ar2`. This tends + to be the faster method if the following formula is true: + `log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927`, + but may use greater memory. - If 'auto', will automatically choose the method which is - expected to perform the fastest, which depends - on the size and range of `ar2`. For larger sizes, + expected to perform the fastest, using the above + formula. For larger sizes or smaller range, 'dictionary' is chosen. For larger range or smaller sizes, 'sort' is chosen. -- cgit v1.2.1