summaryrefslogtreecommitdiff
path: root/numpy/random/generator.pyx
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/random/generator.pyx')
-rw-r--r--numpy/random/generator.pyx98
1 files changed, 62 insertions, 36 deletions
diff --git a/numpy/random/generator.pyx b/numpy/random/generator.pyx
index 6adf0f00b..c7432d8c1 100644
--- a/numpy/random/generator.pyx
+++ b/numpy/random/generator.pyx
@@ -353,7 +353,8 @@ cdef class Generator:
Return random integers from `low` (inclusive) to `high` (exclusive), or
if endpoint=True, `low` (inclusive) to `high` (inclusive). Replaces
- randint (with endpoint=False) and random_integers (with endpoint=True)
+ `RandomState.randint` (with endpoint=False) and
+ `RandomState.random_integers` (with endpoint=True)
Return random integers from the "discrete uniform" distribution of
the specified dtype. If `high` is None (the default), then results are
@@ -503,15 +504,8 @@ cdef class Generator:
return self.integers(0, 4294967296, size=n_uint32,
dtype=np.uint32).astype('<u4').tobytes()[:length]
- def randint(self, low, high=None, size=None, dtype=np.int64, endpoint=False):
- """
- Deprecated, renamed to ``integers``
- """
- warnings.warn("Renamed to integers", RuntimeWarning)
- self.integers(low, high, size, dtype, endpoint)
-
@cython.wraparound(True)
- def choice(self, a, size=None, replace=True, p=None, axis=0):
+ def choice(self, a, size=None, replace=True, p=None, axis=0, bint shuffle=True):
"""
choice(a, size=None, replace=True, p=None, axis=0):
@@ -538,6 +532,9 @@ cdef class Generator:
axis : int, optional
The axis along which the selection is performed. The default, 0,
selects by row.
+ shuffle : boolean, optional
+ Whether the sample is shuffled when sampling without replacement.
+ Default is True, False provides a speedup.
Returns
-------
@@ -593,14 +590,12 @@ cdef class Generator:
dtype='<U11')
"""
- cdef char* idx_ptr
- cdef int64_t buf
- cdef char* buf_ptr
- cdef set idx_set
cdef int64_t val, t, loc, size_i, pop_size_i
cdef int64_t *idx_data
cdef np.npy_intp j
+ cdef uint64_t set_size, mask
+ cdef uint64_t[::1] hash_set
# Format and Verify input
a = np.array(a, copy=False)
if a.ndim == 0:
@@ -687,36 +682,45 @@ cdef class Generator:
size_i = size
pop_size_i = pop_size
# This is a heuristic tuning. should be improvable
- if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
+ if shuffle:
+ cutoff = 50
+ else:
+ cutoff = 20
+ if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff)):
# Tail shuffle size elements
- idx = np.arange(pop_size, dtype=np.int64)
- idx_ptr = np.PyArray_BYTES(<np.ndarray>idx)
- buf_ptr = <char*>&buf
- self._shuffle_raw(pop_size_i, max(pop_size_i - size_i,1),
- 8, 8, idx_ptr, buf_ptr)
+ idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
+ idx_data = <int64_t*>(<np.ndarray>idx).data
+ with self.lock, nogil:
+ self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
+ idx_data)
# Copy to allow potentially large array backing idx to be gc
idx = idx[(pop_size - size):].copy()
else:
- # Floyds's algorithm with precomputed indices
- # Worst case, O(n**2) when size is close to pop_size
+ # Floyd's algorithm
idx = np.empty(size, dtype=np.int64)
idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx)
- idx_set = set()
- loc = 0
- # Sample indices with one pass to avoid reacquiring the lock
- with self.lock:
- for j in range(pop_size_i - size_i, pop_size_i):
- idx_data[loc] = random_interval(&self._bitgen, j)
- loc += 1
- loc = 0
- while len(idx_set) < size_i:
+ # smallest power of 2 larger than 1.2 * size
+ set_size = <uint64_t>(1.2 * size_i)
+ mask = _gen_mask(set_size)
+ set_size = 1 + mask
+ hash_set = np.full(set_size, <uint64_t>-1, np.uint64)
+ with self.lock, cython.wraparound(False), nogil:
for j in range(pop_size_i - size_i, pop_size_i):
- if idx_data[loc] not in idx_set:
- val = idx_data[loc]
- else:
- idx_data[loc] = val = j
- idx_set.add(val)
- loc += 1
+ val = random_bounded_uint64(&self._bitgen, 0, j, 0, 0)
+ loc = val & mask
+ while hash_set[loc] != <uint64_t>-1 and hash_set[loc] != <uint64_t>val:
+ loc = (loc + 1) & mask
+ if hash_set[loc] == <uint64_t>-1: # then val not in hash_set
+ hash_set[loc] = val
+ idx_data[j - pop_size_i + size_i] = val
+ else: # we need to insert j instead
+ loc = j & mask
+ while hash_set[loc] != <uint64_t>-1:
+ loc = (loc + 1) & mask
+ hash_set[loc] = j
+ idx_data[j - pop_size_i + size_i] = j
+ if shuffle:
+ self._shuffle_int(size_i, 1, idx_data)
if shape is not None:
idx.shape = shape
@@ -3888,6 +3892,28 @@ cdef class Generator:
string.memcpy(data + j * stride, data + i * stride, itemsize)
string.memcpy(data + i * stride, buf, itemsize)
+ cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
+ int64_t* data) nogil:
+ """
+ Parameters
+ ----------
+ n
+ Number of elements in data
+ first
+ First observation to shuffle. Shuffles n-1,
+ n-2, ..., first, so that when first=1 the entire
+ array is shuffled
+ data
+ Location of data
+ """
+ cdef np.npy_intp i, j
+ cdef int64_t temp
+ for i in reversed(range(first, n)):
+ j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
+ temp = data[j]
+ data[j] = data[i]
+ data[i] = temp
+
def permutation(self, object x):
"""
permutation(x)