diff options
Diffstat (limited to 'numpy/random/generator.pyx')
-rw-r--r-- | numpy/random/generator.pyx | 98 |
1 files changed, 62 insertions, 36 deletions
diff --git a/numpy/random/generator.pyx b/numpy/random/generator.pyx index 6adf0f00b..c7432d8c1 100644 --- a/numpy/random/generator.pyx +++ b/numpy/random/generator.pyx @@ -353,7 +353,8 @@ cdef class Generator: Return random integers from `low` (inclusive) to `high` (exclusive), or if endpoint=True, `low` (inclusive) to `high` (inclusive). Replaces - randint (with endpoint=False) and random_integers (with endpoint=True) + `RandomState.randint` (with endpoint=False) and + `RandomState.random_integers` (with endpoint=True) Return random integers from the "discrete uniform" distribution of the specified dtype. If `high` is None (the default), then results are @@ -503,15 +504,8 @@ cdef class Generator: return self.integers(0, 4294967296, size=n_uint32, dtype=np.uint32).astype('<u4').tobytes()[:length] - def randint(self, low, high=None, size=None, dtype=np.int64, endpoint=False): - """ - Deprecated, renamed to ``integers`` - """ - warnings.warn("Renamed to integers", RuntimeWarning) - self.integers(low, high, size, dtype, endpoint) - @cython.wraparound(True) - def choice(self, a, size=None, replace=True, p=None, axis=0): + def choice(self, a, size=None, replace=True, p=None, axis=0, bint shuffle=True): """ choice(a, size=None, replace=True, p=None, axis=0): @@ -538,6 +532,9 @@ cdef class Generator: axis : int, optional The axis along which the selection is performed. The default, 0, selects by row. + shuffle : boolean, optional + Whether the sample is shuffled when sampling without replacement. + Default is True, False provides a speedup. Returns ------- @@ -593,14 +590,12 @@ cdef class Generator: dtype='<U11') """ - cdef char* idx_ptr - cdef int64_t buf - cdef char* buf_ptr - cdef set idx_set cdef int64_t val, t, loc, size_i, pop_size_i cdef int64_t *idx_data cdef np.npy_intp j + cdef uint64_t set_size, mask + cdef uint64_t[::1] hash_set # Format and Verify input a = np.array(a, copy=False) if a.ndim == 0: @@ -687,36 +682,45 @@ cdef class Generator: size_i = size pop_size_i = pop_size # This is a heuristic tuning. should be improvable - if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)): + if shuffle: + cutoff = 50 + else: + cutoff = 20 + if pop_size_i > 10000 and (size_i > (pop_size_i // cutoff)): # Tail shuffle size elements - idx = np.arange(pop_size, dtype=np.int64) - idx_ptr = np.PyArray_BYTES(<np.ndarray>idx) - buf_ptr = <char*>&buf - self._shuffle_raw(pop_size_i, max(pop_size_i - size_i,1), - 8, 8, idx_ptr, buf_ptr) + idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64) + idx_data = <int64_t*>(<np.ndarray>idx).data + with self.lock, nogil: + self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1), + idx_data) # Copy to allow potentially large array backing idx to be gc idx = idx[(pop_size - size):].copy() else: - # Floyds's algorithm with precomputed indices - # Worst case, O(n**2) when size is close to pop_size + # Floyd's algorithm idx = np.empty(size, dtype=np.int64) idx_data = <int64_t*>np.PyArray_DATA(<np.ndarray>idx) - idx_set = set() - loc = 0 - # Sample indices with one pass to avoid reacquiring the lock - with self.lock: - for j in range(pop_size_i - size_i, pop_size_i): - idx_data[loc] = random_interval(&self._bitgen, j) - loc += 1 - loc = 0 - while len(idx_set) < size_i: + # smallest power of 2 larger than 1.2 * size + set_size = <uint64_t>(1.2 * size_i) + mask = _gen_mask(set_size) + set_size = 1 + mask + hash_set = np.full(set_size, <uint64_t>-1, np.uint64) + with self.lock, cython.wraparound(False), nogil: for j in range(pop_size_i - size_i, pop_size_i): - if idx_data[loc] not in idx_set: - val = idx_data[loc] - else: - idx_data[loc] = val = j - idx_set.add(val) - loc += 1 + val = random_bounded_uint64(&self._bitgen, 0, j, 0, 0) + loc = val & mask + while hash_set[loc] != <uint64_t>-1 and hash_set[loc] != <uint64_t>val: + loc = (loc + 1) & mask + if hash_set[loc] == <uint64_t>-1: # then val not in hash_set + hash_set[loc] = val + idx_data[j - pop_size_i + size_i] = val + else: # we need to insert j instead + loc = j & mask + while hash_set[loc] != <uint64_t>-1: + loc = (loc + 1) & mask + hash_set[loc] = j + idx_data[j - pop_size_i + size_i] = j + if shuffle: + self._shuffle_int(size_i, 1, idx_data) if shape is not None: idx.shape = shape @@ -3888,6 +3892,28 @@ cdef class Generator: string.memcpy(data + j * stride, data + i * stride, itemsize) string.memcpy(data + i * stride, buf, itemsize) + cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first, + int64_t* data) nogil: + """ + Parameters + ---------- + n + Number of elements in data + first + First observation to shuffle. Shuffles n-1, + n-2, ..., first, so that when first=1 the entire + array is shuffled + data + Location of data + """ + cdef np.npy_intp i, j + cdef int64_t temp + for i in reversed(range(first, n)): + j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0) + temp = data[j] + data[j] = data[i] + data[i] = temp + def permutation(self, object x): """ permutation(x) |