diff options
Diffstat (limited to 'numpy/random/_generator.pyx')
-rw-r--r-- | numpy/random/_generator.pyx | 249 |
1 files changed, 248 insertions, 1 deletions
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx index 22b17ab03..6d9fe20f9 100644 --- a/numpy/random/_generator.pyx +++ b/numpy/random/_generator.pyx @@ -13,7 +13,7 @@ from numpy.core.multiarray import normalize_axis_index from libc cimport string from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t, - int32_t, int64_t) + int32_t, int64_t, INT64_MAX, SIZE_MAX) from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64, _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16, _rand_uint8, _gen_mask) @@ -126,9 +126,38 @@ cdef extern from "include/distributions.h": void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil + int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil + void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil + np.import_array() +cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors): + """ + Sum the values in the array `colors`. + + Return -1 if an overflow occurs. + The values in *colors are assumed to be nonnegative. + """ + cdef size_t i + cdef int64_t sum + + sum = 0 + for i in range(num_colors): + if colors[i] > INT64_MAX - sum: + return -1 + sum += colors[i] + return sum + + cdef bint _check_bit_generator(object bitgen): """Check if an object satisfies the BitGenerator interface. """ @@ -3241,6 +3270,8 @@ cdef class Generator: See Also -------- + multivariate_hypergeometric : Draw samples from the multivariate + hypergeometric distribution. scipy.stats.hypergeom : probability density function, distribution or cumulative density function, etc. @@ -3739,6 +3770,222 @@ cdef class Generator: return multin + def multivariate_hypergeometric(self, object colors, object nsample, + size=None, method='marginals'): + """ + multivariate_hypergeometric(colors, nsample, size=None, + method='marginals') + + Generate variates from a multivariate hypergeometric distribution. + + The multivariate hypergeometric distribution is a generalization + of the hypergeometric distribution. + + Choose ``nsample`` items at random without replacement from a + collection with ``N`` distinct types. ``N`` is the length of + ``colors``, and the values in ``colors`` are the number of occurrences + of that type in the collection. The total number of items in the + collection is ``sum(colors)``. Each random variate generated by this + function is a vector of length ``N`` holding the counts of the + different types that occurred in the ``nsample`` items. + + The name ``colors`` comes from a common description of the + distribution: it is the probability distribution of the number of + marbles of each color selected without replacement from an urn + containing marbles of different colors; ``colors[i]`` is the number + of marbles in the urn with color ``i``. + + Parameters + ---------- + colors : sequence of integers + The number of each type of item in the collection from which + a sample is drawn. The values in ``colors`` must be nonnegative. + To avoid loss of precision in the algorithm, ``sum(colors)`` + must be less than ``10**9`` when `method` is "marginals". + nsample : int + The number of items selected. ``nsample`` must not be greater + than ``sum(colors)``. + size : int or tuple of ints, optional + The number of variates to generate, either an integer or a tuple + holding the shape of the array of variates. If the given size is, + e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one + variate is a vector of length ``len(colors)``, and the return value + has shape ``(k, m, len(colors))``. If `size` is an integer, the + output has shape ``(size, len(colors))``. Default is None, in + which case a single variate is returned as an array with shape + ``(len(colors),)``. + method : string, optional + Specify the algorithm that is used to generate the variates. + Must be 'count' or 'marginals' (the default). See the Notes + for a description of the methods. + + Returns + ------- + variates : ndarray + Array of variates drawn from the multivariate hypergeometric + distribution. + + See Also + -------- + hypergeometric : Draw samples from the (univariate) hypergeometric + distribution. + + Notes + ----- + The two methods do not return the same sequence of variates. + + The "count" algorithm is roughly equivalent to the following numpy + code:: + + choices = np.repeat(np.arange(len(colors)), colors) + selection = np.random.choice(choices, nsample, replace=False) + variate = np.bincount(selection, minlength=len(colors)) + + The "count" algorithm uses a temporary array of integers with length + ``sum(colors)``. + + The "marginals" algorithm generates a variate by using repeated + calls to the univariate hypergeometric sampler. It is roughly + equivalent to:: + + variate = np.zeros(len(colors), dtype=np.int64) + # `remaining` is the cumulative sum of `colors` from the last + # element to the first; e.g. if `colors` is [3, 1, 5], then + # `remaining` is [9, 6, 5]. + remaining = np.cumsum(colors[::-1])[::-1] + for i in range(len(colors)-1): + if nsample < 1: + break + variate[i] = hypergeometric(colors[i], remaining[i+1], + nsample) + nsample -= variate[i] + variate[-1] = nsample + + The default method is "marginals". For some cases (e.g. when + `colors` contains relatively small integers), the "count" method + can be significantly faster than the "marginals" method. If + performance of the algorithm is important, test the two methods + with typical inputs to decide which works best. + + .. versionadded:: 1.18.0 + + Examples + -------- + >>> colors = [16, 8, 4] + >>> seed = 4861946401452 + >>> gen = np.random.Generator(np.random.PCG64(seed)) + >>> gen.multivariate_hypergeometric(colors, 6) + array([5, 0, 1]) + >>> gen.multivariate_hypergeometric(colors, 6, size=3) + array([[5, 0, 1], + [2, 2, 2], + [3, 3, 0]]) + >>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2)) + array([[[3, 2, 1], + [3, 2, 1]], + [[4, 1, 1], + [3, 2, 1]]]) + """ + cdef int64_t nsamp + cdef size_t num_colors + cdef int64_t total + cdef int64_t *colors_ptr + cdef int64_t max_index + cdef size_t num_variates + cdef int64_t *variates_ptr + cdef int result + + if method not in ['count', 'marginals']: + raise ValueError('method must be "count" or "marginals".') + + try: + operator.index(nsample) + except TypeError: + raise ValueError('nsample must be an integer') + + if nsample < 0: + raise ValueError("nsample must be nonnegative.") + if nsample > INT64_MAX: + raise ValueError("nsample must not exceed %d" % INT64_MAX) + nsamp = nsample + + # Validation of colors, a 1-d sequence of nonnegative integers. + invalid_colors = False + try: + colors = np.asarray(colors) + if colors.ndim != 1: + invalid_colors = True + elif colors.size > 0 and not np.issubdtype(colors.dtype, + np.integer): + invalid_colors = True + elif np.any((colors < 0) | (colors > INT64_MAX)): + invalid_colors = True + except ValueError: + invalid_colors = True + if invalid_colors: + raise ValueError('colors must be a one-dimensional sequence ' + 'of nonnegative integers not exceeding %d.' % + INT64_MAX) + + colors = np.ascontiguousarray(colors, dtype=np.int64) + num_colors = colors.size + + colors_ptr = <int64_t *> np.PyArray_DATA(colors) + + total = _safe_sum_nonneg_int64(num_colors, colors_ptr) + if total == -1: + raise ValueError("sum(colors) must not exceed the maximum value " + "of a 64 bit signed integer (%d)" % INT64_MAX) + + if method == 'marginals' and total >= 1000000000: + raise ValueError('When method is "marginals", sum(colors) must ' + 'be less than 1000000000.') + + # The C code that implements the 'count' method will malloc an + # array of size total*sizeof(size_t). Here we ensure that that + # product does not overflow. + if SIZE_MAX > <uint64_t>INT64_MAX: + max_index = INT64_MAX // sizeof(size_t) + else: + max_index = SIZE_MAX // sizeof(size_t) + if method == 'count' and total > max_index: + raise ValueError("When method is 'count', sum(colors) must not " + "exceed %d" % max_index) + if nsamp > total: + raise ValueError("nsample > sum(colors)") + + # Figure out the shape of the return array. + if size is None: + shape = (num_colors,) + elif np.isscalar(size): + shape = (size, num_colors) + else: + shape = tuple(size) + (num_colors,) + variates = np.zeros(shape, dtype=np.int64) + + if num_colors == 0: + return variates + + # One variate is a vector of length num_colors. + num_variates = variates.size // num_colors + variates_ptr = <int64_t *> np.PyArray_DATA(variates) + + if method == 'count': + with self.lock, nogil: + result = random_mvhg_count(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + if result == -1: + raise MemoryError("Insufficent memory for multivariate_" + "hypergeometric with method='count' and " + "sum(colors)=%d" % total) + else: + with self.lock, nogil: + random_mvhg_marginals(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + return variates + def dirichlet(self, object alpha, size=None): """ dirichlet(alpha, size=None) |