summaryrefslogtreecommitdiff
path: root/numpy/random/_generator.pyx
diff options
context:
space:
mode:
Diffstat (limited to 'numpy/random/_generator.pyx')
-rw-r--r--numpy/random/_generator.pyx249
1 files changed, 248 insertions, 1 deletions
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 22b17ab03..6d9fe20f9 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -13,7 +13,7 @@ from numpy.core.multiarray import normalize_axis_index
from libc cimport string
from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
- int32_t, int64_t)
+ int32_t, int64_t, INT64_MAX, SIZE_MAX)
from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
_rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16,
_rand_uint8, _gen_mask)
@@ -126,9 +126,38 @@ cdef extern from "include/distributions.h":
void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix,
double *pix, np.npy_intp d, binomial_t *binomial) nogil
+ int random_mvhg_count(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates) nogil
+ void random_mvhg_marginals(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates) nogil
+
np.import_array()
+cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
+ """
+ Sum the values in the array `colors`.
+
+ Return -1 if an overflow occurs.
+ The values in *colors are assumed to be nonnegative.
+ """
+ cdef size_t i
+ cdef int64_t sum
+
+ sum = 0
+ for i in range(num_colors):
+ if colors[i] > INT64_MAX - sum:
+ return -1
+ sum += colors[i]
+ return sum
+
+
cdef bint _check_bit_generator(object bitgen):
"""Check if an object satisfies the BitGenerator interface.
"""
@@ -3241,6 +3270,8 @@ cdef class Generator:
See Also
--------
+ multivariate_hypergeometric : Draw samples from the multivariate
+ hypergeometric distribution.
scipy.stats.hypergeom : probability density function, distribution or
cumulative density function, etc.
@@ -3739,6 +3770,222 @@ cdef class Generator:
return multin
+ def multivariate_hypergeometric(self, object colors, object nsample,
+ size=None, method='marginals'):
+ """
+ multivariate_hypergeometric(colors, nsample, size=None,
+ method='marginals')
+
+ Generate variates from a multivariate hypergeometric distribution.
+
+ The multivariate hypergeometric distribution is a generalization
+ of the hypergeometric distribution.
+
+ Choose ``nsample`` items at random without replacement from a
+ collection with ``N`` distinct types. ``N`` is the length of
+ ``colors``, and the values in ``colors`` are the number of occurrences
+ of that type in the collection. The total number of items in the
+ collection is ``sum(colors)``. Each random variate generated by this
+ function is a vector of length ``N`` holding the counts of the
+ different types that occurred in the ``nsample`` items.
+
+ The name ``colors`` comes from a common description of the
+ distribution: it is the probability distribution of the number of
+ marbles of each color selected without replacement from an urn
+ containing marbles of different colors; ``colors[i]`` is the number
+ of marbles in the urn with color ``i``.
+
+ Parameters
+ ----------
+ colors : sequence of integers
+ The number of each type of item in the collection from which
+ a sample is drawn. The values in ``colors`` must be nonnegative.
+ To avoid loss of precision in the algorithm, ``sum(colors)``
+ must be less than ``10**9`` when `method` is "marginals".
+ nsample : int
+ The number of items selected. ``nsample`` must not be greater
+ than ``sum(colors)``.
+ size : int or tuple of ints, optional
+ The number of variates to generate, either an integer or a tuple
+ holding the shape of the array of variates. If the given size is,
+ e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one
+ variate is a vector of length ``len(colors)``, and the return value
+ has shape ``(k, m, len(colors))``. If `size` is an integer, the
+ output has shape ``(size, len(colors))``. Default is None, in
+ which case a single variate is returned as an array with shape
+ ``(len(colors),)``.
+ method : string, optional
+ Specify the algorithm that is used to generate the variates.
+ Must be 'count' or 'marginals' (the default). See the Notes
+ for a description of the methods.
+
+ Returns
+ -------
+ variates : ndarray
+ Array of variates drawn from the multivariate hypergeometric
+ distribution.
+
+ See Also
+ --------
+ hypergeometric : Draw samples from the (univariate) hypergeometric
+ distribution.
+
+ Notes
+ -----
+ The two methods do not return the same sequence of variates.
+
+ The "count" algorithm is roughly equivalent to the following numpy
+ code::
+
+ choices = np.repeat(np.arange(len(colors)), colors)
+ selection = np.random.choice(choices, nsample, replace=False)
+ variate = np.bincount(selection, minlength=len(colors))
+
+ The "count" algorithm uses a temporary array of integers with length
+ ``sum(colors)``.
+
+ The "marginals" algorithm generates a variate by using repeated
+ calls to the univariate hypergeometric sampler. It is roughly
+ equivalent to::
+
+ variate = np.zeros(len(colors), dtype=np.int64)
+ # `remaining` is the cumulative sum of `colors` from the last
+ # element to the first; e.g. if `colors` is [3, 1, 5], then
+ # `remaining` is [9, 6, 5].
+ remaining = np.cumsum(colors[::-1])[::-1]
+ for i in range(len(colors)-1):
+ if nsample < 1:
+ break
+ variate[i] = hypergeometric(colors[i], remaining[i+1],
+ nsample)
+ nsample -= variate[i]
+ variate[-1] = nsample
+
+ The default method is "marginals". For some cases (e.g. when
+ `colors` contains relatively small integers), the "count" method
+ can be significantly faster than the "marginals" method. If
+ performance of the algorithm is important, test the two methods
+ with typical inputs to decide which works best.
+
+ .. versionadded:: 1.18.0
+
+ Examples
+ --------
+ >>> colors = [16, 8, 4]
+ >>> seed = 4861946401452
+ >>> gen = np.random.Generator(np.random.PCG64(seed))
+ >>> gen.multivariate_hypergeometric(colors, 6)
+ array([5, 0, 1])
+ >>> gen.multivariate_hypergeometric(colors, 6, size=3)
+ array([[5, 0, 1],
+ [2, 2, 2],
+ [3, 3, 0]])
+ >>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2))
+ array([[[3, 2, 1],
+ [3, 2, 1]],
+ [[4, 1, 1],
+ [3, 2, 1]]])
+ """
+ cdef int64_t nsamp
+ cdef size_t num_colors
+ cdef int64_t total
+ cdef int64_t *colors_ptr
+ cdef int64_t max_index
+ cdef size_t num_variates
+ cdef int64_t *variates_ptr
+ cdef int result
+
+ if method not in ['count', 'marginals']:
+ raise ValueError('method must be "count" or "marginals".')
+
+ try:
+ operator.index(nsample)
+ except TypeError:
+ raise ValueError('nsample must be an integer')
+
+ if nsample < 0:
+ raise ValueError("nsample must be nonnegative.")
+ if nsample > INT64_MAX:
+ raise ValueError("nsample must not exceed %d" % INT64_MAX)
+ nsamp = nsample
+
+ # Validation of colors, a 1-d sequence of nonnegative integers.
+ invalid_colors = False
+ try:
+ colors = np.asarray(colors)
+ if colors.ndim != 1:
+ invalid_colors = True
+ elif colors.size > 0 and not np.issubdtype(colors.dtype,
+ np.integer):
+ invalid_colors = True
+ elif np.any((colors < 0) | (colors > INT64_MAX)):
+ invalid_colors = True
+ except ValueError:
+ invalid_colors = True
+ if invalid_colors:
+ raise ValueError('colors must be a one-dimensional sequence '
+ 'of nonnegative integers not exceeding %d.' %
+ INT64_MAX)
+
+ colors = np.ascontiguousarray(colors, dtype=np.int64)
+ num_colors = colors.size
+
+ colors_ptr = <int64_t *> np.PyArray_DATA(colors)
+
+ total = _safe_sum_nonneg_int64(num_colors, colors_ptr)
+ if total == -1:
+ raise ValueError("sum(colors) must not exceed the maximum value "
+ "of a 64 bit signed integer (%d)" % INT64_MAX)
+
+ if method == 'marginals' and total >= 1000000000:
+ raise ValueError('When method is "marginals", sum(colors) must '
+ 'be less than 1000000000.')
+
+ # The C code that implements the 'count' method will malloc an
+ # array of size total*sizeof(size_t). Here we ensure that that
+ # product does not overflow.
+ if SIZE_MAX > <uint64_t>INT64_MAX:
+ max_index = INT64_MAX // sizeof(size_t)
+ else:
+ max_index = SIZE_MAX // sizeof(size_t)
+ if method == 'count' and total > max_index:
+ raise ValueError("When method is 'count', sum(colors) must not "
+ "exceed %d" % max_index)
+ if nsamp > total:
+ raise ValueError("nsample > sum(colors)")
+
+ # Figure out the shape of the return array.
+ if size is None:
+ shape = (num_colors,)
+ elif np.isscalar(size):
+ shape = (size, num_colors)
+ else:
+ shape = tuple(size) + (num_colors,)
+ variates = np.zeros(shape, dtype=np.int64)
+
+ if num_colors == 0:
+ return variates
+
+ # One variate is a vector of length num_colors.
+ num_variates = variates.size // num_colors
+ variates_ptr = <int64_t *> np.PyArray_DATA(variates)
+
+ if method == 'count':
+ with self.lock, nogil:
+ result = random_mvhg_count(&self._bitgen, total,
+ num_colors, colors_ptr, nsamp,
+ num_variates, variates_ptr)
+ if result == -1:
+ raise MemoryError("Insufficent memory for multivariate_"
+ "hypergeometric with method='count' and "
+ "sum(colors)=%d" % total)
+ else:
+ with self.lock, nogil:
+ random_mvhg_marginals(&self._bitgen, total,
+ num_colors, colors_ptr, nsamp,
+ num_variates, variates_ptr)
+ return variates
+
def dirichlet(self, object alpha, size=None):
"""
dirichlet(alpha, size=None)