summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2019-10-20 16:11:08 +0300
committerGitHub <noreply@github.com>2019-10-20 16:11:08 +0300
commit2aa3bbab5f49af4c14338bff30b82a3ccaff7935 (patch)
tree0d4b0b8c6a2769eaa3ee7ddda529c6fdec7558f6
parent9c8e904a5c56dcdcb316ee8aa88e244b593d0788 (diff)
parent0455447b5019ab40cf9cfa4b2225c1260824bd6b (diff)
downloadnumpy-2aa3bbab5f49af4c14338bff30b82a3ccaff7935.tar.gz
Merge pull request #13794 from WarrenWeckesser/new-mvhg
ENH: random: Add the multivariate hypergeometric distribution.
-rw-r--r--doc/release/upcoming_changes/13794.new_function.rst5
-rw-r--r--doc/source/reference/random/generator.rst1
-rw-r--r--numpy/random/_generator.pyx249
-rw-r--r--numpy/random/include/distributions.h14
-rw-r--r--numpy/random/setup.py3
-rw-r--r--numpy/random/src/distributions/random_mvhg_count.c131
-rw-r--r--numpy/random/src/distributions/random_mvhg_marginals.c138
-rw-r--r--numpy/random/tests/test_generator_mt19937.py136
8 files changed, 674 insertions, 3 deletions
diff --git a/doc/release/upcoming_changes/13794.new_function.rst b/doc/release/upcoming_changes/13794.new_function.rst
new file mode 100644
index 000000000..cf8b38bb0
--- /dev/null
+++ b/doc/release/upcoming_changes/13794.new_function.rst
@@ -0,0 +1,5 @@
+Multivariate hypergeometric distribution added to `numpy.random`
+----------------------------------------------------------------
+The method `multivariate_hypergeometric` has been added to the class
+`numpy.random.Generator`. This method generates random variates from
+the multivariate hypergeometric probability distribution.
diff --git a/doc/source/reference/random/generator.rst b/doc/source/reference/random/generator.rst
index 068143270..a2cbb493a 100644
--- a/doc/source/reference/random/generator.rst
+++ b/doc/source/reference/random/generator.rst
@@ -62,6 +62,7 @@ Distributions
~numpy.random.Generator.lognormal
~numpy.random.Generator.logseries
~numpy.random.Generator.multinomial
+ ~numpy.random.Generator.multivariate_hypergeometric
~numpy.random.Generator.multivariate_normal
~numpy.random.Generator.negative_binomial
~numpy.random.Generator.noncentral_chisquare
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 22b17ab03..6d9fe20f9 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -13,7 +13,7 @@ from numpy.core.multiarray import normalize_axis_index
from libc cimport string
from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
- int32_t, int64_t)
+ int32_t, int64_t, INT64_MAX, SIZE_MAX)
from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
_rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16,
_rand_uint8, _gen_mask)
@@ -126,9 +126,38 @@ cdef extern from "include/distributions.h":
void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix,
double *pix, np.npy_intp d, binomial_t *binomial) nogil
+ int random_mvhg_count(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates) nogil
+ void random_mvhg_marginals(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates) nogil
+
np.import_array()
+cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
+ """
+ Sum the values in the array `colors`.
+
+ Return -1 if an overflow occurs.
+ The values in *colors are assumed to be nonnegative.
+ """
+ cdef size_t i
+ cdef int64_t sum
+
+ sum = 0
+ for i in range(num_colors):
+ if colors[i] > INT64_MAX - sum:
+ return -1
+ sum += colors[i]
+ return sum
+
+
cdef bint _check_bit_generator(object bitgen):
"""Check if an object satisfies the BitGenerator interface.
"""
@@ -3241,6 +3270,8 @@ cdef class Generator:
See Also
--------
+ multivariate_hypergeometric : Draw samples from the multivariate
+ hypergeometric distribution.
scipy.stats.hypergeom : probability density function, distribution or
cumulative density function, etc.
@@ -3739,6 +3770,222 @@ cdef class Generator:
return multin
+ def multivariate_hypergeometric(self, object colors, object nsample,
+ size=None, method='marginals'):
+ """
+ multivariate_hypergeometric(colors, nsample, size=None,
+ method='marginals')
+
+ Generate variates from a multivariate hypergeometric distribution.
+
+ The multivariate hypergeometric distribution is a generalization
+ of the hypergeometric distribution.
+
+ Choose ``nsample`` items at random without replacement from a
+ collection with ``N`` distinct types. ``N`` is the length of
+ ``colors``, and the values in ``colors`` are the number of occurrences
+ of that type in the collection. The total number of items in the
+ collection is ``sum(colors)``. Each random variate generated by this
+ function is a vector of length ``N`` holding the counts of the
+ different types that occurred in the ``nsample`` items.
+
+ The name ``colors`` comes from a common description of the
+ distribution: it is the probability distribution of the number of
+ marbles of each color selected without replacement from an urn
+ containing marbles of different colors; ``colors[i]`` is the number
+ of marbles in the urn with color ``i``.
+
+ Parameters
+ ----------
+ colors : sequence of integers
+ The number of each type of item in the collection from which
+ a sample is drawn. The values in ``colors`` must be nonnegative.
+ To avoid loss of precision in the algorithm, ``sum(colors)``
+ must be less than ``10**9`` when `method` is "marginals".
+ nsample : int
+ The number of items selected. ``nsample`` must not be greater
+ than ``sum(colors)``.
+ size : int or tuple of ints, optional
+ The number of variates to generate, either an integer or a tuple
+ holding the shape of the array of variates. If the given size is,
+ e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one
+ variate is a vector of length ``len(colors)``, and the return value
+ has shape ``(k, m, len(colors))``. If `size` is an integer, the
+ output has shape ``(size, len(colors))``. Default is None, in
+ which case a single variate is returned as an array with shape
+ ``(len(colors),)``.
+ method : string, optional
+ Specify the algorithm that is used to generate the variates.
+ Must be 'count' or 'marginals' (the default). See the Notes
+ for a description of the methods.
+
+ Returns
+ -------
+ variates : ndarray
+ Array of variates drawn from the multivariate hypergeometric
+ distribution.
+
+ See Also
+ --------
+ hypergeometric : Draw samples from the (univariate) hypergeometric
+ distribution.
+
+ Notes
+ -----
+ The two methods do not return the same sequence of variates.
+
+ The "count" algorithm is roughly equivalent to the following numpy
+ code::
+
+ choices = np.repeat(np.arange(len(colors)), colors)
+ selection = np.random.choice(choices, nsample, replace=False)
+ variate = np.bincount(selection, minlength=len(colors))
+
+ The "count" algorithm uses a temporary array of integers with length
+ ``sum(colors)``.
+
+ The "marginals" algorithm generates a variate by using repeated
+ calls to the univariate hypergeometric sampler. It is roughly
+ equivalent to::
+
+ variate = np.zeros(len(colors), dtype=np.int64)
+ # `remaining` is the cumulative sum of `colors` from the last
+ # element to the first; e.g. if `colors` is [3, 1, 5], then
+ # `remaining` is [9, 6, 5].
+ remaining = np.cumsum(colors[::-1])[::-1]
+ for i in range(len(colors)-1):
+ if nsample < 1:
+ break
+ variate[i] = hypergeometric(colors[i], remaining[i+1],
+ nsample)
+ nsample -= variate[i]
+ variate[-1] = nsample
+
+ The default method is "marginals". For some cases (e.g. when
+ `colors` contains relatively small integers), the "count" method
+ can be significantly faster than the "marginals" method. If
+ performance of the algorithm is important, test the two methods
+ with typical inputs to decide which works best.
+
+ .. versionadded:: 1.18.0
+
+ Examples
+ --------
+ >>> colors = [16, 8, 4]
+ >>> seed = 4861946401452
+ >>> gen = np.random.Generator(np.random.PCG64(seed))
+ >>> gen.multivariate_hypergeometric(colors, 6)
+ array([5, 0, 1])
+ >>> gen.multivariate_hypergeometric(colors, 6, size=3)
+ array([[5, 0, 1],
+ [2, 2, 2],
+ [3, 3, 0]])
+ >>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2))
+ array([[[3, 2, 1],
+ [3, 2, 1]],
+ [[4, 1, 1],
+ [3, 2, 1]]])
+ """
+ cdef int64_t nsamp
+ cdef size_t num_colors
+ cdef int64_t total
+ cdef int64_t *colors_ptr
+ cdef int64_t max_index
+ cdef size_t num_variates
+ cdef int64_t *variates_ptr
+ cdef int result
+
+ if method not in ['count', 'marginals']:
+ raise ValueError('method must be "count" or "marginals".')
+
+ try:
+ operator.index(nsample)
+ except TypeError:
+ raise ValueError('nsample must be an integer')
+
+ if nsample < 0:
+ raise ValueError("nsample must be nonnegative.")
+ if nsample > INT64_MAX:
+ raise ValueError("nsample must not exceed %d" % INT64_MAX)
+ nsamp = nsample
+
+ # Validation of colors, a 1-d sequence of nonnegative integers.
+ invalid_colors = False
+ try:
+ colors = np.asarray(colors)
+ if colors.ndim != 1:
+ invalid_colors = True
+ elif colors.size > 0 and not np.issubdtype(colors.dtype,
+ np.integer):
+ invalid_colors = True
+ elif np.any((colors < 0) | (colors > INT64_MAX)):
+ invalid_colors = True
+ except ValueError:
+ invalid_colors = True
+ if invalid_colors:
+ raise ValueError('colors must be a one-dimensional sequence '
+ 'of nonnegative integers not exceeding %d.' %
+ INT64_MAX)
+
+ colors = np.ascontiguousarray(colors, dtype=np.int64)
+ num_colors = colors.size
+
+ colors_ptr = <int64_t *> np.PyArray_DATA(colors)
+
+ total = _safe_sum_nonneg_int64(num_colors, colors_ptr)
+ if total == -1:
+ raise ValueError("sum(colors) must not exceed the maximum value "
+ "of a 64 bit signed integer (%d)" % INT64_MAX)
+
+ if method == 'marginals' and total >= 1000000000:
+ raise ValueError('When method is "marginals", sum(colors) must '
+ 'be less than 1000000000.')
+
+ # The C code that implements the 'count' method will malloc an
+ # array of size total*sizeof(size_t). Here we ensure that that
+ # product does not overflow.
+ if SIZE_MAX > <uint64_t>INT64_MAX:
+ max_index = INT64_MAX // sizeof(size_t)
+ else:
+ max_index = SIZE_MAX // sizeof(size_t)
+ if method == 'count' and total > max_index:
+ raise ValueError("When method is 'count', sum(colors) must not "
+ "exceed %d" % max_index)
+ if nsamp > total:
+ raise ValueError("nsample > sum(colors)")
+
+ # Figure out the shape of the return array.
+ if size is None:
+ shape = (num_colors,)
+ elif np.isscalar(size):
+ shape = (size, num_colors)
+ else:
+ shape = tuple(size) + (num_colors,)
+ variates = np.zeros(shape, dtype=np.int64)
+
+ if num_colors == 0:
+ return variates
+
+ # One variate is a vector of length num_colors.
+ num_variates = variates.size // num_colors
+ variates_ptr = <int64_t *> np.PyArray_DATA(variates)
+
+ if method == 'count':
+ with self.lock, nogil:
+ result = random_mvhg_count(&self._bitgen, total,
+ num_colors, colors_ptr, nsamp,
+ num_variates, variates_ptr)
+ if result == -1:
+ raise MemoryError("Insufficent memory for multivariate_"
+ "hypergeometric with method='count' and "
+ "sum(colors)=%d" % total)
+ else:
+ with self.lock, nogil:
+ random_mvhg_marginals(&self._bitgen, total,
+ num_colors, colors_ptr, nsamp,
+ num_variates, variates_ptr)
+ return variates
+
def dirichlet(self, object alpha, size=None):
"""
dirichlet(alpha, size=None)
diff --git a/numpy/random/include/distributions.h b/numpy/random/include/distributions.h
index fb69c7d2c..c02ea605e 100644
--- a/numpy/random/include/distributions.h
+++ b/numpy/random/include/distributions.h
@@ -170,6 +170,20 @@ DECLDIR void random_bounded_bool_fill(bitgen_t *bitgen_state, npy_bool off,
DECLDIR void random_multinomial(bitgen_t *bitgen_state, RAND_INT_TYPE n, RAND_INT_TYPE *mnix,
double *pix, npy_intp d, binomial_t *binomial);
+/* multivariate hypergeometric, "count" method */
+DECLDIR int random_mvhg_count(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates);
+
+/* multivariate hypergeometric, "marginals" method */
+DECLDIR void random_mvhg_marginals(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates);
+
/* Common to legacy-distributions.c and distributions.c but not exported */
RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state,
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index ddb16339b..ca01250f4 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -100,6 +100,8 @@ def configuration(parent_package='', top_path=None):
other_srcs = [
'src/distributions/logfactorial.c',
'src/distributions/distributions.c',
+ 'src/distributions/random_mvhg_count.c',
+ 'src/distributions/random_mvhg_marginals.c',
'src/distributions/random_hypergeometric.c',
]
for gen in ['_generator', '_bounded_integers']:
@@ -114,7 +116,6 @@ def configuration(parent_package='', top_path=None):
define_macros=defs,
)
config.add_extension('mtrand',
- # mtrand does not depend on random_hypergeometric.c.
sources=['mtrand.c',
'src/legacy/legacy-distributions.c',
'src/distributions/logfactorial.c',
diff --git a/numpy/random/src/distributions/random_mvhg_count.c b/numpy/random/src/distributions/random_mvhg_count.c
new file mode 100644
index 000000000..9c0cc045d
--- /dev/null
+++ b/numpy/random/src/distributions/random_mvhg_count.c
@@ -0,0 +1,131 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "include/distributions.h"
+
+/*
+ * random_mvhg_count
+ *
+ * Draw variates from the multivariate hypergeometric distribution--
+ * the "count" algorithm.
+ *
+ * Parameters
+ * ----------
+ * bitgen_t *bitgen_state
+ * Pointer to a `bitgen_t` instance.
+ * int64_t total
+ * The sum of the values in the array `colors`. (This is redundant
+ * information, but we know the caller has already computed it, so
+ * we might as well use it.)
+ * size_t num_colors
+ * The length of the `colors` array.
+ * int64_t *colors
+ * The array of colors (i.e. the number of each type in the collection
+ * from which the random variate is drawn).
+ * int64_t nsample
+ * The number of objects drawn without replacement for each variate.
+ * `nsample` must not exceed sum(colors). This condition is not checked;
+ * it is assumed that the caller has already validated the value.
+ * size_t num_variates
+ * The number of variates to be produced and put in the array
+ * pointed to by `variates`. One variate is a vector of length
+ * `num_colors`, so the array pointed to by `variates` must have length
+ * `num_variates * num_colors`.
+ * int64_t *variates
+ * The array that will hold the result. It must have length
+ * `num_variates * num_colors`.
+ * The array is not initialized in the function; it is expected that the
+ * array has been initialized with zeros when the function is called.
+ *
+ * Notes
+ * -----
+ * The "count" algorithm for drawing one variate is roughly equivalent to the
+ * following numpy code:
+ *
+ * choices = np.repeat(np.arange(len(colors)), colors)
+ * selection = np.random.choice(choices, nsample, replace=False)
+ * variate = np.bincount(selection, minlength=len(colors))
+ *
+ * This function uses a temporary array with length sum(colors).
+ *
+ * Assumptions on the arguments (not checked in the function):
+ * * colors[k] >= 0 for k in range(num_colors)
+ * * total = sum(colors)
+ * * 0 <= nsample <= total
+ * * the product total * sizeof(size_t) does not exceed SIZE_MAX
+ * * the product num_variates * num_colors does not overflow
+ */
+
+int random_mvhg_count(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates)
+{
+ size_t *choices;
+ bool more_than_half;
+
+ if ((total == 0) || (nsample == 0) || (num_variates == 0)) {
+ // Nothing to do.
+ return 0;
+ }
+
+ choices = malloc(total * (sizeof *choices));
+ if (choices == NULL) {
+ return -1;
+ }
+
+ /*
+ * If colors contains, for example, [3 2 5], then choices
+ * will contain [0 0 0 1 1 2 2 2 2 2].
+ */
+ for (size_t i = 0, k = 0; i < num_colors; ++i) {
+ for (int64_t j = 0; j < colors[i]; ++j) {
+ choices[k] = i;
+ ++k;
+ }
+ }
+
+ more_than_half = nsample > (total / 2);
+ if (more_than_half) {
+ nsample = total - nsample;
+ }
+
+ for (size_t i = 0; i < num_variates * num_colors; i += num_colors) {
+ /*
+ * Fisher-Yates shuffle, but only loop through the first
+ * `nsample` entries of `choices`. After the loop,
+ * choices[:nsample] contains a random sample from the
+ * the full array.
+ */
+ for (size_t j = 0; j < (size_t) nsample; ++j) {
+ size_t tmp, k;
+ // Note: nsample is not greater than total, so there is no danger
+ // of integer underflow in `(size_t) total - j - 1`.
+ k = j + (size_t) random_interval(bitgen_state,
+ (size_t) total - j - 1);
+ tmp = choices[k];
+ choices[k] = choices[j];
+ choices[j] = tmp;
+ }
+ /*
+ * Count the number of occurrences of each value in choices[:nsample].
+ * The result, stored in sample[i:i+num_colors], is the sample from
+ * the multivariate hypergeometric distribution.
+ */
+ for (size_t j = 0; j < (size_t) nsample; ++j) {
+ variates[i + choices[j]] += 1;
+ }
+
+ if (more_than_half) {
+ for (size_t k = 0; k < num_colors; ++k) {
+ variates[i + k] = colors[k] - variates[i + k];
+ }
+ }
+ }
+
+ free(choices);
+
+ return 0;
+}
diff --git a/numpy/random/src/distributions/random_mvhg_marginals.c b/numpy/random/src/distributions/random_mvhg_marginals.c
new file mode 100644
index 000000000..301a4acad
--- /dev/null
+++ b/numpy/random/src/distributions/random_mvhg_marginals.c
@@ -0,0 +1,138 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <math.h>
+
+#include "include/distributions.h"
+#include "logfactorial.h"
+
+
+/*
+ * random_mvhg_marginals
+ *
+ * Draw samples from the multivariate hypergeometric distribution--
+ * the "marginals" algorithm.
+ *
+ * This version generates the sample by iteratively calling
+ * hypergeometric() (the univariate hypergeometric distribution).
+ *
+ * Parameters
+ * ----------
+ * bitgen_t *bitgen_state
+ * Pointer to a `bitgen_t` instance.
+ * int64_t total
+ * The sum of the values in the array `colors`. (This is redundant
+ * information, but we know the caller has already computed it, so
+ * we might as well use it.)
+ * size_t num_colors
+ * The length of the `colors` array. The functions assumes
+ * num_colors > 0.
+ * int64_t *colors
+ * The array of colors (i.e. the number of each type in the collection
+ * from which the random variate is drawn).
+ * int64_t nsample
+ * The number of objects drawn without replacement for each variate.
+ * `nsample` must not exceed sum(colors). This condition is not checked;
+ * it is assumed that the caller has already validated the value.
+ * size_t num_variates
+ * The number of variates to be produced and put in the array
+ * pointed to by `variates`. One variate is a vector of length
+ * `num_colors`, so the array pointed to by `variates` must have length
+ * `num_variates * num_colors`.
+ * int64_t *variates
+ * The array that will hold the result. It must have length
+ * `num_variates * num_colors`.
+ * The array is not initialized in the function; it is expected that the
+ * array has been initialized with zeros when the function is called.
+ *
+ * Notes
+ * -----
+ * Here's an example that demonstrates the idea of this algorithm.
+ *
+ * Suppose the urn contains red, green, blue and yellow marbles.
+ * Let nred be the number of red marbles, and define the quantities for
+ * the other colors similarly. The total number of marbles is
+ *
+ * total = nred + ngreen + nblue + nyellow.
+ *
+ * To generate a sample using rk_hypergeometric:
+ *
+ * red_sample = hypergeometric(ngood=nred, nbad=total - nred,
+ * nsample=nsample)
+ *
+ * This gives us the number of red marbles in the sample. The number of
+ * marbles in the sample that are *not* red is nsample - red_sample.
+ * To figure out the distribution of those marbles, we again use
+ * rk_hypergeometric:
+ *
+ * green_sample = hypergeometric(ngood=ngreen,
+ * nbad=total - nred - ngreen,
+ * nsample=nsample - red_sample)
+ *
+ * Similarly,
+ *
+ * blue_sample = hypergeometric(
+ * ngood=nblue,
+ * nbad=total - nred - ngreen - nblue,
+ * nsample=nsample - red_sample - green_sample)
+ *
+ * Finally,
+ *
+ * yellow_sample = total - (red_sample + green_sample + blue_sample).
+ *
+ * The above sequence of steps is implemented as a loop for an arbitrary
+ * number of colors in the innermost loop in the code below. `remaining`
+ * is the value passed to `nbad`; it is `total - colors[0]` in the first
+ * call to random_hypergeometric(), and then decreases by `colors[j]` in
+ * each iteration. `num_to_sample` is the `nsample` argument. It
+ * starts at this function's `nsample` input, and is decreased by the
+ * result of the call to random_hypergeometric() in each iteration.
+ *
+ * Assumptions on the arguments (not checked in the function):
+ * * colors[k] >= 0 for k in range(num_colors)
+ * * total = sum(colors)
+ * * 0 <= nsample <= total
+ * * the product num_variates * num_colors does not overflow
+ */
+
+void random_mvhg_marginals(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates)
+{
+ bool more_than_half;
+
+ if ((total == 0) || (nsample == 0) || (num_variates == 0)) {
+ // Nothing to do.
+ return;
+ }
+
+ more_than_half = nsample > (total / 2);
+ if (more_than_half) {
+ nsample = total - nsample;
+ }
+
+ for (size_t i = 0; i < num_variates * num_colors; i += num_colors) {
+ int64_t num_to_sample = nsample;
+ int64_t remaining = total;
+ for (size_t j = 0; (num_to_sample > 0) && (j + 1 < num_colors); ++j) {
+ int64_t r;
+ remaining -= colors[j];
+ r = random_hypergeometric(bitgen_state,
+ colors[j], remaining, num_to_sample);
+ variates[i + j] = r;
+ num_to_sample -= r;
+ }
+
+ if (num_to_sample > 0) {
+ variates[i + num_colors - 1] = num_to_sample;
+ }
+
+ if (more_than_half) {
+ for (size_t k = 0; k < num_colors; ++k) {
+ variates[i + k] = colors[k] - variates[i + k];
+ }
+ }
+ }
+}
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 391c33c1a..526275dda 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -4,7 +4,7 @@ import pytest
import numpy as np
from numpy.testing import (
- assert_, assert_raises, assert_equal,
+ assert_, assert_raises, assert_equal, assert_allclose,
assert_warns, assert_no_warnings, assert_array_equal,
assert_array_almost_equal, suppress_warnings)
@@ -115,6 +115,140 @@ class TestMultinomial(object):
assert_array_equal(non_contig, contig)
+class TestMultivariateHypergeometric(object):
+
+ def setup(self):
+ self.seed = 8675309
+
+ def test_argument_validation(self):
+ # Error cases...
+
+ # `colors` must be a 1-d sequence
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ 10, 4)
+
+ # Negative nsample
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [2, 3, 4], -1)
+
+ # Negative color
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [-1, 2, 3], 2)
+
+ # nsample exceeds sum(colors)
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [2, 3, 4], 10)
+
+ # nsample exceeds sum(colors) (edge case of empty colors)
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [], 1)
+
+ # Validation errors associated with very large values in colors.
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [999999999, 101], 5, 1, 'marginals')
+
+ int64_info = np.iinfo(np.int64)
+ max_int64 = int64_info.max
+ max_int64_index = max_int64 // int64_info.dtype.itemsize
+ assert_raises(ValueError, random.multivariate_hypergeometric,
+ [max_int64_index - 100, 101], 5, 1, 'count')
+
+ @pytest.mark.parametrize('method', ['count', 'marginals'])
+ def test_edge_cases(self, method):
+ # Set the seed, but in fact, all the results in this test are
+ # deterministic, so we don't really need this.
+ random = Generator(MT19937(self.seed))
+
+ x = random.multivariate_hypergeometric([0, 0, 0], 0, method=method)
+ assert_array_equal(x, [0, 0, 0])
+
+ x = random.multivariate_hypergeometric([], 0, method=method)
+ assert_array_equal(x, [])
+
+ x = random.multivariate_hypergeometric([], 0, size=1, method=method)
+ assert_array_equal(x, np.empty((1, 0), dtype=np.int64))
+
+ x = random.multivariate_hypergeometric([1, 2, 3], 0, method=method)
+ assert_array_equal(x, [0, 0, 0])
+
+ x = random.multivariate_hypergeometric([9, 0, 0], 3, method=method)
+ assert_array_equal(x, [3, 0, 0])
+
+ colors = [1, 1, 0, 1, 1]
+ x = random.multivariate_hypergeometric(colors, sum(colors),
+ method=method)
+ assert_array_equal(x, colors)
+
+ x = random.multivariate_hypergeometric([3, 4, 5], 12, size=3,
+ method=method)
+ assert_array_equal(x, [[3, 4, 5]]*3)
+
+ # Cases for nsample:
+ # nsample < 10
+ # 10 <= nsample < colors.sum()/2
+ # colors.sum()/2 < nsample < colors.sum() - 10
+ # colors.sum() - 10 < nsample < colors.sum()
+ @pytest.mark.parametrize('nsample', [8, 25, 45, 55])
+ @pytest.mark.parametrize('method', ['count', 'marginals'])
+ @pytest.mark.parametrize('size', [5, (2, 3), 150000])
+ def test_typical_cases(self, nsample, method, size):
+ random = Generator(MT19937(self.seed))
+
+ colors = np.array([10, 5, 20, 25])
+ sample = random.multivariate_hypergeometric(colors, nsample, size,
+ method=method)
+ if isinstance(size, int):
+ expected_shape = (size,) + colors.shape
+ else:
+ expected_shape = size + colors.shape
+ assert_equal(sample.shape, expected_shape)
+ assert_((sample >= 0).all())
+ assert_((sample <= colors).all())
+ assert_array_equal(sample.sum(axis=-1),
+ np.full(size, fill_value=nsample, dtype=int))
+ if isinstance(size, int) and size >= 100000:
+ # This sample is large enough to compare its mean to
+ # the expected values.
+ assert_allclose(sample.mean(axis=0),
+ nsample * colors / colors.sum(),
+ rtol=1e-3, atol=0.005)
+
+ def test_repeatability1(self):
+ random = Generator(MT19937(self.seed))
+ sample = random.multivariate_hypergeometric([3, 4, 5], 5, size=5,
+ method='count')
+ expected = np.array([[2, 1, 2],
+ [2, 1, 2],
+ [1, 1, 3],
+ [2, 0, 3],
+ [2, 1, 2]])
+ assert_array_equal(sample, expected)
+
+ def test_repeatability2(self):
+ random = Generator(MT19937(self.seed))
+ sample = random.multivariate_hypergeometric([20, 30, 50], 50,
+ size=5,
+ method='marginals')
+ expected = np.array([[ 9, 17, 24],
+ [ 7, 13, 30],
+ [ 9, 15, 26],
+ [ 9, 17, 24],
+ [12, 14, 24]])
+ assert_array_equal(sample, expected)
+
+ def test_repeatability3(self):
+ random = Generator(MT19937(self.seed))
+ sample = random.multivariate_hypergeometric([20, 30, 50], 12,
+ size=5,
+ method='marginals')
+ expected = np.array([[2, 3, 7],
+ [5, 3, 4],
+ [2, 5, 5],
+ [5, 3, 4],
+ [1, 5, 6]])
+ assert_array_equal(sample, expected)
+
+
class TestSetState(object):
def setup(self):
self.seed = 1234567890