summaryrefslogtreecommitdiff
path: root/numpy/random
diff options
context:
space:
mode:
authorWarren Weckesser <warren.weckesser@gmail.com>2019-06-10 23:00:39 -0400
committerWarren Weckesser <warren.weckesser@gmail.com>2019-06-14 16:14:31 -0400
commitb2d2b677c26c8da7052bbc653b20ad9717f078fe (patch)
treeefbeb7af0d5ae96d7ee61cafc8625c937dad5d2b /numpy/random
parente3eb3986dd87e700a694d6b4151c96ef92dfabe0 (diff)
downloadnumpy-b2d2b677c26c8da7052bbc653b20ad9717f078fe.tar.gz
MAINT: random: Rewrite the hypergeometric distribution.
Summary of the changes: * Move the functions random_hypergeometric_hyp, random_hypergeometric_hrua and random_hypergeometric from distributions.c to legacy-distributions.c. These are now the legacy implementation of hypergeometric. * Add the files logfactorial.c and logfactorial.h, containing the function logfactorial(int64_t k). * Add the files random_hypergeometric.c and random_hypergeometric.h, containing the function random_hypergeometric (the new implementation of the hypergeometric distribution). See more details below. * Fix two tests in numpy/random/tests/test_generator_mt19937.py that used values returned by the hypergeometric distribution. The new implementation changes the stream, so those tests needed to be updated. * Remove another test obviated by an added constraint on the arguments of hypergeometric. Details of the rewrite: If you carefully step through the old function rk_hypergeometric_hyp(), you'll see that the end result is basically the same as the new function hypergeometric_sample(), but the new function accomplishes the result with just integers. The floating point calculations in the old code caused problems when the arguments were extremely large (explained in more detail in the unmerged pull request https://github.com/numpy/numpy/pull/9834). The new version of hypergeometric_hrua() is a new translation of Stadlober's ratio-of-uniforms algorithm for the hypergeometric distribution. It fixes a mistake in the old implementation that made the method less efficient than it could be (see the details in the unmerged pull request https://github.com/numpy/numpy/pull/11138), and uses a faster function for computing log(k!). The HRUA algorithm suffers from loss of floating point precision when the arguments are *extremely* large (see the comments in github issue 11443). To avoid these problems, the arguments `ngood` and `nbad` of hypergeometric must be less than 10**9. This constraint obviates an existing regression test that was run on systems with 64 bit long integers, so that test was removed.
Diffstat (limited to 'numpy/random')
-rw-r--r--numpy/random/generator.pyx25
-rw-r--r--numpy/random/setup.py11
-rw-r--r--numpy/random/src/distributions/distributions.c105
-rw-r--r--numpy/random/src/distributions/distributions.h6
-rw-r--r--numpy/random/src/distributions/logfactorial.c158
-rw-r--r--numpy/random/src/distributions/logfactorial.h9
-rw-r--r--numpy/random/src/distributions/random_hypergeometric.c260
-rw-r--r--numpy/random/src/legacy/legacy-distributions.c118
-rw-r--r--numpy/random/tests/test_generator_mt19937.py13
-rw-r--r--numpy/random/tests/test_generator_mt19937_regressions.py11
10 files changed, 592 insertions, 124 deletions
diff --git a/numpy/random/generator.pyx b/numpy/random/generator.pyx
index 4e2b18f44..2d2979441 100644
--- a/numpy/random/generator.pyx
+++ b/numpy/random/generator.pyx
@@ -3095,9 +3095,11 @@ cdef class Generator:
Parameters
----------
ngood : int or array_like of ints
- Number of ways to make a good selection. Must be nonnegative.
+ Number of ways to make a good selection. Must be nonnegative and
+ less than 10**9.
nbad : int or array_like of ints
- Number of ways to make a bad selection. Must be nonnegative.
+ Number of ways to make a bad selection. Must be nonnegative and
+ less than 10**9.
nsample : int or array_like of ints
Number of items sampled. Must be nonnegative and less than
``ngood + nbad``.
@@ -3142,6 +3144,13 @@ cdef class Generator:
replacement (or the sample space is infinite). As the sample space
becomes large, this distribution approaches the binomial.
+ The arguments `ngood` and `nbad` each must be less than `10**9`. For
+ extremely large arguments, the algorithm that is used to compute the
+ samples [4]_ breaks down because of loss of precision in floating point
+ calculations. For such large values, if `nsample` is not also large,
+ the distribution can be approximated with the binomial distribution,
+ `binomial(n=nsample, p=ngood/(ngood + nbad))`.
+
References
----------
.. [1] Lentner, Marvin, "Elementary Applied Statistics", Bogden
@@ -3151,6 +3160,9 @@ cdef class Generator:
http://mathworld.wolfram.com/HypergeometricDistribution.html
.. [3] Wikipedia, "Hypergeometric distribution",
https://en.wikipedia.org/wiki/Hypergeometric_distribution
+ .. [4] Stadlober, Ernst, "The ratio of uniforms approach for generating
+ discrete random variates", Journal of Computational and Applied
+ Mathematics, 31, pp. 181-189 (1990).
Examples
--------
@@ -3172,6 +3184,7 @@ cdef class Generator:
# answer = 0.003 ... pretty unlikely!
"""
+ DEF HYPERGEOM_MAX = 10**9
cdef bint is_scalar = True
cdef np.ndarray ongood, onbad, onsample
cdef int64_t lngood, lnbad, lnsample
@@ -3186,6 +3199,9 @@ cdef class Generator:
lnbad = <int64_t>nbad
lnsample = <int64_t>nsample
+ if lngood >= HYPERGEOM_MAX or lnbad >= HYPERGEOM_MAX:
+ raise ValueError("both ngood and nbad must be less than %d" %
+ HYPERGEOM_MAX)
if lngood + lnbad < lnsample:
raise ValueError("ngood + nbad < nsample")
return disc(&random_hypergeometric, &self._bitgen, size, self.lock, 0, 3,
@@ -3193,8 +3209,13 @@ cdef class Generator:
lnbad, 'nbad', CONS_NON_NEGATIVE,
lnsample, 'nsample', CONS_NON_NEGATIVE)
+ if np.any(ongood >= HYPERGEOM_MAX) or np.any(onbad >= HYPERGEOM_MAX):
+ raise ValueError("both ngood and nbad must be less than %d" %
+ HYPERGEOM_MAX)
+
if np.any(np.less(np.add(ongood, onbad), onsample)):
raise ValueError("ngood + nbad < nsample")
+
return discrete_broadcast_iii(&random_hypergeometric, &self._bitgen, size, self.lock,
ongood, 'ngood', CONS_NON_NEGATIVE,
onbad, 'nbad', CONS_NON_NEGATIVE,
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index 5f467a2f5..605134704 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -126,12 +126,15 @@ def configuration(parent_package='', top_path=None):
depends=['%s.pyx' % gen],
define_macros=defs,
)
+ other_srcs = [
+ 'src/distributions/logfactorial.c',
+ 'src/distributions/distributions.c',
+ 'src/distributions/random_hypergeometric.c',
+ ]
for gen in ['generator', 'bounded_integers']:
# gen.pyx, src/distributions/distributions.c
config.add_extension(gen,
- sources=['{0}.c'.format(gen),
- join('src', 'distributions',
- 'distributions.c')],
+ sources=['{0}.c'.format(gen)] + other_srcs,
libraries=EXTRA_LIBRARIES,
extra_compile_args=EXTRA_COMPILE_ARGS,
include_dirs=['.', 'src'],
@@ -140,8 +143,10 @@ def configuration(parent_package='', top_path=None):
define_macros=defs,
)
config.add_extension('mtrand',
+ # mtrand does not depend on random_hypergeometric.c.
sources=['mtrand.c',
'src/legacy/legacy-distributions.c',
+ 'src/distributions/logfactorial.c',
'src/distributions/distributions.c'],
include_dirs=['.', 'src', 'src/legacy'],
libraries=EXTRA_LIBRARIES,
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index def29d850..c0550ad8e 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -1,5 +1,6 @@
#include "distributions.h"
#include "ziggurat_constants.h"
+#include "logfactorial.h"
#if defined(_MSC_VER) && defined(_WIN64)
#include <intrin.h>
@@ -468,8 +469,11 @@ uint64_t random_uint(bitgen_t *bitgen_state) {
* log-gamma function to support some of these distributions. The
* algorithm comes from SPECFUN by Shanjie Zhang and Jianming Jin and their
* book "Computation of Special Functions", 1996, John Wiley & Sons, Inc.
+ *
+ * If loggam(k+1) is being used to compute log(k!) for an integer k, consider
+ * using logfactorial(k) instead.
*/
-static double loggam(double x) {
+double loggam(double x) {
double x0, x2, xp, gl, gl0;
RAND_INT_TYPE k, n;
@@ -1127,105 +1131,6 @@ double random_triangular(bitgen_t *bitgen_state, double left, double mode,
}
}
-RAND_INT_TYPE random_hypergeometric_hyp(bitgen_t *bitgen_state,
- RAND_INT_TYPE good, RAND_INT_TYPE bad,
- RAND_INT_TYPE sample) {
- RAND_INT_TYPE d1, k, z;
- double d2, u, y;
-
- d1 = bad + good - sample;
- d2 = (double)MIN(bad, good);
-
- y = d2;
- k = sample;
- while (y > 0.0) {
- u = next_double(bitgen_state);
- y -= (RAND_INT_TYPE)floor(u + y / (d1 + k));
- k--;
- if (k == 0)
- break;
- }
- z = (RAND_INT_TYPE)(d2 - y);
- if (good > bad)
- z = sample - z;
- return z;
-}
-
-/* D1 = 2*sqrt(2/e) */
-/* D2 = 3 - 2*sqrt(3/e) */
-#define D1 1.7155277699214135
-#define D2 0.8989161620588988
-RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state,
- RAND_INT_TYPE good, RAND_INT_TYPE bad,
- RAND_INT_TYPE sample) {
- RAND_INT_TYPE mingoodbad, maxgoodbad, popsize, m, d9;
- double d4, d5, d6, d7, d8, d10, d11;
- RAND_INT_TYPE Z;
- double T, W, X, Y;
-
- mingoodbad = MIN(good, bad);
- popsize = good + bad;
- maxgoodbad = MAX(good, bad);
- m = MIN(sample, popsize - sample);
- d4 = ((double)mingoodbad) / popsize;
- d5 = 1.0 - d4;
- d6 = m * d4 + 0.5;
- d7 = sqrt((double)(popsize - m) * sample * d4 * d5 / (popsize - 1) + 0.5);
- d8 = D1 * d7 + D2;
- d9 = (RAND_INT_TYPE)floor((double)(m + 1) * (mingoodbad + 1) / (popsize + 2));
- d10 = (loggam(d9 + 1) + loggam(mingoodbad - d9 + 1) + loggam(m - d9 + 1) +
- loggam(maxgoodbad - m + d9 + 1));
- d11 = MIN(MIN(m, mingoodbad) + 1.0, floor(d6 + 16 * d7));
- /* 16 for 16-decimal-digit precision in D1 and D2 */
-
- while (1) {
- X = next_double(bitgen_state);
- Y = next_double(bitgen_state);
- W = d6 + d8 * (Y - 0.5) / X;
-
- /* fast rejection: */
- if ((W < 0.0) || (W >= d11))
- continue;
-
- Z = (RAND_INT_TYPE)floor(W);
- T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) + loggam(m - Z + 1) +
- loggam(maxgoodbad - m + Z + 1));
-
- /* fast acceptance: */
- if ((X * (4.0 - X) - 3.0) <= T)
- break;
-
- /* fast rejection: */
- if (X * (X - T) >= 1)
- continue;
- /* log(0.0) is ok here, since always accept */
- if (2.0 * log(X) <= T)
- break; /* acceptance */
- }
-
- /* this is a correction to HRUA* by Ivan Frohne in rv.py */
- if (good > bad)
- Z = m - Z;
-
- /* another fix from rv.py to allow sample to exceed popsize/2 */
- if (m < sample)
- Z = good - Z;
-
- return Z;
-}
-#undef D1
-#undef D2
-
-RAND_INT_TYPE random_hypergeometric(bitgen_t *bitgen_state, RAND_INT_TYPE good,
- RAND_INT_TYPE bad, RAND_INT_TYPE sample) {
- if (sample > 10) {
- return random_hypergeometric_hrua(bitgen_state, good, bad, sample);
- } else if (sample > 0) {
- return random_hypergeometric_hyp(bitgen_state, good, bad, sample);
- } else {
- return 0;
- }
-}
uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) {
uint64_t mask, value;
diff --git a/numpy/random/src/distributions/distributions.h b/numpy/random/src/distributions/distributions.h
index e43a5279c..3178725af 100644
--- a/numpy/random/src/distributions/distributions.h
+++ b/numpy/random/src/distributions/distributions.h
@@ -84,6 +84,8 @@ static NPY_INLINE double next_double(bitgen_t *bitgen_state) {
return bitgen_state->next_double(bitgen_state->state);
}
+DECLDIR double loggam(double x);
+
DECLDIR float random_float(bitgen_t *bitgen_state);
DECLDIR double random_double(bitgen_t *bitgen_state);
DECLDIR void random_double_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out);
@@ -160,8 +162,8 @@ DECLDIR RAND_INT_TYPE random_geometric_search(bitgen_t *bitgen_state, double p);
DECLDIR RAND_INT_TYPE random_geometric_inversion(bitgen_t *bitgen_state, double p);
DECLDIR RAND_INT_TYPE random_geometric(bitgen_t *bitgen_state, double p);
DECLDIR RAND_INT_TYPE random_zipf(bitgen_t *bitgen_state, double a);
-DECLDIR RAND_INT_TYPE random_hypergeometric(bitgen_t *bitgen_state, RAND_INT_TYPE good,
- RAND_INT_TYPE bad, RAND_INT_TYPE sample);
+DECLDIR int64_t random_hypergeometric(bitgen_t *bitgen_state,
+ int64_t good, int64_t bad, int64_t sample);
DECLDIR uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max);
diff --git a/numpy/random/src/distributions/logfactorial.c b/numpy/random/src/distributions/logfactorial.c
new file mode 100644
index 000000000..130516469
--- /dev/null
+++ b/numpy/random/src/distributions/logfactorial.c
@@ -0,0 +1,158 @@
+
+#include <math.h>
+#include <stdint.h>
+
+/*
+ * logfact[k] holds log(k!) for k = 0, 1, 2, ..., 125.
+ */
+
+static const double logfact[] = {
+ 0,
+ 0,
+ 0.69314718055994529,
+ 1.791759469228055,
+ 3.1780538303479458,
+ 4.7874917427820458,
+ 6.5792512120101012,
+ 8.5251613610654147,
+ 10.604602902745251,
+ 12.801827480081469,
+ 15.104412573075516,
+ 17.502307845873887,
+ 19.987214495661885,
+ 22.552163853123425,
+ 25.19122118273868,
+ 27.89927138384089,
+ 30.671860106080672,
+ 33.505073450136891,
+ 36.395445208033053,
+ 39.339884187199495,
+ 42.335616460753485,
+ 45.380138898476908,
+ 48.471181351835227,
+ 51.606675567764377,
+ 54.784729398112319,
+ 58.003605222980518,
+ 61.261701761002001,
+ 64.557538627006338,
+ 67.88974313718154,
+ 71.257038967168015,
+ 74.658236348830158,
+ 78.092223553315307,
+ 81.557959456115043,
+ 85.054467017581516,
+ 88.580827542197682,
+ 92.136175603687093,
+ 95.719694542143202,
+ 99.330612454787428,
+ 102.96819861451381,
+ 106.63176026064346,
+ 110.32063971475739,
+ 114.03421178146171,
+ 117.77188139974507,
+ 121.53308151543864,
+ 125.3172711493569,
+ 129.12393363912722,
+ 132.95257503561632,
+ 136.80272263732635,
+ 140.67392364823425,
+ 144.5657439463449,
+ 148.47776695177302,
+ 152.40959258449735,
+ 156.3608363030788,
+ 160.3311282166309,
+ 164.32011226319517,
+ 168.32744544842765,
+ 172.35279713916279,
+ 176.39584840699735,
+ 180.45629141754378,
+ 184.53382886144948,
+ 188.6281734236716,
+ 192.7390472878449,
+ 196.86618167289001,
+ 201.00931639928152,
+ 205.1681994826412,
+ 209.34258675253685,
+ 213.53224149456327,
+ 217.73693411395422,
+ 221.95644181913033,
+ 226.1905483237276,
+ 230.43904356577696,
+ 234.70172344281826,
+ 238.97838956183432,
+ 243.26884900298271,
+ 247.57291409618688,
+ 251.89040220972319,
+ 256.22113555000954,
+ 260.56494097186322,
+ 264.92164979855278,
+ 269.29109765101981,
+ 273.67312428569369,
+ 278.06757344036612,
+ 282.4742926876304,
+ 286.89313329542699,
+ 291.32395009427029,
+ 295.76660135076065,
+ 300.22094864701415,
+ 304.68685676566872,
+ 309.1641935801469,
+ 313.65282994987905,
+ 318.1526396202093,
+ 322.66349912672615,
+ 327.1852877037752,
+ 331.71788719692847,
+ 336.26118197919845,
+ 340.81505887079902,
+ 345.37940706226686,
+ 349.95411804077025,
+ 354.53908551944079,
+ 359.1342053695754,
+ 363.73937555556347,
+ 368.35449607240474,
+ 372.97946888568902,
+ 377.61419787391867,
+ 382.25858877306001,
+ 386.91254912321756,
+ 391.57598821732961,
+ 396.24881705179155,
+ 400.93094827891576,
+ 405.6222961611449,
+ 410.32277652693733,
+ 415.03230672824964,
+ 419.75080559954472,
+ 424.47819341825709,
+ 429.21439186665157,
+ 433.95932399501481,
+ 438.71291418612117,
+ 443.47508812091894,
+ 448.24577274538461,
+ 453.02489623849613,
+ 457.81238798127816,
+ 462.60817852687489,
+ 467.4121995716082,
+ 472.22438392698058,
+ 477.04466549258564,
+ 481.87297922988796
+};
+
+/*
+ * Compute log(k!)
+ */
+
+double logfactorial(int64_t k)
+{
+ const double halfln2pi = 0.9189385332046728;
+
+ if (k < (int64_t) (sizeof(logfact)/sizeof(logfact[0]))) {
+ /* Use the lookup table. */
+ return logfact[k];
+ }
+
+ /*
+ * Use the Stirling series, truncated at the 1/k**3 term.
+ * (In a Python implementation of this approximation, the result
+ * was within 2 ULP of the best 64 bit floating point value for
+ * k up to 10000000.)
+ */
+ return (k + 0.5)*log(k) - k + (halfln2pi + (1.0/k)*(1/12.0 - 1/(360.0*k*k)));
+}
diff --git a/numpy/random/src/distributions/logfactorial.h b/numpy/random/src/distributions/logfactorial.h
new file mode 100644
index 000000000..1fedef3f6
--- /dev/null
+++ b/numpy/random/src/distributions/logfactorial.h
@@ -0,0 +1,9 @@
+
+#ifndef LOGFACTORIAL_H
+#define LOGFACTORIAL_H
+
+#include <stdint.h>
+
+double logfactorial(int64_t k);
+
+#endif
diff --git a/numpy/random/src/distributions/random_hypergeometric.c b/numpy/random/src/distributions/random_hypergeometric.c
new file mode 100644
index 000000000..59a3a4b9b
--- /dev/null
+++ b/numpy/random/src/distributions/random_hypergeometric.c
@@ -0,0 +1,260 @@
+#include <stdint.h>
+#include "distributions.h"
+#include "logfactorial.h"
+
+/*
+ * Generate a sample from the hypergeometric distribution.
+ *
+ * Assume sample is not greater than half the total. See below
+ * for how the opposite case is handled.
+ *
+ * We initialize the following:
+ * computed_sample = sample
+ * remaining_good = good
+ * remaining_total = good + bad
+ *
+ * In the loop:
+ * * computed_sample counts down to 0;
+ * * remaining_good is the number of good choices not selected yet;
+ * * remaining_total is the total number of choices not selected yet.
+ *
+ * In the loop, we select items by choosing a random integer in
+ * the interval [0, remaining_total), and if the value is less
+ * than remaining_good, it means we have selected a good one,
+ * so remaining_good is decremented. Then, regardless of that
+ * result, computed_sample is decremented. The loop continues
+ * until either computed_sample is 0, remaining_good is 0, or
+ * remaining_total == remaining_good. In the latter case, it
+ * means there are only good choices left, so we can stop the
+ * loop early and select what is left of computed_sample from
+ * the good choices (i.e. decrease remaining_good by computed_sample).
+ *
+ * When the loop exits, the actual number of good choices is
+ * good - remaining_good.
+ *
+ * If sample is more than half the total, then initially we set
+ * computed_sample = total - sample
+ * and at the end we return remaining_good (i.e. the loop in effect
+ * selects the complement of the result).
+ *
+ * It is assumed that when this function is called:
+ * * good, bad and sample are nonnegative;
+ * * the sum good+bad will not result in overflow;
+ * * sample <= good+bad.
+ */
+
+static int64_t hypergeometric_sample(bitgen_t *bitgen_state,
+ int64_t good, int64_t bad, int64_t sample)
+{
+ int64_t remaining_total, remaining_good, result, computed_sample;
+ int64_t total = good + bad;
+
+ if (sample > total/2) {
+ computed_sample = total - sample;
+ }
+ else {
+ computed_sample = sample;
+ }
+
+ remaining_total = total;
+ remaining_good = good;
+
+ while ((computed_sample > 0) && (remaining_good > 0) &&
+ (remaining_total > remaining_good)) {
+ // random_interval(bitgen_state, max) returns an integer in
+ // [0, max] *inclusive*, so we decrement remaining_total before
+ // passing it to random_interval().
+ --remaining_total;
+ if ((int64_t) random_interval(bitgen_state,
+ remaining_total) < remaining_good) {
+ // Selected a "good" one, so decrement remaining_good.
+ --remaining_good;
+ }
+ --computed_sample;
+ }
+
+ if (remaining_total == remaining_good) {
+ // Only "good" choices are left.
+ remaining_good -= computed_sample;
+ }
+
+ if (sample > total/2) {
+ result = remaining_good;
+ }
+ else {
+ result = good - remaining_good;
+ }
+
+ return result;
+}
+
+
+// D1 = 2*sqrt(2/e)
+// D2 = 3 - 2*sqrt(3/e)
+#define D1 1.7155277699214135
+#define D2 0.8989161620588988
+
+/*
+ * Generate variates from the hypergeometric distribution
+ * using the ratio-of-uniforms method.
+ *
+ * In the code, the variable names a, b, c, g, h, m, p, q, K, T,
+ * U and X match the names used in "Algorithm HRUA" beginning on
+ * page 82 of Stadlober's 1989 thesis.
+ *
+ * It is assumed that when this function is called:
+ * * good, bad and sample are nonnegative;
+ * * the sum good+bad will not result in overflow;
+ * * sample <= good+bad.
+ *
+ * References:
+ * - Ernst Stadlober's thesis "Sampling from Poisson, Binomial and
+ * Hypergeometric Distributions: Ratio of Uniforms as a Simple and
+ * Fast Alternative" (1989)
+ * - Ernst Stadlober, "The ratio of uniforms approach for generating
+ * discrete random variates", Journal of Computational and Applied
+ * Mathematics, 31, pp. 181-189 (1990).
+ */
+
+static int64_t hypergeometric_hrua(bitgen_t *bitgen_state,
+ int64_t good, int64_t bad, int64_t sample)
+{
+ int64_t mingoodbad, maxgoodbad, popsize;
+ int64_t computed_sample;
+ double p, q;
+ double mu, var;
+ double a, c, b, h, g;
+ int64_t m, K;
+
+ popsize = good + bad;
+ computed_sample = MIN(sample, popsize - sample);
+ mingoodbad = MIN(good, bad);
+ maxgoodbad = MAX(good, bad);
+
+ /*
+ * Variables that do not match Stadlober (1989)
+ * Here Stadlober
+ * ---------------- ---------
+ * mingoodbad M
+ * popsize N
+ * computed_sample n
+ */
+
+ p = ((double) mingoodbad) / popsize;
+ q = ((double) maxgoodbad) / popsize;
+
+ // mu is the mean of the distribution.
+ mu = computed_sample * p;
+
+ a = mu + 0.5;
+
+ // var is the variance of the distribution.
+ var = ((double)(popsize - computed_sample) *
+ computed_sample * p * q / (popsize - 1));
+
+ c = sqrt(var + 0.5);
+
+ /*
+ * h is 2*s_hat (See Stadlober's theses (1989), Eq. (5.17); or
+ * Stadlober (1990), Eq. 8). s_hat is the scale of the "table mountain"
+ * function that dominates the scaled hypergeometric PMF ("scaled" means
+ * normalized to have a maximum value of 1).
+ */
+ h = D1*c + D2;
+
+ m = (int64_t) floor((double)(computed_sample + 1) * (mingoodbad + 1) /
+ (popsize + 2));
+
+ g = (logfactorial(m) +
+ logfactorial(mingoodbad - m) +
+ logfactorial(computed_sample - m) +
+ logfactorial(maxgoodbad - computed_sample + m));
+
+ /*
+ * b is the upper bound for random samples:
+ * ... min(computed_sample, mingoodbad) + 1 is the length of the support.
+ * ... floor(a + 16*c) is 16 standard deviations beyond the mean.
+ *
+ * The idea behind the second upper bound is that values that far out in
+ * the tail have negligible probabilities.
+ *
+ * There is a comment in a previous version of this algorithm that says
+ * "16 for 16-decimal-digit precision in D1 and D2",
+ * but there is no documented justification for this value. A lower value
+ * might work just as well, but I've kept the value 16 here.
+ */
+ b = MIN(MIN(computed_sample, mingoodbad) + 1, floor(a + 16*c));
+
+ while (1) {
+ double U, V, X, T;
+ double gp;
+ U = random_double(bitgen_state);
+ V = random_double(bitgen_state); // "U star" in Stadlober (1989)
+ X = a + h*(V - 0.5) / U;
+
+ // fast rejection:
+ if ((X < 0.0) || (X >= b)) {
+ continue;
+ }
+
+ K = (int64_t) floor(X);
+
+ gp = (logfactorial(K) +
+ logfactorial(mingoodbad - K) +
+ logfactorial(computed_sample - K) +
+ logfactorial(maxgoodbad - computed_sample + K));
+
+ T = g - gp;
+
+ // fast acceptance:
+ if ((U*(4.0 - U) - 3.0) <= T) {
+ break;
+ }
+
+ // fast rejection:
+ if (U*(U - T) >= 1) {
+ continue;
+ }
+
+ if (2.0*log(U) <= T) {
+ // acceptance
+ break;
+ }
+ }
+
+ if (good > bad) {
+ K = computed_sample - K;
+ }
+
+ if (computed_sample < sample) {
+ K = good - K;
+ }
+
+ return K;
+}
+
+
+/*
+ * Draw a sample from the hypergeometric distribution.
+ *
+ * It is assumed that when this function is called:
+ * * good, bad and sample are nonnegative;
+ * * the sum good+bad will not result in overflow;
+ * * sample <= good+bad.
+ */
+
+int64_t random_hypergeometric(bitgen_t *bitgen_state,
+ int64_t good, int64_t bad, int64_t sample)
+{
+ int64_t r;
+
+ if ((sample >= 10) && (sample <= good + bad - 10)) {
+ // This will use the ratio-of-uniforms method.
+ r = hypergeometric_hrua(bitgen_state, good, bad, sample);
+ }
+ else {
+ // The simpler implementation is faster for small samples.
+ r = hypergeometric_sample(bitgen_state, good, bad, sample);
+ }
+ return r;
+}
diff --git a/numpy/random/src/legacy/legacy-distributions.c b/numpy/random/src/legacy/legacy-distributions.c
index 99665b370..4741a0352 100644
--- a/numpy/random/src/legacy/legacy-distributions.c
+++ b/numpy/random/src/legacy/legacy-distributions.c
@@ -1,5 +1,6 @@
#include "legacy-distributions.h"
+
static NPY_INLINE double legacy_double(aug_bitgen_t *aug_state) {
return aug_state->bit_generator->next_double(aug_state->bit_generator->state);
}
@@ -213,6 +214,113 @@ double legacy_exponential(aug_bitgen_t *aug_state, double scale) {
return scale * legacy_standard_exponential(aug_state);
}
+
+static RAND_INT_TYPE random_hypergeometric_hyp(bitgen_t *bitgen_state,
+ RAND_INT_TYPE good,
+ RAND_INT_TYPE bad,
+ RAND_INT_TYPE sample) {
+ RAND_INT_TYPE d1, k, z;
+ double d2, u, y;
+
+ d1 = bad + good - sample;
+ d2 = (double)MIN(bad, good);
+
+ y = d2;
+ k = sample;
+ while (y > 0.0) {
+ u = next_double(bitgen_state);
+ y -= (RAND_INT_TYPE)floor(u + y / (d1 + k));
+ k--;
+ if (k == 0)
+ break;
+ }
+ z = (RAND_INT_TYPE)(d2 - y);
+ if (good > bad)
+ z = sample - z;
+ return z;
+}
+
+/* D1 = 2*sqrt(2/e) */
+/* D2 = 3 - 2*sqrt(3/e) */
+#define D1 1.7155277699214135
+#define D2 0.8989161620588988
+static RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state,
+ RAND_INT_TYPE good,
+ RAND_INT_TYPE bad,
+ RAND_INT_TYPE sample) {
+ RAND_INT_TYPE mingoodbad, maxgoodbad, popsize, m, d9;
+ double d4, d5, d6, d7, d8, d10, d11;
+ RAND_INT_TYPE Z;
+ double T, W, X, Y;
+
+ mingoodbad = MIN(good, bad);
+ popsize = good + bad;
+ maxgoodbad = MAX(good, bad);
+ m = MIN(sample, popsize - sample);
+ d4 = ((double)mingoodbad) / popsize;
+ d5 = 1.0 - d4;
+ d6 = m * d4 + 0.5;
+ d7 = sqrt((double)(popsize - m) * sample * d4 * d5 / (popsize - 1) + 0.5);
+ d8 = D1 * d7 + D2;
+ d9 = (RAND_INT_TYPE)floor((double)(m + 1) * (mingoodbad + 1) / (popsize + 2));
+ d10 = (loggam(d9 + 1) + loggam(mingoodbad - d9 + 1) + loggam(m - d9 + 1) +
+ loggam(maxgoodbad - m + d9 + 1));
+ d11 = MIN(MIN(m, mingoodbad) + 1.0, floor(d6 + 16 * d7));
+ /* 16 for 16-decimal-digit precision in D1 and D2 */
+
+ while (1) {
+ X = next_double(bitgen_state);
+ Y = next_double(bitgen_state);
+ W = d6 + d8 * (Y - 0.5) / X;
+
+ /* fast rejection: */
+ if ((W < 0.0) || (W >= d11))
+ continue;
+
+ Z = (RAND_INT_TYPE)floor(W);
+ T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) + loggam(m - Z + 1) +
+ loggam(maxgoodbad - m + Z + 1));
+
+ /* fast acceptance: */
+ if ((X * (4.0 - X) - 3.0) <= T)
+ break;
+
+ /* fast rejection: */
+ if (X * (X - T) >= 1)
+ continue;
+ /* log(0.0) is ok here, since always accept */
+ if (2.0 * log(X) <= T)
+ break; /* acceptance */
+ }
+
+ /* this is a correction to HRUA* by Ivan Frohne in rv.py */
+ if (good > bad)
+ Z = m - Z;
+
+ /* another fix from rv.py to allow sample to exceed popsize/2 */
+ if (m < sample)
+ Z = good - Z;
+
+ return Z;
+}
+#undef D1
+#undef D2
+
+static RAND_INT_TYPE random_hypergeometric_original(bitgen_t *bitgen_state,
+ RAND_INT_TYPE good,
+ RAND_INT_TYPE bad,
+ RAND_INT_TYPE sample)
+{
+ if (sample > 10) {
+ return random_hypergeometric_hrua(bitgen_state, good, bad, sample);
+ } else if (sample > 0) {
+ return random_hypergeometric_hyp(bitgen_state, good, bad, sample);
+ } else {
+ return 0;
+ }
+}
+
+
/*
* This is a wrapper function that matches the expected template. In the legacy
* generator, all int types are long, so this accepts int64 and then converts
@@ -223,12 +331,14 @@ double legacy_exponential(aug_bitgen_t *aug_state, double scale) {
*/
int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, int64_t good,
int64_t bad, int64_t sample) {
- return (int64_t)random_hypergeometric(bitgen_state, (RAND_INT_TYPE)good,
- (RAND_INT_TYPE)bad,
- (RAND_INT_TYPE)sample);
+ return (int64_t)random_hypergeometric_original(bitgen_state,
+ (RAND_INT_TYPE)good,
+ (RAND_INT_TYPE)bad,
+ (RAND_INT_TYPE)sample);
}
- int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) {
+
+int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) {
return (int64_t)random_logseries(bitgen_state, p);
}
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index c6259c42a..44324c9e3 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -897,9 +897,9 @@ class TestRandomDist(object):
def test_hypergeometric(self):
random.bit_generator.seed(self.seed)
actual = random.hypergeometric(10.1, 5.5, 14, size=(3, 2))
- desired = np.array([[10, 10],
- [10, 10],
- [9, 9]])
+ desired = np.array([[9, 9],
+ [10, 9],
+ [9, 10]])
assert_array_equal(actual, desired)
# Test nbad = 0
@@ -1875,7 +1875,7 @@ class TestBroadcast(object):
bad_nsample_one = [-1]
bad_nsample_two = [4]
hypergeom = random.hypergeometric
- desired = np.array([1, 1, 1])
+ desired = np.array([0, 0, 1])
self.set_seed()
actual = hypergeom(ngood * 3, nbad, nsample)
@@ -1906,6 +1906,11 @@ class TestBroadcast(object):
assert_raises(ValueError, hypergeom, 10, 10, -1)
assert_raises(ValueError, hypergeom, 10, 10, 25)
+ # ValueError for arguments that are too big.
+ assert_raises(ValueError, hypergeom, 2**30, 10, 20)
+ assert_raises(ValueError, hypergeom, 999, 2**31, 50)
+ assert_raises(ValueError, hypergeom, 999, [2**29, 2**30], 1000)
+
def test_logseries(self):
p = [0.5]
bad_p_one = [2]
diff --git a/numpy/random/tests/test_generator_mt19937_regressions.py b/numpy/random/tests/test_generator_mt19937_regressions.py
index cd4e08d6f..7dca65071 100644
--- a/numpy/random/tests/test_generator_mt19937_regressions.py
+++ b/numpy/random/tests/test_generator_mt19937_regressions.py
@@ -23,15 +23,8 @@ class TestRegression(object):
assert_(np.all(mt19937.hypergeometric(18, 3, 11, size=10) > 0))
# Test for ticket #5623
- args = [
- (2**20 - 2, 2**20 - 2, 2**20 - 2), # Check for 32-bit systems
- ]
- is_64bits = sys.maxsize > 2**32
- if is_64bits and sys.platform != 'win32':
- # Check for 64-bit systems
- args.append((2**40 - 2, 2**40 - 2, 2**40 - 2))
- for arg in args:
- assert_(mt19937.hypergeometric(*arg) > 0)
+ args = (2**20 - 2, 2**20 - 2, 2**20 - 2) # Check for 32-bit systems
+ assert_(mt19937.hypergeometric(*args) > 0)
def test_logseries_convergence(self):
# Test for ticket #923