Diffstat (limited to 'numpy/random'): 41 files changed, 2937 insertions, 977 deletions
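As orientation before the patch body: the changeset renames the Cython submodules to underscore-prefixed private names (`common` -> `_common`, `bounded_integers` -> `_bounded_integers`, `bit_generator` -> `_bit_generator`, and so on) and re-exports the public classes, now including `BitGenerator`, from `numpy.random` itself, so user code imports from the package rather than from the old submodule paths. A minimal sketch of the resulting import surface, assuming a NumPy build that contains this change:

    import numpy as np
    from numpy.random import Generator, PCG64, SeedSequence, default_rng

    # Public names are re-exported from numpy.random itself; only the Cython
    # submodules moved (e.g. numpy.random.bit_generator -> numpy.random._bit_generator).
    rng = default_rng(SeedSequence(12345))
    print(rng.integers(0, 10, size=5))      # served by the _bounded_integers helpers below

    # Equivalent explicit construction; BitGenerator itself is now also in __all__.
    rng2 = Generator(PCG64(SeedSequence(12345)))
    print(isinstance(rng2.bit_generator, np.random.BitGenerator))   # True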
diff --git a/numpy/random/__init__.py b/numpy/random/__init__.py
index e7eecc5cd..1ceb5c4dd 100644
--- a/numpy/random/__init__.py
+++ b/numpy/random/__init__.py
@@ -179,21 +179,19 @@ __all__ = [

 # add these for module-freeze analysis (like PyInstaller)
 from . import _pickle
-from . import common
-from . import bounded_integers
-from . import entropy
-
+from . import _common
+from . import _bounded_integers
+
+from ._generator import Generator, default_rng
+from ._bit_generator import SeedSequence, BitGenerator
+from ._mt19937 import MT19937
+from ._pcg64 import PCG64
+from ._philox import Philox
+from ._sfc64 import SFC64
 from .mtrand import *
-from .generator import Generator, default_rng
-from .bit_generator import SeedSequence
-from .mt19937 import MT19937
-from .pcg64 import PCG64
-from .philox import Philox
-from .sfc64 import SFC64
-from .mtrand import RandomState

 __all__ += ['Generator', 'RandomState', 'SeedSequence', 'MT19937',
-            'Philox', 'PCG64', 'SFC64', 'default_rng']
+            'Philox', 'PCG64', 'SFC64', 'default_rng', 'BitGenerator']


 def __RandomState_ctor():
diff --git a/numpy/random/bit_generator.pxd b/numpy/random/_bit_generator.pxd
index 984033f17..30fa4a27d 100644
--- a/numpy/random/bit_generator.pxd
+++ b/numpy/random/_bit_generator.pxd
@@ -1,6 +1,15 @@
-
-from .common cimport bitgen_t, uint32_t
 cimport numpy as np
+from libc.stdint cimport uint32_t, uint64_t
+
+cdef extern from "include/bitgen.h":
+    struct bitgen:
+        void *state
+        uint64_t (*next_uint64)(void *st) nogil
+        uint32_t (*next_uint32)(void *st) nogil
+        double (*next_double)(void *st) nogil
+        uint64_t (*next_raw)(void *st) nogil
+
+    ctypedef bitgen bitgen_t

 cdef class BitGenerator():
     cdef readonly object _seed_seq
diff --git a/numpy/random/bit_generator.pyx b/numpy/random/_bit_generator.pyx
index eb608af6c..21d21e6bb 100644
--- a/numpy/random/bit_generator.pyx
+++ b/numpy/random/_bit_generator.pyx
@@ -53,9 +53,7 @@ from cpython.pycapsule cimport PyCapsule_New

 import numpy as np
 cimport numpy as np
-from libc.stdint cimport uint32_t
-from .common cimport (random_raw, benchmark, prepare_ctypes, prepare_cffi)
-from .distributions cimport bitgen_t
+from ._common cimport (random_raw, benchmark, prepare_ctypes, prepare_cffi)

 __all__ = ['SeedSequence', 'BitGenerator']

@@ -116,7 +114,7 @@ def _coerce_to_uint32_array(x):
     Examples
     --------
     >>> import numpy as np
-    >>> from numpy.random.bit_generator import _coerce_to_uint32_array
+    >>> from numpy.random._bit_generator import _coerce_to_uint32_array
     >>> _coerce_to_uint32_array(12345)
     array([12345], dtype=uint32)
     >>> _coerce_to_uint32_array('12345')
@@ -484,13 +482,12 @@ cdef class BitGenerator():

     Parameters
     ----------
-    seed : {None, int, array_like[ints], ISeedSequence}, optional
+    seed : {None, int, array_like[ints], SeedSequence}, optional
         A seed to initialize the `BitGenerator`. If None, then fresh,
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
-        `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in an implementor of the `ISeedSequence` interface like
-        `SeedSequence`.
+        ~`numpy.random.SeedSequence` to derive the initial `BitGenerator` state.
+        One may also pass in a `SeedSequence` instance.

     Attributes
     ----------
diff --git a/numpy/random/_bounded_integers.pxd b/numpy/random/_bounded_integers.pxd
new file mode 100644
index 000000000..d3ee97a70
--- /dev/null
+++ b/numpy/random/_bounded_integers.pxd
@@ -0,0 +1,29 @@
+from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t,
+                          int8_t, int16_t, int32_t, int64_t, intptr_t)
+import numpy as np
+cimport numpy as np
+ctypedef np.npy_bool bool_t
+
+from ._bit_generator cimport bitgen_t
+
+cdef inline uint64_t _gen_mask(uint64_t max_val) nogil:
+    """Mask generator for use in bounded random numbers"""
+    # Smallest bit mask >= max
+    cdef uint64_t mask = max_val
+    mask |= mask >> 1
+    mask |= mask >> 2
+    mask |= mask >> 4
+    mask |= mask >> 8
+    mask |= mask >> 16
+    mask |= mask >> 32
+    return mask
+
+cdef object _rand_uint64(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint32(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint16(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_uint8(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_bool(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int64(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int32(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int16(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
+cdef object _rand_int8(object low, object high, object size, bint use_masked, bint closed, bitgen_t *state, object lock)
diff --git a/numpy/random/bounded_integers.pxd.in b/numpy/random/_bounded_integers.pxd.in
index 7a3f224dc..320d35774 100644
--- a/numpy/random/bounded_integers.pxd.in
+++ b/numpy/random/_bounded_integers.pxd.in
@@ -4,7 +4,7 @@ import numpy as np
 cimport numpy as np
 ctypedef np.npy_bool bool_t

-from .common cimport bitgen_t
+from ._bit_generator cimport bitgen_t

 cdef inline uint64_t _gen_mask(uint64_t max_val) nogil:
     """Mask generator for use in bounded random numbers"""
diff --git a/numpy/random/_bounded_integers.pyx b/numpy/random/_bounded_integers.pyx
new file mode 100644
index 000000000..d6a534b43
--- /dev/null
+++ b/numpy/random/_bounded_integers.pyx
@@ -0,0 +1,1564 @@
+#!python
+#cython: wraparound=False, nonecheck=False, boundscheck=False, cdivision=True
+
+import numpy as np
+cimport numpy as np
+
+__all__ = []
+
+np.import_array()
+
+cdef extern from "include/distributions.h":
+    # Generate random numbers in closed interval [off, off + rng].
+ uint64_t random_bounded_uint64(bitgen_t *bitgen_state, + uint64_t off, uint64_t rng, + uint64_t mask, bint use_masked) nogil + uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state, + uint32_t off, uint32_t rng, + uint32_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state, + uint16_t off, uint16_t rng, + uint16_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state, + uint8_t off, uint8_t rng, + uint8_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state, + np.npy_bool off, np.npy_bool rng, + np.npy_bool mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + void random_bounded_uint64_fill(bitgen_t *bitgen_state, + uint64_t off, uint64_t rng, np.npy_intp cnt, + bint use_masked, + uint64_t *out) nogil + void random_bounded_uint32_fill(bitgen_t *bitgen_state, + uint32_t off, uint32_t rng, np.npy_intp cnt, + bint use_masked, + uint32_t *out) nogil + void random_bounded_uint16_fill(bitgen_t *bitgen_state, + uint16_t off, uint16_t rng, np.npy_intp cnt, + bint use_masked, + uint16_t *out) nogil + void random_bounded_uint8_fill(bitgen_t *bitgen_state, + uint8_t off, uint8_t rng, np.npy_intp cnt, + bint use_masked, + uint8_t *out) nogil + void random_bounded_bool_fill(bitgen_t *bitgen_state, + np.npy_bool off, np.npy_bool rng, np.npy_intp cnt, + bint use_masked, + np.npy_bool *out) nogil + + + +_integers_types = {'bool': (0, 2), + 'int8': (-2**7, 2**7), + 'int16': (-2**15, 2**15), + 'int32': (-2**31, 2**31), + 'int64': (-2**63, 2**63), + 'uint8': (0, 2**8), + 'uint16': (0, 2**16), + 'uint32': (0, 2**32), + 'uint64': (0, 2**64)} + + +cdef object _rand_uint32_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint64. Here we case to + this type for checking and the recast to uint32 when producing the + random integers. 
+ """ + cdef uint32_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint32_t *out_data + cdef uint64_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, 0)): + raise ValueError('low is out of bounds for uint32') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0X100000000ULL)): + raise ValueError('high is out of bounds for uint32') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.uint32) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.uint32) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint32_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint32_t>((high_v - is_open) - low_v) + off = <uint32_t>(<uint64_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint32_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint32(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_uint16_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint32. Here we case to + this type for checking and the recast to uint16 when producing the + random integers. 
+ """ + cdef uint16_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint16_t *out_data + cdef uint32_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, 0)): + raise ValueError('low is out of bounds for uint16') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0X10000UL)): + raise ValueError('high is out of bounds for uint16') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT32, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT32, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.uint16) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.uint16) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint16_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint16_t>((high_v - is_open) - low_v) + off = <uint16_t>(<uint32_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint16_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint16(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_uint8_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint16. Here we case to + this type for checking and the recast to uint8 when producing the + random integers. 
+ """ + cdef uint8_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint8_t *out_data + cdef uint16_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, 0)): + raise ValueError('low is out of bounds for uint8') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0X100UL)): + raise ValueError('high is out of bounds for uint8') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT16, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT16, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.uint8) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.uint8) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint8_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint8_t>((high_v - is_open) - low_v) + off = <uint8_t>(<uint16_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint8_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint8(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_bool_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint8. Here we case to + this type for checking and the recast to bool when producing the + random integers. 
+ """ + cdef bool_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef bool_t *out_data + cdef uint8_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, 0)): + raise ValueError('low is out of bounds for bool') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0x2UL)): + raise ValueError('high is out of bounds for bool') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT8, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_UINT8, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.bool_) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.bool_) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <bool_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint8_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint8_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <bool_t>((high_v - is_open) - low_v) + off = <bool_t>(<uint8_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <bool_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_bool(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_int32_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint64. Here we case to + this type for checking and the recast to int32 when producing the + random integers. 
+ """ + cdef uint32_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint32_t *out_data + cdef uint64_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, -0x80000000LL)): + raise ValueError('low is out of bounds for int32') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0x80000000LL)): + raise ValueError('high is out of bounds for int32') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.int32) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.int32) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint32_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint32_t>((high_v - is_open) - low_v) + off = <uint32_t>(<uint64_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint32_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint32(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_int16_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint32. Here we case to + this type for checking and the recast to int16 when producing the + random integers. 
+ """ + cdef uint16_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint16_t *out_data + cdef uint32_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, -0x8000LL)): + raise ValueError('low is out of bounds for int16') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0x8000LL)): + raise ValueError('high is out of bounds for int16') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT32, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT32, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.int16) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.int16) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint16_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint32_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint16_t>((high_v - is_open) - low_v) + off = <uint16_t>(<uint32_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint16_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint16(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + +cdef object _rand_int8_broadcast(np.ndarray low, np.ndarray high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for smaller integer types + + This path is simpler since the high value in the open interval [low, high) + must be in-range for the next larger type, uint16. Here we case to + this type for checking and the recast to int8 when producing the + random integers. 
+ """ + cdef uint8_t rng, last_rng, off, val, mask, out_val, is_open + cdef uint32_t buf + cdef uint8_t *out_data + cdef uint16_t low_v, high_v + cdef np.ndarray low_arr, high_arr, out_arr + cdef np.npy_intp i, cnt + cdef np.broadcast it + cdef int buf_rem = 0 + + # Array path + is_open = not closed + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + if np.any(np.less(low_arr, -0x80LL)): + raise ValueError('low is out of bounds for int8') + if closed: + high_comp = np.greater_equal + low_high_comp = np.greater + else: + high_comp = np.greater + low_high_comp = np.greater_equal + + if np.any(high_comp(high_arr, 0x80LL)): + raise ValueError('high is out of bounds for int8') + if np.any(low_high_comp(low_arr, high_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT16, np.NPY_ALIGNED | np.NPY_FORCECAST) + high_arr = <np.ndarray>np.PyArray_FROM_OTF(high, np.NPY_INT16, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.int8) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.int8) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint8_t *>np.PyArray_DATA(out_arr) + cnt = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(cnt): + low_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint16_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Subtract 1 since generator produces values on the closed int [off, off+rng] + rng = <uint8_t>((high_v - is_open) - low_v) + off = <uint8_t>(<uint16_t>low_v) + + if rng != last_rng: + # Smallest bit mask >= max + mask = <uint8_t>_gen_mask(rng) + + out_data[i] = random_buffered_bounded_uint8(state, off, rng, mask, use_masked, &buf_rem, &buf) + + np.PyArray_MultiIter_NEXT(it) + return out_arr + + +cdef object _rand_uint64_broadcast(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for 64-bit integer types + + Requires special treatment since the high value can be out-of-range for + the largest (64 bit) integer type since the generator is specified on the + interval [low,high). + + The internal generator does not have this issue since it generates from + the closes interval [low, high-1] and high-1 is always in range for the + 64 bit integer type. 
+ """ + + cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr + cdef np.npy_intp i, cnt, n + cdef np.broadcast it + cdef object closed_upper + cdef uint64_t *out_data + cdef uint64_t *highm1_data + cdef uint64_t low_v, high_v + cdef uint64_t rng, last_rng, val, mask, off, out_val + + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + + if np.any(np.less(low_arr, 0x0ULL)): + raise ValueError('low is out of bounds for uint64') + dt = high_arr.dtype + if closed or np.issubdtype(dt, np.integer): + # Avoid object dtype path if already an integer + high_lower_comp = np.less if closed else np.less_equal + if np.any(high_lower_comp(high_arr, 0x0ULL)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + high_m1 = high_arr if closed else high_arr - dt.type(1) + if np.any(np.greater(high_m1, 0xFFFFFFFFFFFFFFFFULL)): + raise ValueError('high is out of bounds for uint64') + highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + else: + # If input is object or a floating type + highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.uint64) + highm1_data = <uint64_t *>np.PyArray_DATA(highm1_arr) + cnt = np.PyArray_SIZE(high_arr) + flat = high_arr.flat + for i in range(cnt): + # Subtract 1 since generator produces values on the closed int [off, off+rng] + closed_upper = int(flat[i]) - 1 + if closed_upper > 0xFFFFFFFFFFFFFFFFULL: + raise ValueError('high is out of bounds for uint64') + if closed_upper < 0x0ULL: + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + highm1_data[i] = <uint64_t>closed_upper + + if np.any(np.greater(low_arr, highm1_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + high_arr = highm1_arr + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_UINT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.uint64) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.uint64) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint64_t *>np.PyArray_DATA(out_arr) + n = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(n): + low_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<uint64_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Generator produces values on the closed int [off, off+rng], -1 subtracted above + rng = <uint64_t>(high_v - low_v) + off = <uint64_t>(<uint64_t>low_v) + + if rng != last_rng: + mask = _gen_mask(rng) + out_data[i] = random_bounded_uint64(state, off, rng, mask, use_masked) + + np.PyArray_MultiIter_NEXT(it) + + return out_arr + +cdef object _rand_int64_broadcast(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + Array path for 64-bit integer types + + Requires special treatment since the high value can be out-of-range for + the largest (64 bit) integer type since the generator is specified on the + interval [low,high). + + The internal generator does not have this issue since it generates from + the closes interval [low, high-1] and high-1 is always in range for the + 64 bit integer type. 
+ """ + + cdef np.ndarray low_arr, high_arr, out_arr, highm1_arr + cdef np.npy_intp i, cnt, n + cdef np.broadcast it + cdef object closed_upper + cdef uint64_t *out_data + cdef int64_t *highm1_data + cdef int64_t low_v, high_v + cdef uint64_t rng, last_rng, val, mask, off, out_val + + low_arr = <np.ndarray>low + high_arr = <np.ndarray>high + + if np.any(np.less(low_arr, -0x8000000000000000LL)): + raise ValueError('low is out of bounds for int64') + dt = high_arr.dtype + if closed or np.issubdtype(dt, np.integer): + # Avoid object dtype path if already an integer + high_lower_comp = np.less if closed else np.less_equal + if np.any(high_lower_comp(high_arr, -0x8000000000000000LL)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + high_m1 = high_arr if closed else high_arr - dt.type(1) + if np.any(np.greater(high_m1, 0x7FFFFFFFFFFFFFFFLL)): + raise ValueError('high is out of bounds for int64') + highm1_arr = <np.ndarray>np.PyArray_FROM_OTF(high_m1, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + else: + # If input is object or a floating type + highm1_arr = <np.ndarray>np.empty_like(high_arr, dtype=np.int64) + highm1_data = <int64_t *>np.PyArray_DATA(highm1_arr) + cnt = np.PyArray_SIZE(high_arr) + flat = high_arr.flat + for i in range(cnt): + # Subtract 1 since generator produces values on the closed int [off, off+rng] + closed_upper = int(flat[i]) - 1 + if closed_upper > 0x7FFFFFFFFFFFFFFFLL: + raise ValueError('high is out of bounds for int64') + if closed_upper < -0x8000000000000000LL: + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + highm1_data[i] = <int64_t>closed_upper + + if np.any(np.greater(low_arr, highm1_arr)): + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + high_arr = highm1_arr + low_arr = <np.ndarray>np.PyArray_FROM_OTF(low, np.NPY_INT64, np.NPY_ALIGNED | np.NPY_FORCECAST) + + if size is not None: + out_arr = <np.ndarray>np.empty(size, np.int64) + else: + it = np.PyArray_MultiIterNew2(low_arr, high_arr) + out_arr = <np.ndarray>np.empty(it.shape, np.int64) + + it = np.PyArray_MultiIterNew3(low_arr, high_arr, out_arr) + out_data = <uint64_t *>np.PyArray_DATA(out_arr) + n = np.PyArray_SIZE(out_arr) + mask = last_rng = 0 + with lock, nogil: + for i in range(n): + low_v = (<int64_t*>np.PyArray_MultiIter_DATA(it, 0))[0] + high_v = (<int64_t*>np.PyArray_MultiIter_DATA(it, 1))[0] + # Generator produces values on the closed int [off, off+rng], -1 subtracted above + rng = <uint64_t>(high_v - low_v) + off = <uint64_t>(<int64_t>low_v) + + if rng != last_rng: + mask = _gen_mask(rng) + out_data[i] = random_bounded_uint64(state, off, rng, mask, use_masked) + + np.PyArray_MultiIter_NEXT(it) + + return out_arr + + +cdef object _rand_uint64(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_uint64(low, high, size, use_masked, *state, lock) + + Return random np.uint64 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.uint64 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). 
+ high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.uint64 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint64. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint64_t rng, off, out_val + cdef uint64_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.uint64) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < 0x0ULL: + raise ValueError("low is out of bounds for uint64") + if high > 0xFFFFFFFFFFFFFFFFULL: + raise ValueError("high is out of bounds for uint64") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint64_t>(high - low) + off = <uint64_t>(<uint64_t>low) + if size is None: + with lock: + random_bounded_uint64_fill(state, off, rng, 1, use_masked, &out_val) + return np.uint64(<uint64_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.uint64) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint64_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint64_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_uint64_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_uint32(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_uint32(low, high, size, use_masked, *state, lock) + + Return random np.uint32 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.uint32 type. 
+ + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.uint32 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint32. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint32_t rng, off, out_val + cdef uint32_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.uint32) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < 0x0UL: + raise ValueError("low is out of bounds for uint32") + if high > 0XFFFFFFFFUL: + raise ValueError("high is out of bounds for uint32") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint32_t>(high - low) + off = <uint32_t>(<uint32_t>low) + if size is None: + with lock: + random_bounded_uint32_fill(state, off, rng, 1, use_masked, &out_val) + return np.uint32(<uint32_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.uint32) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint32_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint32_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_uint32_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_uint16(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_uint16(low, high, size, use_masked, *state, lock) + + Return random np.uint16 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). 
On entry the arguments are presumed + to have been validated for size and order for the np.uint16 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.uint16 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint16. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint16_t rng, off, out_val + cdef uint16_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.uint16) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < 0x0UL: + raise ValueError("low is out of bounds for uint16") + if high > 0XFFFFUL: + raise ValueError("high is out of bounds for uint16") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint16_t>(high - low) + off = <uint16_t>(<uint16_t>low) + if size is None: + with lock: + random_bounded_uint16_fill(state, off, rng, 1, use_masked, &out_val) + return np.uint16(<uint16_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.uint16) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint16_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint16_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_uint16_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_uint8(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_uint8(low, high, size, use_masked, *state, lock) + + Return random np.uint8 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). 
If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.uint8 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.uint8 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint8. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint8_t rng, off, out_val + cdef uint8_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.uint8) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < 0x0UL: + raise ValueError("low is out of bounds for uint8") + if high > 0XFFUL: + raise ValueError("high is out of bounds for uint8") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint8_t>(high - low) + off = <uint8_t>(<uint8_t>low) + if size is None: + with lock: + random_bounded_uint8_fill(state, off, rng, 1, use_masked, &out_val) + return np.uint8(<uint8_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.uint8) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint8_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint8_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_uint8_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_bool(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_bool(low, high, size, use_masked, *state, lock) + + Return random np.bool integers from `low` (inclusive) to `high` (exclusive). 
+ + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.bool type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.bool + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for bool. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef bool_t rng, off, out_val + cdef bool_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.bool) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < 0x0UL: + raise ValueError("low is out of bounds for bool") + if high > 0x1UL: + raise ValueError("high is out of bounds for bool") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <bool_t>(high - low) + off = <bool_t>(<bool_t>low) + if size is None: + with lock: + random_bounded_bool_fill(state, off, rng, 1, use_masked, &out_val) + return np.bool_(<bool_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.bool) + cnt = np.PyArray_SIZE(out_arr) + out_data = <bool_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_bool_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_bool_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_int64(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_int64(low, high, size, use_masked, *state, lock) + + Return random np.int64 integers from `low` (inclusive) to `high` (exclusive). 
+ + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.int64 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.int64 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint64. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint64_t rng, off, out_val + cdef uint64_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.int64) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < -0x8000000000000000LL: + raise ValueError("low is out of bounds for int64") + if high > 0x7FFFFFFFFFFFFFFFL: + raise ValueError("high is out of bounds for int64") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint64_t>(high - low) + off = <uint64_t>(<int64_t>low) + if size is None: + with lock: + random_bounded_uint64_fill(state, off, rng, 1, use_masked, &out_val) + return np.int64(<int64_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.int64) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint64_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint64_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_int64_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_int32(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_int32(low, high, size, use_masked, *state, lock) + + Return random np.int32 
integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.int32 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.int32 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint32. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint32_t rng, off, out_val + cdef uint32_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.int32) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < -0x80000000L: + raise ValueError("low is out of bounds for int32") + if high > 0x7FFFFFFFL: + raise ValueError("high is out of bounds for int32") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint32_t>(high - low) + off = <uint32_t>(<int32_t>low) + if size is None: + with lock: + random_bounded_uint32_fill(state, off, rng, 1, use_masked, &out_val) + return np.int32(<int32_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.int32) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint32_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint32_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_int32_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_int16(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_int16(low, high, size, use_masked, *state, 
lock) + + Return random np.int16 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.int16 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.int16 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint16. The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. 
+ """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint16_t rng, off, out_val + cdef uint16_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.int16) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < -0x8000L: + raise ValueError("low is out of bounds for int16") + if high > 0x7FFFL: + raise ValueError("high is out of bounds for int16") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint16_t>(high - low) + off = <uint16_t>(<int16_t>low) + if size is None: + with lock: + random_bounded_uint16_fill(state, off, rng, 1, use_masked, &out_val) + return np.int16(<int16_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.int16) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint16_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint16_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_int16_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) + +cdef object _rand_int8(object low, object high, object size, + bint use_masked, bint closed, + bitgen_t *state, object lock): + """ + _rand_int8(low, high, size, use_masked, *state, lock) + + Return random np.int8 integers from `low` (inclusive) to `high` (exclusive). + + Return random integers from the "discrete uniform" distribution in the + interval [`low`, `high`). If `high` is None (the default), + then results are from [0, `low`). On entry the arguments are presumed + to have been validated for size and order for the np.int8 type. + + Parameters + ---------- + low : int or array-like + Lowest (signed) integer to be drawn from the distribution (unless + ``high=None``, in which case this parameter is the *highest* such + integer). + high : int or array-like + If provided, one above the largest (signed) integer to be drawn from the + distribution (see above for behavior if ``high=None``). + size : int or tuple of ints + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + use_masked : bool + If True then rejection sampling with a range mask is used else Lemire's algorithm is used. + closed : bool + If True then sample from [low, high]. If False, sample [low, high) + state : bit generator + Bit generator state to use in the core random number generators + lock : threading.Lock + Lock to prevent multiple using a single generator simultaneously + + Returns + ------- + out : python scalar or ndarray of np.int8 + `size`-shaped array of random integers from the appropriate + distribution, or a single such random int if `size` not provided. + + Notes + ----- + The internal integer generator produces values from the closed + interval [low, high-(not closed)]. This requires some care since + high can be out-of-range for uint8. 
The scalar path leaves + integers as Python integers until the 1 has been subtracted to + avoid needing to cast to a larger type. + """ + cdef np.ndarray out_arr, low_arr, high_arr + cdef uint8_t rng, off, out_val + cdef uint8_t *out_data + cdef np.npy_intp i, n, cnt + + if size is not None: + if (np.prod(size) == 0): + return np.empty(size, dtype=np.int8) + + low_arr = <np.ndarray>np.array(low, copy=False) + high_arr = <np.ndarray>np.array(high, copy=False) + low_ndim = np.PyArray_NDIM(low_arr) + high_ndim = np.PyArray_NDIM(high_arr) + if ((low_ndim == 0 or (low_ndim == 1 and low_arr.size == 1 and size is not None)) and + (high_ndim == 0 or (high_ndim == 1 and high_arr.size == 1 and size is not None))): + low = int(low_arr) + high = int(high_arr) + # Subtract 1 since internal generator produces on closed interval [low, high] + if not closed: + high -= 1 + + if low < -0x80L: + raise ValueError("low is out of bounds for int8") + if high > 0x7FL: + raise ValueError("high is out of bounds for int8") + if low > high: # -1 already subtracted, closed interval + comp = '>' if closed else '>=' + raise ValueError('low {comp} high'.format(comp=comp)) + + rng = <uint8_t>(high - low) + off = <uint8_t>(<int8_t>low) + if size is None: + with lock: + random_bounded_uint8_fill(state, off, rng, 1, use_masked, &out_val) + return np.int8(<int8_t>out_val) + else: + out_arr = <np.ndarray>np.empty(size, np.int8) + cnt = np.PyArray_SIZE(out_arr) + out_data = <uint8_t *>np.PyArray_DATA(out_arr) + with lock, nogil: + random_bounded_uint8_fill(state, off, rng, cnt, use_masked, out_data) + return out_arr + return _rand_int8_broadcast(low_arr, high_arr, size, use_masked, closed, state, lock) diff --git a/numpy/random/bounded_integers.pyx.in b/numpy/random/_bounded_integers.pyx.in index 411b65a37..47cb13b3a 100644 --- a/numpy/random/bounded_integers.pyx.in +++ b/numpy/random/_bounded_integers.pyx.in @@ -4,12 +4,54 @@ import numpy as np cimport numpy as np -from .distributions cimport * - __all__ = [] np.import_array() +cdef extern from "include/distributions.h": + # Generate random numbers in closed interval [off, off + rng]. 
+ uint64_t random_bounded_uint64(bitgen_t *bitgen_state, + uint64_t off, uint64_t rng, + uint64_t mask, bint use_masked) nogil + uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state, + uint32_t off, uint32_t rng, + uint32_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state, + uint16_t off, uint16_t rng, + uint16_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state, + uint8_t off, uint8_t rng, + uint8_t mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state, + np.npy_bool off, np.npy_bool rng, + np.npy_bool mask, bint use_masked, + int *bcnt, uint32_t *buf) nogil + void random_bounded_uint64_fill(bitgen_t *bitgen_state, + uint64_t off, uint64_t rng, np.npy_intp cnt, + bint use_masked, + uint64_t *out) nogil + void random_bounded_uint32_fill(bitgen_t *bitgen_state, + uint32_t off, uint32_t rng, np.npy_intp cnt, + bint use_masked, + uint32_t *out) nogil + void random_bounded_uint16_fill(bitgen_t *bitgen_state, + uint16_t off, uint16_t rng, np.npy_intp cnt, + bint use_masked, + uint16_t *out) nogil + void random_bounded_uint8_fill(bitgen_t *bitgen_state, + uint8_t off, uint8_t rng, np.npy_intp cnt, + bint use_masked, + uint8_t *out) nogil + void random_bounded_bool_fill(bitgen_t *bitgen_state, + np.npy_bool off, np.npy_bool rng, np.npy_intp cnt, + bint use_masked, + np.npy_bool *out) nogil + + + _integers_types = {'bool': (0, 2), 'int8': (-2**7, 2**7), 'int16': (-2**15, 2**15), diff --git a/numpy/random/common.pxd b/numpy/random/_common.pxd index 2f7baa06e..74bebca83 100644 --- a/numpy/random/common.pxd +++ b/numpy/random/_common.pxd @@ -1,23 +1,12 @@ #cython: language_level=3 -from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t, - int8_t, int16_t, int32_t, int64_t, intptr_t, - uintptr_t) -from libc.math cimport sqrt - -cdef extern from "numpy/random/bitgen.h": - struct bitgen: - void *state - uint64_t (*next_uint64)(void *st) nogil - uint32_t (*next_uint32)(void *st) nogil - double (*next_double)(void *st) nogil - uint64_t (*next_raw)(void *st) nogil - - ctypedef bitgen bitgen_t +from libc.stdint cimport uint32_t, uint64_t, int32_t, int64_t import numpy as np cimport numpy as np +from ._bit_generator cimport bitgen_t + cdef double POISSON_LAM_MAX cdef double LEGACY_POISSON_LAM_MAX cdef uint64_t MAXSIZE @@ -44,7 +33,7 @@ cdef object prepare_ctypes(bitgen_t *bitgen) cdef int check_constraint(double val, object name, constraint_type cons) except -1 cdef int check_array_constraint(np.ndarray val, object name, constraint_type cons) except -1 -cdef extern from "src/aligned_malloc/aligned_malloc.h": +cdef extern from "include/aligned_malloc.h": cdef void *PyArray_realloc_aligned(void *p, size_t n) cdef void *PyArray_malloc_aligned(size_t n) cdef void *PyArray_calloc_aligned(size_t n, size_t s) @@ -56,6 +45,7 @@ ctypedef double (*random_double_1)(void *state, double a) nogil ctypedef double (*random_double_2)(void *state, double a, double b) nogil ctypedef double (*random_double_3)(void *state, double a, double b, double c) nogil +ctypedef double (*random_float_fill)(bitgen_t *state, np.npy_intp count, float* out) nogil ctypedef float (*random_float_0)(bitgen_t *state) nogil ctypedef float (*random_float_1)(bitgen_t *state, float a) nogil diff --git a/numpy/random/common.pyx b/numpy/random/_common.pyx index 74cd5f033..ef1afac7c 100644 --- a/numpy/random/common.pyx +++ 
b/numpy/random/_common.pyx @@ -6,7 +6,7 @@ import sys import numpy as np cimport numpy as np -from .common cimport * +from libc.stdint cimport uintptr_t __all__ = ['interface'] @@ -262,14 +262,16 @@ cdef object double_fill(void *func, bitgen_t *state, object size, object lock, o return out_array cdef object float_fill(void *func, bitgen_t *state, object size, object lock, object out): - cdef random_float_0 random_func = (<random_float_0>func) + cdef random_float_fill random_func = (<random_float_fill>func) + cdef float out_val cdef float *out_array_data cdef np.ndarray out_array cdef np.npy_intp i, n if size is None and out is None: with lock: - return random_func(state) + random_func(state, 1, &out_val) + return out_val if out is not None: check_output(out, np.float32, size) @@ -280,8 +282,7 @@ cdef object float_fill(void *func, bitgen_t *state, object size, object lock, ob n = np.PyArray_SIZE(out_array) out_array_data = <float *>np.PyArray_DATA(out_array) with lock, nogil: - for i in range(n): - out_array_data[i] = random_func(state) + random_func(state, n, out_array_data) return out_array cdef object float_fill_from_double(void *func, bitgen_t *state, object size, object lock, object out): diff --git a/numpy/random/generator.pyx b/numpy/random/_generator.pyx index 26fd95129..6d9fe20f9 100644 --- a/numpy/random/generator.pyx +++ b/numpy/random/_generator.pyx @@ -3,35 +3,159 @@ import operator import warnings -import numpy as np - -from .bounded_integers import _integers_types -from .pcg64 import PCG64 - from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer from cpython cimport (Py_INCREF, PyFloat_AsDouble) -from libc cimport string cimport cython +import numpy as np cimport numpy as np +from numpy.core.multiarray import normalize_axis_index + +from libc cimport string +from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t, + int32_t, int64_t, INT64_MAX, SIZE_MAX) +from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64, + _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16, + _rand_uint8, _gen_mask) +from ._bounded_integers import _integers_types +from ._pcg64 import PCG64 +from ._bit_generator cimport bitgen_t +from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE, + CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1, + CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON, + double_fill, cont, kahan_sum, cont_broadcast_3, float_fill, cont_f, + check_array_constraint, check_constraint, disc, discrete_broadcast_iii, + ) + + +cdef extern from "include/distributions.h": + + struct s_binomial_t: + int has_binomial + double psave + int64_t nsave + double r + double q + double fm + int64_t m + double p1 + double xm + double xl + double xr + double c + double laml + double lamr + double p2 + double p3 + double p4 + + ctypedef s_binomial_t binomial_t + + double random_standard_uniform(bitgen_t *bitgen_state) nogil + void random_standard_uniform_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil + double random_standard_exponential(bitgen_t *bitgen_state) nogil + void random_standard_exponential_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil + double random_standard_exponential_zig(bitgen_t *bitgen_state) nogil + void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil + double random_standard_normal(bitgen_t* bitgen_state) nogil + void random_standard_normal_fill(bitgen_t *bitgen_state, np.npy_intp count, double *out) nogil + void 
random_standard_normal_fill_f(bitgen_t *bitgen_state, np.npy_intp count, float *out) nogil + double random_standard_gamma(bitgen_t *bitgen_state, double shape) nogil + + float random_standard_uniform_f(bitgen_t *bitgen_state) nogil + void random_standard_uniform_fill_f(bitgen_t* bitgen_state, np.npy_intp cnt, float *out) nogil + float random_standard_exponential_f(bitgen_t *bitgen_state) nogil + float random_standard_exponential_zig_f(bitgen_t *bitgen_state) nogil + void random_standard_exponential_fill_f(bitgen_t *bitgen_state, np.npy_intp cnt, float *out) nogil + void random_standard_exponential_zig_fill_f(bitgen_t *bitgen_state, np.npy_intp cnt, float *out) nogil + float random_standard_normal_f(bitgen_t* bitgen_state) nogil + float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil + + int64_t random_positive_int64(bitgen_t *bitgen_state) nogil + int32_t random_positive_int32(bitgen_t *bitgen_state) nogil + int64_t random_positive_int(bitgen_t *bitgen_state) nogil + uint64_t random_uint(bitgen_t *bitgen_state) nogil + + double random_normal(bitgen_t *bitgen_state, double loc, double scale) nogil + + double random_gamma(bitgen_t *bitgen_state, double shape, double scale) nogil + float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale) nogil + + double random_exponential(bitgen_t *bitgen_state, double scale) nogil + double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil + double random_beta(bitgen_t *bitgen_state, double a, double b) nogil + double random_chisquare(bitgen_t *bitgen_state, double df) nogil + double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) nogil + double random_standard_cauchy(bitgen_t *bitgen_state) nogil + double random_pareto(bitgen_t *bitgen_state, double a) nogil + double random_weibull(bitgen_t *bitgen_state, double a) nogil + double random_power(bitgen_t *bitgen_state, double a) nogil + double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) nogil + double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil + double random_standard_t(bitgen_t *bitgen_state, double df) nogil + double random_noncentral_chisquare(bitgen_t *bitgen_state, double df, + double nonc) nogil + double random_noncentral_f(bitgen_t *bitgen_state, double dfnum, + double dfden, double nonc) nogil + double random_wald(bitgen_t *bitgen_state, double mean, double scale) nogil + double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil + double random_triangular(bitgen_t *bitgen_state, double left, double mode, + double right) nogil + + int64_t random_poisson(bitgen_t *bitgen_state, double lam) nogil + int64_t random_negative_binomial(bitgen_t *bitgen_state, double n, double p) nogil + int64_t random_binomial(bitgen_t *bitgen_state, double p, int64_t n, binomial_t *binomial) nogil + int64_t random_logseries(bitgen_t *bitgen_state, double p) nogil + int64_t random_geometric_search(bitgen_t *bitgen_state, double p) nogil + int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) nogil + int64_t random_geometric(bitgen_t *bitgen_state, double p) nogil + int64_t random_zipf(bitgen_t *bitgen_state, double a) nogil + int64_t random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, + int64_t sample) nogil + + uint64_t 
random_interval(bitgen_t *bitgen_state, uint64_t max) nogil + + # Generate random uint64 numbers in closed interval [off, off + rng]. + uint64_t random_bounded_uint64(bitgen_t *bitgen_state, + uint64_t off, uint64_t rng, + uint64_t mask, bint use_masked) nogil + + void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix, + double *pix, np.npy_intp d, binomial_t *binomial) nogil + + int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil + void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil -from .bounded_integers cimport * -from .common cimport * -from .distributions cimport * +np.import_array() -__all__ = ['Generator', 'beta', 'binomial', 'bytes', 'chisquare', 'choice', - 'dirichlet', 'exponential', 'f', 'gamma', - 'geometric', 'gumbel', 'hypergeometric', 'integers', 'laplace', - 'logistic', 'lognormal', 'logseries', 'multinomial', - 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', - 'noncentral_f', 'normal', 'pareto', 'permutation', - 'poisson', 'power', 'random', 'rayleigh', 'shuffle', - 'standard_cauchy', 'standard_exponential', 'standard_gamma', - 'standard_normal', 'standard_t', 'triangular', - 'uniform', 'vonmises', 'wald', 'weibull', 'zipf'] +cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors): + """ + Sum the values in the array `colors`. -np.import_array() + Return -1 if an overflow occurs. + The values in *colors are assumed to be nonnegative. + """ + cdef size_t i + cdef int64_t sum + + sum = 0 + for i in range(num_colors): + if colors[i] > INT64_MAX - sum: + return -1 + sum += colors[i] + return sum cdef bint _check_bit_generator(object bitgen): @@ -192,9 +316,9 @@ cdef class Generator: cdef double temp key = np.dtype(dtype).name if key == 'float64': - return double_fill(&random_double_fill, &self._bitgen, size, self.lock, out) + return double_fill(&random_standard_uniform_fill, &self._bitgen, size, self.lock, out) elif key == 'float32': - return float_fill(&random_float, &self._bitgen, size, self.lock, out) + return float_fill(&random_standard_uniform_fill_f, &self._bitgen, size, self.lock, out) else: raise TypeError('Unsupported dtype "%s" for random' % key) @@ -340,9 +464,9 @@ cdef class Generator: return double_fill(&random_standard_exponential_fill, &self._bitgen, size, self.lock, out) elif key == 'float32': if method == u'zig': - return float_fill(&random_standard_exponential_zig_f, &self._bitgen, size, self.lock, out) + return float_fill(&random_standard_exponential_zig_fill_f, &self._bitgen, size, self.lock, out) else: - return float_fill(&random_standard_exponential_f, &self._bitgen, size, self.lock, out) + return float_fill(&random_standard_exponential_fill_f, &self._bitgen, size, self.lock, out) else: raise TypeError('Unsupported dtype "%s" for standard_exponential' % key) @@ -919,9 +1043,9 @@ cdef class Generator: """ key = np.dtype(dtype).name if key == 'float64': - return double_fill(&random_gauss_zig_fill, &self._bitgen, size, self.lock, out) + return double_fill(&random_standard_normal_fill, &self._bitgen, size, self.lock, out) elif key == 'float32': - return float_fill(&random_gauss_zig_f, &self._bitgen, size, self.lock, out) + return float_fill(&random_standard_normal_fill_f, &self._bitgen, size, self.lock, out) else: raise TypeError('Unsupported dtype "%s" for 
standard_normal' % key) @@ -1022,7 +1146,7 @@ cdef class Generator: [ 0.39924804, 4.68456316, 4.99394529, 4.84057254]]) # random """ - return cont(&random_normal_zig, &self._bitgen, size, self.lock, 2, + return cont(&random_normal, &self._bitgen, size, self.lock, 2, loc, '', CONS_NONE, scale, 'scale', CONS_NON_NEGATIVE, 0.0, '', CONS_NONE, @@ -1108,13 +1232,13 @@ cdef class Generator: cdef void *func key = np.dtype(dtype).name if key == 'float64': - return cont(&random_standard_gamma_zig, &self._bitgen, size, self.lock, 1, + return cont(&random_standard_gamma, &self._bitgen, size, self.lock, 1, shape, 'shape', CONS_NON_NEGATIVE, 0.0, '', CONS_NONE, 0.0, '', CONS_NONE, out) if key == 'float32': - return cont_f(&random_standard_gamma_zig_f, &self._bitgen, size, self.lock, + return cont_f(&random_standard_gamma_f, &self._bitgen, size, self.lock, shape, 'shape', CONS_NON_NEGATIVE, out) else: @@ -3146,6 +3270,8 @@ cdef class Generator: See Also -------- + multivariate_hypergeometric : Draw samples from the multivariate + hypergeometric distribution. scipy.stats.hypergeom : probability density function, distribution or cumulative density function, etc. @@ -3644,6 +3770,222 @@ cdef class Generator: return multin + def multivariate_hypergeometric(self, object colors, object nsample, + size=None, method='marginals'): + """ + multivariate_hypergeometric(colors, nsample, size=None, + method='marginals') + + Generate variates from a multivariate hypergeometric distribution. + + The multivariate hypergeometric distribution is a generalization + of the hypergeometric distribution. + + Choose ``nsample`` items at random without replacement from a + collection with ``N`` distinct types. ``N`` is the length of + ``colors``, and the values in ``colors`` are the number of occurrences + of that type in the collection. The total number of items in the + collection is ``sum(colors)``. Each random variate generated by this + function is a vector of length ``N`` holding the counts of the + different types that occurred in the ``nsample`` items. + + The name ``colors`` comes from a common description of the + distribution: it is the probability distribution of the number of + marbles of each color selected without replacement from an urn + containing marbles of different colors; ``colors[i]`` is the number + of marbles in the urn with color ``i``. + + Parameters + ---------- + colors : sequence of integers + The number of each type of item in the collection from which + a sample is drawn. The values in ``colors`` must be nonnegative. + To avoid loss of precision in the algorithm, ``sum(colors)`` + must be less than ``10**9`` when `method` is "marginals". + nsample : int + The number of items selected. ``nsample`` must not be greater + than ``sum(colors)``. + size : int or tuple of ints, optional + The number of variates to generate, either an integer or a tuple + holding the shape of the array of variates. If the given size is, + e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one + variate is a vector of length ``len(colors)``, and the return value + has shape ``(k, m, len(colors))``. If `size` is an integer, the + output has shape ``(size, len(colors))``. Default is None, in + which case a single variate is returned as an array with shape + ``(len(colors),)``. + method : string, optional + Specify the algorithm that is used to generate the variates. + Must be 'count' or 'marginals' (the default). See the Notes + for a description of the methods. 
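[Editor's note] As a quick illustration of the ``size`` semantics described in this parameter list (shapes only; the drawn values are random and the seed below is arbitrary):

    import numpy as np

    rng = np.random.Generator(np.random.PCG64(12345))
    colors = [16, 8, 4]
    rng.multivariate_hypergeometric(colors, 6).shape               # (3,)
    rng.multivariate_hypergeometric(colors, 6, size=5).shape       # (5, 3)
    rng.multivariate_hypergeometric(colors, 6, size=(2, 2)).shape  # (2, 2, 3)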
+ + Returns + ------- + variates : ndarray + Array of variates drawn from the multivariate hypergeometric + distribution. + + See Also + -------- + hypergeometric : Draw samples from the (univariate) hypergeometric + distribution. + + Notes + ----- + The two methods do not return the same sequence of variates. + + The "count" algorithm is roughly equivalent to the following numpy + code:: + + choices = np.repeat(np.arange(len(colors)), colors) + selection = np.random.choice(choices, nsample, replace=False) + variate = np.bincount(selection, minlength=len(colors)) + + The "count" algorithm uses a temporary array of integers with length + ``sum(colors)``. + + The "marginals" algorithm generates a variate by using repeated + calls to the univariate hypergeometric sampler. It is roughly + equivalent to:: + + variate = np.zeros(len(colors), dtype=np.int64) + # `remaining` is the cumulative sum of `colors` from the last + # element to the first; e.g. if `colors` is [3, 1, 5], then + # `remaining` is [9, 6, 5]. + remaining = np.cumsum(colors[::-1])[::-1] + for i in range(len(colors)-1): + if nsample < 1: + break + variate[i] = hypergeometric(colors[i], remaining[i+1], + nsample) + nsample -= variate[i] + variate[-1] = nsample + + The default method is "marginals". For some cases (e.g. when + `colors` contains relatively small integers), the "count" method + can be significantly faster than the "marginals" method. If + performance of the algorithm is important, test the two methods + with typical inputs to decide which works best. + + .. versionadded:: 1.18.0 + + Examples + -------- + >>> colors = [16, 8, 4] + >>> seed = 4861946401452 + >>> gen = np.random.Generator(np.random.PCG64(seed)) + >>> gen.multivariate_hypergeometric(colors, 6) + array([5, 0, 1]) + >>> gen.multivariate_hypergeometric(colors, 6, size=3) + array([[5, 0, 1], + [2, 2, 2], + [3, 3, 0]]) + >>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2)) + array([[[3, 2, 1], + [3, 2, 1]], + [[4, 1, 1], + [3, 2, 1]]]) + """ + cdef int64_t nsamp + cdef size_t num_colors + cdef int64_t total + cdef int64_t *colors_ptr + cdef int64_t max_index + cdef size_t num_variates + cdef int64_t *variates_ptr + cdef int result + + if method not in ['count', 'marginals']: + raise ValueError('method must be "count" or "marginals".') + + try: + operator.index(nsample) + except TypeError: + raise ValueError('nsample must be an integer') + + if nsample < 0: + raise ValueError("nsample must be nonnegative.") + if nsample > INT64_MAX: + raise ValueError("nsample must not exceed %d" % INT64_MAX) + nsamp = nsample + + # Validation of colors, a 1-d sequence of nonnegative integers. + invalid_colors = False + try: + colors = np.asarray(colors) + if colors.ndim != 1: + invalid_colors = True + elif colors.size > 0 and not np.issubdtype(colors.dtype, + np.integer): + invalid_colors = True + elif np.any((colors < 0) | (colors > INT64_MAX)): + invalid_colors = True + except ValueError: + invalid_colors = True + if invalid_colors: + raise ValueError('colors must be a one-dimensional sequence ' + 'of nonnegative integers not exceeding %d.' 
% + INT64_MAX) + + colors = np.ascontiguousarray(colors, dtype=np.int64) + num_colors = colors.size + + colors_ptr = <int64_t *> np.PyArray_DATA(colors) + + total = _safe_sum_nonneg_int64(num_colors, colors_ptr) + if total == -1: + raise ValueError("sum(colors) must not exceed the maximum value " + "of a 64 bit signed integer (%d)" % INT64_MAX) + + if method == 'marginals' and total >= 1000000000: + raise ValueError('When method is "marginals", sum(colors) must ' + 'be less than 1000000000.') + + # The C code that implements the 'count' method will malloc an + # array of size total*sizeof(size_t). Here we ensure that that + # product does not overflow. + if SIZE_MAX > <uint64_t>INT64_MAX: + max_index = INT64_MAX // sizeof(size_t) + else: + max_index = SIZE_MAX // sizeof(size_t) + if method == 'count' and total > max_index: + raise ValueError("When method is 'count', sum(colors) must not " + "exceed %d" % max_index) + if nsamp > total: + raise ValueError("nsample > sum(colors)") + + # Figure out the shape of the return array. + if size is None: + shape = (num_colors,) + elif np.isscalar(size): + shape = (size, num_colors) + else: + shape = tuple(size) + (num_colors,) + variates = np.zeros(shape, dtype=np.int64) + + if num_colors == 0: + return variates + + # One variate is a vector of length num_colors. + num_variates = variates.size // num_colors + variates_ptr = <int64_t *> np.PyArray_DATA(variates) + + if method == 'count': + with self.lock, nogil: + result = random_mvhg_count(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + if result == -1: + raise MemoryError("Insufficent memory for multivariate_" + "hypergeometric with method='count' and " + "sum(colors)=%d" % total) + else: + with self.lock, nogil: + random_mvhg_marginals(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + return variates + def dirichlet(self, object alpha, size=None): """ dirichlet(alpha, size=None) @@ -3772,7 +4114,7 @@ cdef class Generator: while i < totsize: acc = 0.0 for j in range(k): - val_data[i+j] = random_standard_gamma_zig(&self._bitgen, + val_data[i+j] = random_standard_gamma(&self._bitgen, alpha_data[j]) acc = acc + val_data[i + j] invacc = 1/acc @@ -3783,20 +4125,21 @@ cdef class Generator: return diric # Shuffling and permutations: - def shuffle(self, object x): + def shuffle(self, object x, axis=0): """ - shuffle(x) + shuffle(x, axis=0) Modify a sequence in-place by shuffling its contents. - This function only shuffles the array along the first axis of a - multi-dimensional array. The order of sub-arrays is changed but - their contents remains the same. + The order of sub-arrays is changed but their contents remains the same. Parameters ---------- x : array_like The array or list to be shuffled. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. + It is only supported on `ndarray` objects. 
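[Editor's note] The ndarray path added below implements ``axis`` by swapping the requested axis to the front and shuffling that view in place. A minimal NumPy-level sketch of the same idea (illustrative only; the seed is arbitrary):

    import numpy as np

    rng = np.random.default_rng(0)
    arr = np.arange(9).reshape(3, 3)
    axis = 1
    # swapaxes returns a view, so shuffling the view's first axis
    # reorders `arr` along `axis` in place.
    rng.shuffle(np.swapaxes(arr, 0, axis))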
Returns ------- @@ -3810,8 +4153,6 @@ cdef class Generator: >>> arr [1 7 5 2 9 4 3 6 0 8] # random - Multi-dimensional arrays are only shuffled along the first axis: - >>> arr = np.arange(9).reshape((3, 3)) >>> rng.shuffle(arr) >>> arr @@ -3819,17 +4160,25 @@ cdef class Generator: [6, 7, 8], [0, 1, 2]]) + >>> arr = np.arange(9).reshape((3, 3)) + >>> rng.shuffle(arr, axis=1) + >>> arr + array([[2, 0, 1], # random + [5, 3, 4], + [8, 6, 7]]) """ cdef: np.npy_intp i, j, n = len(x), stride, itemsize char* x_ptr char* buf_ptr + axis = normalize_axis_index(axis, np.ndim(x)) + if type(x) is np.ndarray and x.ndim == 1 and x.size: # Fast, statically typed path: shuffle the underlying buffer. # Only for non-empty, 1d objects of class ndarray (subclasses such # as MaskedArrays may not support this approach). - x_ptr = <char*><size_t>x.ctypes.data + x_ptr = <char*><size_t>np.PyArray_DATA(x) stride = x.strides[0] itemsize = x.dtype.itemsize # As the array x could contain python objects we use a buffer @@ -3837,7 +4186,7 @@ cdef class Generator: # within the buffer and erroneously decrementing it's refcount # when the function exits. buf = np.empty(itemsize, dtype=np.int8) # GC'd at function exit - buf_ptr = <char*><size_t>buf.ctypes.data + buf_ptr = <char*><size_t>np.PyArray_DATA(buf) with self.lock: # We trick gcc into providing a specialized implementation for # the most common case, yielding a ~33% performance improvement. @@ -3847,9 +4196,10 @@ cdef class Generator: else: self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr) elif isinstance(x, np.ndarray) and x.ndim and x.size: + x = np.swapaxes(x, 0, axis) buf = np.empty_like(x[0, ...]) with self.lock: - for i in reversed(range(1, n)): + for i in reversed(range(1, len(x))): j = random_interval(&self._bitgen, i) if i == j: # i == j is not needed and memcpy is undefined. @@ -3859,6 +4209,9 @@ cdef class Generator: x[i] = buf else: # Untyped path. + if axis != 0: + raise NotImplementedError("Axis argument is only supported " + "on ndarray objects") with self.lock: for i in reversed(range(1, n)): j = random_interval(&self._bitgen, i) @@ -3914,13 +4267,11 @@ cdef class Generator: data[j] = data[i] data[i] = temp - def permutation(self, object x): + def permutation(self, object x, axis=0): """ - permutation(x) + permutation(x, axis=0) Randomly permute a sequence, or return a permuted range. - If `x` is a multi-dimensional array, it is only shuffled along its - first index. Parameters ---------- @@ -3928,6 +4279,8 @@ cdef class Generator: If `x` is an integer, randomly permute ``np.arange(x)``. If `x` is an array, make a copy and shuffle the elements randomly. + axis : int, optional + The axis which `x` is shuffled along. Default is 0. Returns ------- @@ -3953,16 +4306,22 @@ cdef class Generator: Traceback (most recent call last): ... 
numpy.AxisError: x must be an integer or at least 1-dimensional - """ + >>> arr = np.arange(9).reshape((3, 3)) + >>> rng.permutation(arr, axis=1) + array([[0, 2, 1], # random + [3, 5, 4], + [6, 8, 7]]) + + """ if isinstance(x, (int, np.integer)): arr = np.arange(x) self.shuffle(arr) return arr arr = np.asarray(x) - if arr.ndim < 1: - raise np.AxisError("x must be an integer or at least 1-dimensional") + + axis = normalize_axis_index(axis, arr.ndim) # shuffle has fast-path for 1-d if arr.ndim == 1: @@ -3973,9 +4332,11 @@ cdef class Generator: return arr # Shuffle index array, dtype to ensure fast path - idx = np.arange(arr.shape[0], dtype=np.intp) + idx = np.arange(arr.shape[axis], dtype=np.intp) self.shuffle(idx) - return arr[idx] + slices = [slice(None)]*arr.ndim + slices[axis] = idx + return arr[tuple(slices)] def default_rng(seed=None): @@ -3983,19 +4344,18 @@ def default_rng(seed=None): Parameters ---------- - seed : {None, int, array_like[ints], ISeedSequence, BitGenerator, Generator}, optional + seed : {None, int, array_like[ints], SeedSequence, BitGenerator, Generator}, optional A seed to initialize the `BitGenerator`. If None, then fresh, unpredictable entropy will be pulled from the OS. If an ``int`` or ``array_like[ints]`` is passed, then it will be passed to `SeedSequence` to derive the initial `BitGenerator` state. One may also - pass in an implementor of the `ISeedSequence` interface like - `SeedSequence`. + pass in a`SeedSequence` instance Additionally, when passed a `BitGenerator`, it will be wrapped by `Generator`. If passed a `Generator`, it will be returned unaltered. Notes ----- - When `seed` is omitted or ``None``, a new `BitGenerator` and `Generator` will + When ``seed`` is omitted or ``None``, a new `BitGenerator` and `Generator` will be instantiated each time. This function does not manage a default global instance. """ diff --git a/numpy/random/mt19937.pyx b/numpy/random/_mt19937.pyx index 49c3622f5..e99652b73 100644 --- a/numpy/random/mt19937.pyx +++ b/numpy/random/_mt19937.pyx @@ -3,9 +3,8 @@ import operator import numpy as np cimport numpy as np -from .common cimport * -from .bit_generator cimport BitGenerator, SeedSequence -from .entropy import random_entropy +from libc.stdint cimport uint32_t, uint64_t +from ._bit_generator cimport BitGenerator, SeedSequence __all__ = ['MT19937'] @@ -49,13 +48,12 @@ cdef class MT19937(BitGenerator): Parameters ---------- - seed : {None, int, array_like[ints], ISeedSequence}, optional + seed : {None, int, array_like[ints], SeedSequence}, optional A seed to initialize the `BitGenerator`. If None, then fresh, unpredictable entropy will be pulled from the OS. If an ``int`` or ``array_like[ints]`` is passed, then it will be passed to `SeedSequence` to derive the initial `BitGenerator` state. One may also - pass in an implementor of the `ISeedSequence` interface like - `SeedSequence`. + pass in a `SeedSequence` instance. Attributes ---------- @@ -156,7 +154,8 @@ cdef class MT19937(BitGenerator): Random seed initializing the pseudo-random number generator. Can be an integer in [0, 2**32-1], array of integers in [0, 2**32-1], a `SeedSequence, or ``None``. If `seed` - is ``None``, then sample entropy for a seed. + is ``None``, then fresh, unpredictable entropy will be pulled from + the OS. 
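[Editor's note] The seeding change just below replaces the removed entropy module with a ``SeedSequence``: when ``seed`` is None, fresh OS entropy is gathered and expanded into the key words. A minimal sketch of that step in pure Python (624 is the MT19937 state length, the ``RK_STATE_LEN`` used below):

    import numpy as np
    from numpy.random import SeedSequence

    ss = SeedSequence()              # entropy pulled from the OS
    key = ss.generate_state(624)     # 624 uint32 words for the MT19937 key
    assert key.dtype == np.uint32 and key.shape == (624,)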
Raises ------ @@ -167,7 +166,8 @@ cdef class MT19937(BitGenerator): with self.lock: try: if seed is None: - val = random_entropy(RK_STATE_LEN) + seed = SeedSequence() + val = seed.generate_state(RK_STATE_LEN) # MSB is 1; assuring non-zero initial array self.rng_state.key[0] = 0x80000000UL for i in range(1, RK_STATE_LEN): diff --git a/numpy/random/pcg64.pyx b/numpy/random/_pcg64.pyx index 585520139..1a5d852a2 100644 --- a/numpy/random/pcg64.pyx +++ b/numpy/random/_pcg64.pyx @@ -1,8 +1,9 @@ import numpy as np cimport numpy as np -from .common cimport * -from .bit_generator cimport BitGenerator +from libc.stdint cimport uint32_t, uint64_t +from ._common cimport uint64_to_double, wrap_int +from ._bit_generator cimport BitGenerator __all__ = ['PCG64'] @@ -43,13 +44,12 @@ cdef class PCG64(BitGenerator): Parameters ---------- - seed : {None, int, array_like[ints], ISeedSequence}, optional + seed : {None, int, array_like[ints], SeedSequence}, optional A seed to initialize the `BitGenerator`. If None, then fresh, unpredictable entropy will be pulled from the OS. If an ``int`` or ``array_like[ints]`` is passed, then it will be passed to `SeedSequence` to derive the initial `BitGenerator` state. One may also - pass in an implementor of the `ISeedSequence` interface like - `SeedSequence`. + pass in a `SeedSequence` instance. Notes ----- diff --git a/numpy/random/philox.pyx b/numpy/random/_philox.pyx index 8b7683017..9f136c32f 100644 --- a/numpy/random/philox.pyx +++ b/numpy/random/_philox.pyx @@ -6,9 +6,11 @@ except ImportError: from dummy_threading import Lock import numpy as np +cimport numpy as np -from .common cimport * -from .bit_generator cimport BitGenerator +from libc.stdint cimport uint32_t, uint64_t +from ._common cimport uint64_to_double, int_to_array, wrap_int +from ._bit_generator cimport BitGenerator __all__ = ['Philox'] @@ -62,21 +64,20 @@ cdef class Philox(BitGenerator): Parameters ---------- - seed : {None, int, array_like[ints], ISeedSequence}, optional + seed : {None, int, array_like[ints], SeedSequence}, optional A seed to initialize the `BitGenerator`. If None, then fresh, unpredictable entropy will be pulled from the OS. If an ``int`` or ``array_like[ints]`` is passed, then it will be passed to `SeedSequence` to derive the initial `BitGenerator` state. One may also - pass in an implementor of the `ISeedSequence` interface like - `SeedSequence`. + pass in a `SeedSequence` instance. counter : {None, int, array_like}, optional Counter to use in the Philox state. Can be either a Python int (long in 2.x) in [0, 2**256) or a 4-element uint64 array. If not provided, the RNG is initialized at 0. key : {None, int, array_like}, optional - Key to use in the Philox state. Unlike seed, the value in key is + Key to use in the Philox state. Unlike ``seed``, the value in key is directly set. Can be either a Python int in [0, 2**128) or a 2-element - uint64 array. `key` and `seed` cannot both be used. + uint64 array. `key` and ``seed`` cannot both be used. Attributes ---------- @@ -108,10 +109,10 @@ cdef class Philox(BitGenerator): randoms produced. The second is a key which determined the sequence produced. Using different keys produces independent sequences. - The input seed is processed by `SeedSequence` to generate the key. The + The input ``seed`` is processed by `SeedSequence` to generate the key. The counter is set to 0. - Alternately, one can omit the seed parameter and set the ``key`` and + Alternately, one can omit the ``seed`` parameter and set the ``key`` and ``counter`` directly. 
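[Editor's note] As noted above, Philox can be keyed directly instead of seeded. A small usage sketch (the key value is arbitrary; ``key`` and ``seed`` are mutually exclusive and the counter defaults to 0):

    from numpy.random import Generator, Philox

    bg = Philox(key=0x1234_5678_9abc_def0)   # any int in [0, 2**128)
    gen = Generator(bg)
    gen.standard_normal(3)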
**Parallel Features** @@ -146,7 +147,7 @@ cdef class Philox(BitGenerator): **Compatibility Guarantee** - ``Philox`` makes a guarantee that a fixed seed will always produce + ``Philox`` makes a guarantee that a fixed ``seed`` will always produce the same random integer stream. Examples diff --git a/numpy/random/_pickle.py b/numpy/random/_pickle.py index 3b58f21e8..29ff69644 100644 --- a/numpy/random/_pickle.py +++ b/numpy/random/_pickle.py @@ -1,10 +1,10 @@ from .mtrand import RandomState -from .philox import Philox -from .pcg64 import PCG64 -from .sfc64 import SFC64 +from ._philox import Philox +from ._pcg64 import PCG64 +from ._sfc64 import SFC64 -from .generator import Generator -from .mt19937 import MT19937 +from ._generator import Generator +from ._mt19937 import MT19937 BitGenerators = {'MT19937': MT19937, 'PCG64': PCG64, diff --git a/numpy/random/sfc64.pyx b/numpy/random/_sfc64.pyx index a881096e9..1633669d5 100644 --- a/numpy/random/sfc64.pyx +++ b/numpy/random/_sfc64.pyx @@ -1,8 +1,9 @@ import numpy as np cimport numpy as np -from .common cimport * -from .bit_generator cimport BitGenerator +from libc.stdint cimport uint32_t, uint64_t +from ._common cimport uint64_to_double +from ._bit_generator cimport BitGenerator __all__ = ['SFC64'] @@ -38,13 +39,12 @@ cdef class SFC64(BitGenerator): Parameters ---------- - seed : {None, int, array_like[ints], ISeedSequence}, optional + seed : {None, int, array_like[ints], SeedSequence}, optional A seed to initialize the `BitGenerator`. If None, then fresh, unpredictable entropy will be pulled from the OS. If an ``int`` or ``array_like[ints]`` is passed, then it will be passed to `SeedSequence` to derive the initial `BitGenerator` state. One may also - pass in an implementor of the `ISeedSequence` interface like - `SeedSequence`. + pass in a `SeedSequence` instance. 
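[Editor's note] The same seed-parameter wording change recurs for every bit generator in this diff: an explicit ``SeedSequence`` may be passed instead of raw entropy. One pattern this supports (sketch; the entropy value and stream count are arbitrary) is spawning child sequences for independent parallel streams:

    from numpy.random import Generator, SFC64, SeedSequence

    ss = SeedSequence(20190914)
    # Each spawned child seeds a statistically independent stream.
    streams = [Generator(SFC64(child)) for child in ss.spawn(4)]
    draws = [g.random(2) for g in streams]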
Notes ----- diff --git a/numpy/random/distributions.pxd b/numpy/random/distributions.pxd deleted file mode 100644 index 75edaee9d..000000000 --- a/numpy/random/distributions.pxd +++ /dev/null @@ -1,140 +0,0 @@ -#cython: language_level=3 - -from .common cimport (uint8_t, uint16_t, uint32_t, uint64_t, - int32_t, int64_t, bitgen_t) -import numpy as np -cimport numpy as np - -cdef extern from "src/distributions/distributions.h": - - struct s_binomial_t: - int has_binomial - double psave - int64_t nsave - double r - double q - double fm - int64_t m - double p1 - double xm - double xl - double xr - double c - double laml - double lamr - double p2 - double p3 - double p4 - - ctypedef s_binomial_t binomial_t - - double random_double(bitgen_t *bitgen_state) nogil - void random_double_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil - double random_standard_exponential(bitgen_t *bitgen_state) nogil - void random_standard_exponential_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil - double random_standard_exponential_zig(bitgen_t *bitgen_state) nogil - void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, np.npy_intp cnt, double *out) nogil - double random_gauss_zig(bitgen_t* bitgen_state) nogil - void random_gauss_zig_fill(bitgen_t *bitgen_state, np.npy_intp count, double *out) nogil - double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape) nogil - - float random_float(bitgen_t *bitgen_state) nogil - float random_standard_exponential_f(bitgen_t *bitgen_state) nogil - float random_standard_exponential_zig_f(bitgen_t *bitgen_state) nogil - float random_gauss_zig_f(bitgen_t* bitgen_state) nogil - float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) nogil - float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape) nogil - - int64_t random_positive_int64(bitgen_t *bitgen_state) nogil - int32_t random_positive_int32(bitgen_t *bitgen_state) nogil - int64_t random_positive_int(bitgen_t *bitgen_state) nogil - uint64_t random_uint(bitgen_t *bitgen_state) nogil - - double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale) nogil - - double random_gamma(bitgen_t *bitgen_state, double shape, double scale) nogil - float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale) nogil - - double random_exponential(bitgen_t *bitgen_state, double scale) nogil - double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil - double random_beta(bitgen_t *bitgen_state, double a, double b) nogil - double random_chisquare(bitgen_t *bitgen_state, double df) nogil - double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) nogil - double random_standard_cauchy(bitgen_t *bitgen_state) nogil - double random_pareto(bitgen_t *bitgen_state, double a) nogil - double random_weibull(bitgen_t *bitgen_state, double a) nogil - double random_power(bitgen_t *bitgen_state, double a) nogil - double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil - double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil - double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil - double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) nogil - double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil - double random_standard_t(bitgen_t *bitgen_state, double df) nogil - double random_noncentral_chisquare(bitgen_t *bitgen_state, double df, - double nonc) nogil - double random_noncentral_f(bitgen_t *bitgen_state, double dfnum, - 
double dfden, double nonc) nogil - double random_wald(bitgen_t *bitgen_state, double mean, double scale) nogil - double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil - double random_triangular(bitgen_t *bitgen_state, double left, double mode, - double right) nogil - - int64_t random_poisson(bitgen_t *bitgen_state, double lam) nogil - int64_t random_negative_binomial(bitgen_t *bitgen_state, double n, double p) nogil - int64_t random_binomial(bitgen_t *bitgen_state, double p, int64_t n, binomial_t *binomial) nogil - int64_t random_logseries(bitgen_t *bitgen_state, double p) nogil - int64_t random_geometric_search(bitgen_t *bitgen_state, double p) nogil - int64_t random_geometric_inversion(bitgen_t *bitgen_state, double p) nogil - int64_t random_geometric(bitgen_t *bitgen_state, double p) nogil - int64_t random_zipf(bitgen_t *bitgen_state, double a) nogil - int64_t random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, - int64_t sample) nogil - - uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) nogil - - # Generate random uint64 numbers in closed interval [off, off + rng]. - uint64_t random_bounded_uint64(bitgen_t *bitgen_state, - uint64_t off, uint64_t rng, - uint64_t mask, bint use_masked) nogil - - # Generate random uint32 numbers in closed interval [off, off + rng]. - uint32_t random_buffered_bounded_uint32(bitgen_t *bitgen_state, - uint32_t off, uint32_t rng, - uint32_t mask, bint use_masked, - int *bcnt, uint32_t *buf) nogil - uint16_t random_buffered_bounded_uint16(bitgen_t *bitgen_state, - uint16_t off, uint16_t rng, - uint16_t mask, bint use_masked, - int *bcnt, uint32_t *buf) nogil - uint8_t random_buffered_bounded_uint8(bitgen_t *bitgen_state, - uint8_t off, uint8_t rng, - uint8_t mask, bint use_masked, - int *bcnt, uint32_t *buf) nogil - np.npy_bool random_buffered_bounded_bool(bitgen_t *bitgen_state, - np.npy_bool off, np.npy_bool rng, - np.npy_bool mask, bint use_masked, - int *bcnt, uint32_t *buf) nogil - - void random_bounded_uint64_fill(bitgen_t *bitgen_state, - uint64_t off, uint64_t rng, np.npy_intp cnt, - bint use_masked, - uint64_t *out) nogil - void random_bounded_uint32_fill(bitgen_t *bitgen_state, - uint32_t off, uint32_t rng, np.npy_intp cnt, - bint use_masked, - uint32_t *out) nogil - void random_bounded_uint16_fill(bitgen_t *bitgen_state, - uint16_t off, uint16_t rng, np.npy_intp cnt, - bint use_masked, - uint16_t *out) nogil - void random_bounded_uint8_fill(bitgen_t *bitgen_state, - uint8_t off, uint8_t rng, np.npy_intp cnt, - bint use_masked, - uint8_t *out) nogil - void random_bounded_bool_fill(bitgen_t *bitgen_state, - np.npy_bool off, np.npy_bool rng, np.npy_intp cnt, - bint use_masked, - np.npy_bool *out) nogil - - void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix, - double *pix, np.npy_intp d, binomial_t *binomial) nogil diff --git a/numpy/random/entropy.pyx b/numpy/random/entropy.pyx deleted file mode 100644 index 95bf7c177..000000000 --- a/numpy/random/entropy.pyx +++ /dev/null @@ -1,155 +0,0 @@ -cimport numpy as np -import numpy as np - -from libc.stdint cimport uint32_t, uint64_t - -__all__ = ['random_entropy', 'seed_by_array'] - -np.import_array() - -cdef extern from "src/splitmix64/splitmix64.h": - cdef uint64_t splitmix64_next(uint64_t *state) nogil - -cdef extern from "src/entropy/entropy.h": - cdef bint entropy_getbytes(void* dest, size_t size) - cdef bint entropy_fallback_getbytes(void *dest, size_t size) - -cdef Py_ssize_t compute_numel(size): - cdef Py_ssize_t i, n = 1 
- if isinstance(size, tuple): - for i in range(len(size)): - n *= size[i] - else: - n = size - return n - - -def seed_by_array(object seed, Py_ssize_t n): - """ - Transforms a seed array into an initial state - - Parameters - ---------- - seed: ndarray, 1d, uint64 - Array to use. If seed is a scalar, promote to array. - n : int - Number of 64-bit unsigned integers required - - Notes - ----- - Uses splitmix64 to perform the transformation - """ - cdef uint64_t seed_copy = 0 - cdef uint64_t[::1] seed_array - cdef uint64_t[::1] initial_state - cdef Py_ssize_t seed_size, iter_bound - cdef int i, loc = 0 - - if hasattr(seed, 'squeeze'): - seed = seed.squeeze() - arr = np.asarray(seed) - if arr.shape == (): - err_msg = 'Scalar seeds must be integers between 0 and 2**64 - 1' - if not np.isreal(arr): - raise TypeError(err_msg) - int_seed = int(seed) - if int_seed != seed: - raise TypeError(err_msg) - if int_seed < 0 or int_seed > 2**64 - 1: - raise ValueError(err_msg) - seed_array = np.array([int_seed], dtype=np.uint64) - elif issubclass(arr.dtype.type, np.inexact): - raise TypeError('seed array must be integers') - else: - err_msg = "Seed values must be integers between 0 and 2**64 - 1" - obj = np.asarray(seed).astype(np.object) - if obj.ndim != 1: - raise ValueError('Array-valued seeds must be 1-dimensional') - if not np.isreal(obj).all(): - raise TypeError(err_msg) - if ((obj > int(2**64 - 1)) | (obj < 0)).any(): - raise ValueError(err_msg) - try: - obj_int = obj.astype(np.uint64, casting='unsafe') - except ValueError: - raise ValueError(err_msg) - if not (obj == obj_int).all(): - raise TypeError(err_msg) - seed_array = obj_int - - seed_size = seed_array.shape[0] - iter_bound = n if n > seed_size else seed_size - - initial_state = <np.ndarray>np.empty(n, dtype=np.uint64) - for i in range(iter_bound): - if i < seed_size: - seed_copy ^= seed_array[i] - initial_state[loc] = splitmix64_next(&seed_copy) - loc += 1 - if loc == n: - loc = 0 - - return np.array(initial_state) - - -def random_entropy(size=None, source='system'): - """ - random_entropy(size=None, source='system') - - Read entropy from the system cryptographic provider - - Parameters - ---------- - size : int or tuple of ints, optional - Output shape. If the given shape is, e.g., ``(m, n, k)``, then - ``m * n * k`` samples are drawn. Default is None, in which case a - single value is returned. - source : str {'system', 'fallback'} - Source of entropy. 'system' uses system cryptographic pool. - 'fallback' uses a hash of the time and process id. - - Returns - ------- - entropy : scalar or array - Entropy bits in 32-bit unsigned integers. A scalar is returned if size - is `None`. - - Notes - ----- - On Unix-like machines, reads from ``/dev/urandom``. On Windows machines - reads from the RSA algorithm provided by the cryptographic service - provider. - - This function reads from the system entropy pool and so samples are - not reproducible. In particular, it does *NOT* make use of a - BitGenerator, and so ``seed`` and setting ``state`` have no - effect. - - Raises RuntimeError if the command fails. 
- """ - cdef bint success = True - cdef Py_ssize_t n = 0 - cdef uint32_t random = 0 - cdef uint32_t [:] randoms - - if source not in ('system', 'fallback'): - raise ValueError('Unknown value in source.') - - if size is None: - if source == 'system': - success = entropy_getbytes(<void *>&random, 4) - else: - success = entropy_fallback_getbytes(<void *>&random, 4) - else: - n = compute_numel(size) - randoms = np.zeros(n, dtype=np.uint32) - if source == 'system': - success = entropy_getbytes(<void *>(&randoms[0]), 4 * n) - else: - success = entropy_fallback_getbytes(<void *>(&randoms[0]), 4 * n) - if not success: - raise RuntimeError('Unable to read from system cryptographic provider') - - if n == 0: - return random - return np.asarray(randoms).reshape(size) diff --git a/numpy/random/src/aligned_malloc/aligned_malloc.h b/numpy/random/include/aligned_malloc.h index ea24f6d23..ea24f6d23 100644 --- a/numpy/random/src/aligned_malloc/aligned_malloc.h +++ b/numpy/random/include/aligned_malloc.h diff --git a/numpy/random/include/bitgen.h b/numpy/random/include/bitgen.h new file mode 100644 index 000000000..83c2858dd --- /dev/null +++ b/numpy/random/include/bitgen.h @@ -0,0 +1,20 @@ +#ifndef _RANDOM_BITGEN_H +#define _RANDOM_BITGEN_H + +#pragma once +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> + +/* Must match the declaration in numpy/random/<any>.pxd */ + +typedef struct bitgen { + void *state; + uint64_t (*next_uint64)(void *st); + uint32_t (*next_uint32)(void *st); + double (*next_double)(void *st); + uint64_t (*next_raw)(void *st); +} bitgen_t; + + +#endif diff --git a/numpy/random/src/distributions/distributions.h b/numpy/random/include/distributions.h index c8cdfd20f..c02ea605e 100644 --- a/numpy/random/src/distributions/distributions.h +++ b/numpy/random/include/distributions.h @@ -1,15 +1,14 @@ #ifndef _RANDOMDGEN__DISTRIBUTIONS_H_ #define _RANDOMDGEN__DISTRIBUTIONS_H_ -#pragma once +#include "Python.h" +#include "numpy/npy_common.h" #include <stddef.h> #include <stdbool.h> #include <stdint.h> -#include "Python.h" -#include "numpy/npy_common.h" #include "numpy/npy_math.h" -#include "numpy/random/bitgen.h" +#include "include/bitgen.h" /* * RAND_INT_TYPE is used to share integer generators with RandomState which @@ -43,11 +42,11 @@ typedef struct s_binomial_t { int has_binomial; /* !=0: following parameters initialized for binomial */ double psave; - int64_t nsave; + RAND_INT_TYPE nsave; double r; double q; double fm; - int64_t m; + RAND_INT_TYPE m; double p1; double xm; double xl; @@ -60,28 +59,10 @@ typedef struct s_binomial_t { double p4; } binomial_t; -/* Inline generators for internal use */ -static NPY_INLINE uint32_t next_uint32(bitgen_t *bitgen_state) { - return bitgen_state->next_uint32(bitgen_state->state); -} - -static NPY_INLINE uint64_t next_uint64(bitgen_t *bitgen_state) { - return bitgen_state->next_uint64(bitgen_state->state); -} - -static NPY_INLINE float next_float(bitgen_t *bitgen_state) { - return (next_uint32(bitgen_state) >> 9) * (1.0f / 8388608.0f); -} - -static NPY_INLINE double next_double(bitgen_t *bitgen_state) { - return bitgen_state->next_double(bitgen_state->state); -} - -DECLDIR double loggam(double x); - -DECLDIR float random_float(bitgen_t *bitgen_state); -DECLDIR double random_double(bitgen_t *bitgen_state); -DECLDIR void random_double_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out); +DECLDIR float random_standard_uniform_f(bitgen_t *bitgen_state); +DECLDIR double random_standard_uniform(bitgen_t *bitgen_state); +DECLDIR void 
random_standard_uniform_fill(bitgen_t *, npy_intp, double *); +DECLDIR void random_standard_uniform_fill_f(bitgen_t *, npy_intp, float *); DECLDIR int64_t random_positive_int64(bitgen_t *bitgen_state); DECLDIR int32_t random_positive_int32(bitgen_t *bitgen_state); @@ -89,37 +70,25 @@ DECLDIR int64_t random_positive_int(bitgen_t *bitgen_state); DECLDIR uint64_t random_uint(bitgen_t *bitgen_state); DECLDIR double random_standard_exponential(bitgen_t *bitgen_state); -DECLDIR void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt, - double *out); DECLDIR float random_standard_exponential_f(bitgen_t *bitgen_state); DECLDIR double random_standard_exponential_zig(bitgen_t *bitgen_state); -DECLDIR void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, - npy_intp cnt, double *out); DECLDIR float random_standard_exponential_zig_f(bitgen_t *bitgen_state); - -/* -DECLDIR double random_gauss(bitgen_t *bitgen_state); -DECLDIR float random_gauss_f(bitgen_t *bitgen_state); -*/ -DECLDIR double random_gauss_zig(bitgen_t *bitgen_state); -DECLDIR float random_gauss_zig_f(bitgen_t *bitgen_state); -DECLDIR void random_gauss_zig_fill(bitgen_t *bitgen_state, npy_intp cnt, - double *out); - -/* +DECLDIR void random_standard_exponential_fill(bitgen_t *, npy_intp, double *); +DECLDIR void random_standard_exponential_fill_f(bitgen_t *, npy_intp, float *); +DECLDIR void random_standard_exponential_zig_fill(bitgen_t *, npy_intp, double *); +DECLDIR void random_standard_exponential_zig_fill_f(bitgen_t *, npy_intp, float *); + +DECLDIR double random_standard_normal(bitgen_t *bitgen_state); +DECLDIR float random_standard_normal_f(bitgen_t *bitgen_state); +DECLDIR void random_standard_normal_fill(bitgen_t *, npy_intp, double *); +DECLDIR void random_standard_normal_fill_f(bitgen_t *, npy_intp, float *); DECLDIR double random_standard_gamma(bitgen_t *bitgen_state, double shape); DECLDIR float random_standard_gamma_f(bitgen_t *bitgen_state, float shape); -*/ -DECLDIR double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape); -DECLDIR float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape); -/* DECLDIR double random_normal(bitgen_t *bitgen_state, double loc, double scale); -*/ -DECLDIR double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale); DECLDIR double random_gamma(bitgen_t *bitgen_state, double shape, double scale); -DECLDIR float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale); +DECLDIR float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale); DECLDIR double random_exponential(bitgen_t *bitgen_state, double scale); DECLDIR double random_uniform(bitgen_t *bitgen_state, double lower, double range); @@ -147,17 +116,16 @@ DECLDIR double random_triangular(bitgen_t *bitgen_state, double left, double mod DECLDIR RAND_INT_TYPE random_poisson(bitgen_t *bitgen_state, double lam); DECLDIR RAND_INT_TYPE random_negative_binomial(bitgen_t *bitgen_state, double n, - double p); -DECLDIR RAND_INT_TYPE random_binomial(bitgen_t *bitgen_state, double p, RAND_INT_TYPE n, - binomial_t *binomial); + double p); + +DECLDIR int64_t random_binomial(bitgen_t *bitgen_state, double p, + int64_t n, binomial_t *binomial); + DECLDIR RAND_INT_TYPE random_logseries(bitgen_t *bitgen_state, double p); -DECLDIR RAND_INT_TYPE random_geometric_search(bitgen_t *bitgen_state, double p); -DECLDIR RAND_INT_TYPE random_geometric_inversion(bitgen_t *bitgen_state, double p); DECLDIR RAND_INT_TYPE random_geometric(bitgen_t *bitgen_state, double p); DECLDIR 
RAND_INT_TYPE random_zipf(bitgen_t *bitgen_state, double a); DECLDIR int64_t random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, int64_t sample); - DECLDIR uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max); /* Generate random uint64 numbers in closed interval [off, off + rng]. */ @@ -202,4 +170,33 @@ DECLDIR void random_bounded_bool_fill(bitgen_t *bitgen_state, npy_bool off, DECLDIR void random_multinomial(bitgen_t *bitgen_state, RAND_INT_TYPE n, RAND_INT_TYPE *mnix, double *pix, npy_intp d, binomial_t *binomial); +/* multivariate hypergeometric, "count" method */ +DECLDIR int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates); + +/* multivariate hypergeometric, "marginals" method */ +DECLDIR void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates); + +/* Common to legacy-distributions.c and distributions.c but not exported */ + +RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state, + RAND_INT_TYPE n, + double p, + binomial_t *binomial); +RAND_INT_TYPE random_binomial_inversion(bitgen_t *bitgen_state, + RAND_INT_TYPE n, + double p, + binomial_t *binomial); +double random_loggam(double x); +static NPY_INLINE double next_double(bitgen_t *bitgen_state) { + return bitgen_state->next_double(bitgen_state->state); +} + #endif diff --git a/numpy/random/src/legacy/legacy-distributions.h b/numpy/random/include/legacy-distributions.h index 005c4e5d2..6a0fc7dc4 100644 --- a/numpy/random/src/legacy/legacy-distributions.h +++ b/numpy/random/include/legacy-distributions.h @@ -2,7 +2,7 @@ #define _RANDOMDGEN__DISTRIBUTIONS_LEGACY_H_ -#include "../distributions/distributions.h" +#include "distributions.h" typedef struct aug_bitgen { bitgen_t *bit_generator; @@ -16,26 +16,23 @@ extern double legacy_pareto(aug_bitgen_t *aug_state, double a); extern double legacy_weibull(aug_bitgen_t *aug_state, double a); extern double legacy_power(aug_bitgen_t *aug_state, double a); extern double legacy_gamma(aug_bitgen_t *aug_state, double shape, double scale); -extern double legacy_pareto(aug_bitgen_t *aug_state, double a); -extern double legacy_weibull(aug_bitgen_t *aug_state, double a); extern double legacy_chisquare(aug_bitgen_t *aug_state, double df); extern double legacy_noncentral_chisquare(aug_bitgen_t *aug_state, double df, double nonc); - extern double legacy_noncentral_f(aug_bitgen_t *aug_state, double dfnum, double dfden, double nonc); extern double legacy_wald(aug_bitgen_t *aug_state, double mean, double scale); extern double legacy_lognormal(aug_bitgen_t *aug_state, double mean, double sigma); extern double legacy_standard_t(aug_bitgen_t *aug_state, double df); -extern int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, - double p); extern double legacy_standard_cauchy(aug_bitgen_t *state); extern double legacy_beta(aug_bitgen_t *aug_state, double a, double b); extern double legacy_f(aug_bitgen_t *aug_state, double dfnum, double dfden); extern double legacy_normal(aug_bitgen_t *aug_state, double loc, double scale); extern double legacy_standard_gamma(aug_bitgen_t *aug_state, double shape); extern double legacy_exponential(aug_bitgen_t *aug_state, double scale); +extern int64_t legacy_random_binomial(bitgen_t *bitgen_state, double p, + int64_t n, binomial_t *binomial); extern int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, 
double p); extern int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, diff --git a/numpy/random/info.py b/numpy/random/info.py deleted file mode 100644 index b9fd7f26a..000000000 --- a/numpy/random/info.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import division, absolute_import, print_function - -from .. import __doc__ - -depends = ['core'] diff --git a/numpy/random/legacy_distributions.pxd b/numpy/random/legacy_distributions.pxd deleted file mode 100644 index 7ba058054..000000000 --- a/numpy/random/legacy_distributions.pxd +++ /dev/null @@ -1,48 +0,0 @@ -#cython: language_level=3 - -from libc.stdint cimport int64_t - -import numpy as np -cimport numpy as np - -from .distributions cimport bitgen_t, binomial_t - -cdef extern from "legacy-distributions.h": - - struct aug_bitgen: - bitgen_t *bit_generator - int has_gauss - double gauss - - ctypedef aug_bitgen aug_bitgen_t - - double legacy_gauss(aug_bitgen_t *aug_state) nogil - double legacy_pareto(aug_bitgen_t *aug_state, double a) nogil - double legacy_weibull(aug_bitgen_t *aug_state, double a) nogil - double legacy_standard_gamma(aug_bitgen_t *aug_state, double shape) nogil - double legacy_normal(aug_bitgen_t *aug_state, double loc, double scale) nogil - double legacy_standard_t(aug_bitgen_t *aug_state, double df) nogil - - double legacy_standard_exponential(aug_bitgen_t *aug_state) nogil - double legacy_power(aug_bitgen_t *aug_state, double a) nogil - double legacy_gamma(aug_bitgen_t *aug_state, double shape, double scale) nogil - double legacy_power(aug_bitgen_t *aug_state, double a) nogil - double legacy_chisquare(aug_bitgen_t *aug_state, double df) nogil - double legacy_noncentral_chisquare(aug_bitgen_t *aug_state, double df, - double nonc) nogil - double legacy_noncentral_f(aug_bitgen_t *aug_state, double dfnum, double dfden, - double nonc) nogil - double legacy_wald(aug_bitgen_t *aug_state, double mean, double scale) nogil - double legacy_lognormal(aug_bitgen_t *aug_state, double mean, double sigma) nogil - int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, double p) nogil - int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, int64_t sample) nogil - int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) nogil - int64_t legacy_random_poisson(bitgen_t *bitgen_state, double lam) nogil - int64_t legacy_random_zipf(bitgen_t *bitgen_state, double a) nogil - int64_t legacy_random_geometric(bitgen_t *bitgen_state, double p) nogil - void legacy_random_multinomial(bitgen_t *bitgen_state, long n, long *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil - double legacy_standard_cauchy(aug_bitgen_t *state) nogil - double legacy_beta(aug_bitgen_t *aug_state, double a, double b) nogil - double legacy_f(aug_bitgen_t *aug_state, double dfnum, double dfden) nogil - double legacy_exponential(aug_bitgen_t *aug_state, double scale) nogil - double legacy_power(aug_bitgen_t *state, double a) nogil diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx index eb263cd2d..683a771cc 100644 --- a/numpy/random/mtrand.pyx +++ b/numpy/random/mtrand.pyx @@ -5,19 +5,100 @@ import warnings import numpy as np -from .bounded_integers import _integers_types -from .mt19937 import MT19937 as _MT19937 from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer from cpython cimport (Py_INCREF, PyFloat_AsDouble) -from libc cimport string - cimport cython cimport numpy as np -from .bounded_integers cimport * -from .common cimport * -from .distributions cimport * -from 
.legacy_distributions cimport * +from libc cimport string +from libc.stdint cimport int64_t, uint64_t +from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64, + _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16, + _rand_uint8,) +from ._bounded_integers import _integers_types +from ._mt19937 import MT19937 as _MT19937 +from ._bit_generator cimport bitgen_t +from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE, + CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1, CONS_GTE_1, + CONS_GT_1, LEGACY_CONS_POISSON, + double_fill, cont, kahan_sum, cont_broadcast_3, + check_array_constraint, check_constraint, disc, discrete_broadcast_iii, + ) + +cdef extern from "include/distributions.h": + struct s_binomial_t: + int has_binomial + double psave + int64_t nsave + double r + double q + double fm + int64_t m + double p1 + double xm + double xl + double xr + double c + double laml + double lamr + double p2 + double p3 + double p4 + + ctypedef s_binomial_t binomial_t + + void random_standard_uniform_fill(bitgen_t* bitgen_state, np.npy_intp cnt, double *out) nogil + int64_t random_positive_int(bitgen_t *bitgen_state) nogil + double random_uniform(bitgen_t *bitgen_state, double lower, double range) nogil + double random_vonmises(bitgen_t *bitgen_state, double mu, double kappa) nogil + double random_laplace(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_gumbel(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_logistic(bitgen_t *bitgen_state, double loc, double scale) nogil + double random_rayleigh(bitgen_t *bitgen_state, double mode) nogil + double random_triangular(bitgen_t *bitgen_state, double left, double mode, + double right) nogil + uint64_t random_interval(bitgen_t *bitgen_state, uint64_t max) nogil + +cdef extern from "include/legacy-distributions.h": + struct aug_bitgen: + bitgen_t *bit_generator + int has_gauss + double gauss + + ctypedef aug_bitgen aug_bitgen_t + + double legacy_gauss(aug_bitgen_t *aug_state) nogil + double legacy_pareto(aug_bitgen_t *aug_state, double a) nogil + double legacy_weibull(aug_bitgen_t *aug_state, double a) nogil + double legacy_standard_gamma(aug_bitgen_t *aug_state, double shape) nogil + double legacy_normal(aug_bitgen_t *aug_state, double loc, double scale) nogil + double legacy_standard_t(aug_bitgen_t *aug_state, double df) nogil + + double legacy_standard_exponential(aug_bitgen_t *aug_state) nogil + double legacy_power(aug_bitgen_t *aug_state, double a) nogil + double legacy_gamma(aug_bitgen_t *aug_state, double shape, double scale) nogil + double legacy_power(aug_bitgen_t *aug_state, double a) nogil + double legacy_chisquare(aug_bitgen_t *aug_state, double df) nogil + double legacy_noncentral_chisquare(aug_bitgen_t *aug_state, double df, + double nonc) nogil + double legacy_noncentral_f(aug_bitgen_t *aug_state, double dfnum, double dfden, + double nonc) nogil + double legacy_wald(aug_bitgen_t *aug_state, double mean, double scale) nogil + double legacy_lognormal(aug_bitgen_t *aug_state, double mean, double sigma) nogil + int64_t legacy_random_binomial(bitgen_t *bitgen_state, double p, + int64_t n, binomial_t *binomial) nogil + int64_t legacy_negative_binomial(aug_bitgen_t *aug_state, double n, double p) nogil + int64_t legacy_random_hypergeometric(bitgen_t *bitgen_state, int64_t good, int64_t bad, int64_t sample) nogil + int64_t legacy_random_logseries(bitgen_t *bitgen_state, double p) nogil + int64_t legacy_random_poisson(bitgen_t *bitgen_state, double lam) nogil + 
int64_t legacy_random_zipf(bitgen_t *bitgen_state, double a) nogil + int64_t legacy_random_geometric(bitgen_t *bitgen_state, double p) nogil + void legacy_random_multinomial(bitgen_t *bitgen_state, long n, long *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil + double legacy_standard_cauchy(aug_bitgen_t *state) nogil + double legacy_beta(aug_bitgen_t *aug_state, double a, double b) nogil + double legacy_f(aug_bitgen_t *aug_state, double dfnum, double dfden) nogil + double legacy_exponential(aug_bitgen_t *aug_state, double scale) nogil + double legacy_power(aug_bitgen_t *state, double a) nogil np.import_array() @@ -83,8 +164,8 @@ cdef class RandomState: See Also -------- Generator - mt19937.MT19937 - Bit_Generators + MT19937 + numpy.random.BitGenerator """ cdef public object _bit_generator @@ -329,7 +410,7 @@ cdef class RandomState: """ cdef double temp - return double_fill(&random_double_fill, &self._bitgen, size, self.lock, None) + return double_fill(&random_standard_uniform_fill, &self._bitgen, size, self.lock, None) def random(self, size=None): """ @@ -567,7 +648,7 @@ cdef class RandomState: See Also -------- - random.random_integers : similar to `randint`, only for the closed + random_integers : similar to `randint`, only for the closed interval [`low`, `high`], and 1 is the lowest value if `high` is omitted. @@ -985,7 +1066,7 @@ cdef class RandomState: .. note:: This is a convenience function for users porting code from Matlab, - and wraps `numpy.random.random_sample`. That function takes a + and wraps `random_sample`. That function takes a tuple to specify the size of the output, which is consistent with other NumPy functions like `numpy.zeros` and `numpy.ones`. @@ -1029,7 +1110,7 @@ cdef class RandomState: .. note:: This is a convenience function for users porting code from Matlab, - and wraps `numpy.random.standard_normal`. That function takes a + and wraps `standard_normal`. That function takes a tuple to specify the size of the output, which is consistent with other NumPy functions like `numpy.zeros` and `numpy.ones`. @@ -1289,8 +1370,8 @@ cdef class RandomState: The function has its peak at the mean, and its "spread" increases with the standard deviation (the function reaches 0.607 times its maximum at :math:`x + \\sigma` and :math:`x - \\sigma` [2]_). This implies that - `numpy.random.normal` is more likely to return samples lying close to - the mean, rather than those far away. + normal is more likely to return samples lying close to the mean, rather + than those far away. 
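The 0.607 figure quoted in the docstring paragraph above can be checked directly from the normal density: one standard deviation away from the mean the density falls to e^{-1/2} of its peak value,

    \frac{p(\mu \pm \sigma)}{p(\mu)} = \exp\!\left(-\frac{\sigma^{2}}{2\sigma^{2}}\right) = e^{-1/2} \approx 0.6065 \approx 0.607 .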
References ---------- @@ -3086,7 +3167,9 @@ cdef class RandomState: for i in range(cnt): _dp = (<double*>np.PyArray_MultiIter_DATA(it, 1))[0] _in = (<long*>np.PyArray_MultiIter_DATA(it, 2))[0] - (<long*>np.PyArray_MultiIter_DATA(it, 0))[0] = random_binomial(&self._bitgen, _dp, _in, &self._binomial) + (<long*>np.PyArray_MultiIter_DATA(it, 0))[0] = \ + legacy_random_binomial(&self._bitgen, _dp, _in, + &self._binomial) np.PyArray_MultiIter_NEXT(it) @@ -3099,7 +3182,8 @@ cdef class RandomState: if size is None: with self.lock: - return random_binomial(&self._bitgen, _dp, _in, &self._binomial) + return <long>legacy_random_binomial(&self._bitgen, _dp, _in, + &self._binomial) randoms = <np.ndarray>np.empty(size, int) cnt = np.PyArray_SIZE(randoms) @@ -3107,8 +3191,8 @@ cdef class RandomState: with self.lock, nogil: for i in range(cnt): - randoms_data[i] = random_binomial(&self._bitgen, _dp, _in, - &self._binomial) + randoms_data[i] = legacy_random_binomial(&self._bitgen, _dp, _in, + &self._binomial) return randoms @@ -3517,7 +3601,7 @@ cdef class RandomState: # Convert to int64, if necessary, to use int64 infrastructure ongood = ongood.astype(np.int64) onbad = onbad.astype(np.int64) - onbad = onbad.astype(np.int64) + onsample = onsample.astype(np.int64) out = discrete_broadcast_iii(&legacy_random_hypergeometric,&self._bitgen, size, self.lock, ongood, 'ngood', CONS_NON_NEGATIVE, onbad, 'nbad', CONS_NON_NEGATIVE, @@ -4070,7 +4154,7 @@ cdef class RandomState: # Fast, statically typed path: shuffle the underlying buffer. # Only for non-empty, 1d objects of class ndarray (subclasses such # as MaskedArrays may not support this approach). - x_ptr = <char*><size_t>x.ctypes.data + x_ptr = <char*><size_t>np.PyArray_DATA(x) stride = x.strides[0] itemsize = x.dtype.itemsize # As the array x could contain python objects we use a buffer @@ -4078,7 +4162,7 @@ cdef class RandomState: # within the buffer and erroneously decrementing it's refcount # when the function exits. buf = np.empty(itemsize, dtype=np.int8) # GC'd at function exit - buf_ptr = <char*><size_t>buf.ctypes.data + buf_ptr = <char*><size_t>np.PyArray_DATA(buf) with self.lock: # We trick gcc into providing a specialized implementation for # the most common case, yielding a ~33% performance improvement. diff --git a/numpy/random/setup.py b/numpy/random/setup.py index a820d326e..ca01250f4 100644 --- a/numpy/random/setup.py +++ b/numpy/random/setup.py @@ -34,8 +34,6 @@ def configuration(parent_package='', top_path=None): defs.append(('NPY_NO_DEPRECATED_API', 0)) config.add_data_dir('tests') - config.add_data_files('common.pxd') - config.add_data_files('bit_generator.pxd') EXTRA_LINK_ARGS = [] # Math lib @@ -61,46 +59,34 @@ def configuration(parent_package='', top_path=None): # One can force emulated 128-bit arithmetic if one wants. 
#PCG64_DEFS += [('PCG_FORCE_EMULATED_128BIT_MATH', '1')] - config.add_extension('entropy', - sources=['entropy.c', 'src/entropy/entropy.c'] + - [generate_libraries], - libraries=EXTRA_LIBRARIES, - extra_compile_args=EXTRA_COMPILE_ARGS, - extra_link_args=EXTRA_LINK_ARGS, - depends=[join('src', 'splitmix64', 'splitmix.h'), - join('src', 'entropy', 'entropy.h'), - 'entropy.pyx', - ], - define_macros=defs, - ) for gen in ['mt19937']: # gen.pyx, src/gen/gen.c, src/gen/gen-jump.c - config.add_extension(gen, - sources=['{0}.c'.format(gen), + config.add_extension('_{0}'.format(gen), + sources=['_{0}.c'.format(gen), 'src/{0}/{0}.c'.format(gen), 'src/{0}/{0}-jump.c'.format(gen)], include_dirs=['.', 'src', join('src', gen)], libraries=EXTRA_LIBRARIES, extra_compile_args=EXTRA_COMPILE_ARGS, extra_link_args=EXTRA_LINK_ARGS, - depends=['%s.pyx' % gen], + depends=['_%s.pyx' % gen], define_macros=defs, ) for gen in ['philox', 'pcg64', 'sfc64']: # gen.pyx, src/gen/gen.c _defs = defs + PCG64_DEFS if gen == 'pcg64' else defs - config.add_extension(gen, - sources=['{0}.c'.format(gen), + config.add_extension('_{0}'.format(gen), + sources=['_{0}.c'.format(gen), 'src/{0}/{0}.c'.format(gen)], include_dirs=['.', 'src', join('src', gen)], libraries=EXTRA_LIBRARIES, extra_compile_args=EXTRA_COMPILE_ARGS, extra_link_args=EXTRA_LINK_ARGS, - depends=['%s.pyx' % gen, 'bit_generator.pyx', + depends=['_%s.pyx' % gen, 'bit_generator.pyx', 'bit_generator.pxd'], define_macros=_defs, ) - for gen in ['common', 'bit_generator']: + for gen in ['_common', '_bit_generator']: # gen.pyx config.add_extension(gen, sources=['{0}.c'.format(gen)], @@ -114,9 +100,11 @@ def configuration(parent_package='', top_path=None): other_srcs = [ 'src/distributions/logfactorial.c', 'src/distributions/distributions.c', + 'src/distributions/random_mvhg_count.c', + 'src/distributions/random_mvhg_marginals.c', 'src/distributions/random_hypergeometric.c', ] - for gen in ['generator', 'bounded_integers']: + for gen in ['_generator', '_bounded_integers']: # gen.pyx, src/distributions/distributions.c config.add_extension(gen, sources=['{0}.c'.format(gen)] + other_srcs, @@ -128,7 +116,6 @@ def configuration(parent_package='', top_path=None): define_macros=defs, ) config.add_extension('mtrand', - # mtrand does not depend on random_hypergeometric.c. sources=['mtrand.c', 'src/legacy/legacy-distributions.c', 'src/distributions/logfactorial.c', diff --git a/numpy/random/src/aligned_malloc/aligned_malloc.c b/numpy/random/src/aligned_malloc/aligned_malloc.c deleted file mode 100644 index 6e8192cfb..000000000 --- a/numpy/random/src/aligned_malloc/aligned_malloc.c +++ /dev/null @@ -1,9 +0,0 @@ -#include "aligned_malloc.h" - -static NPY_INLINE void *PyArray_realloc_aligned(void *p, size_t n); - -static NPY_INLINE void *PyArray_malloc_aligned(size_t n); - -static NPY_INLINE void *PyArray_calloc_aligned(size_t n, size_t s); - -static NPY_INLINE void PyArray_free_aligned(void *p);
\ No newline at end of file diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c index 65257ecbf..b382ead0b 100644 --- a/numpy/random/src/distributions/distributions.c +++ b/numpy/random/src/distributions/distributions.c @@ -1,4 +1,4 @@ -#include "distributions.h" +#include "include/distributions.h" #include "ziggurat_constants.h" #include "logfactorial.h" @@ -6,90 +6,52 @@ #include <intrin.h> #endif -/* Random generators for external use */ -float random_float(bitgen_t *bitgen_state) { return next_float(bitgen_state); } - -double random_double(bitgen_t *bitgen_state) { - return next_double(bitgen_state); +/* Inline generators for internal use */ +static NPY_INLINE uint32_t next_uint32(bitgen_t *bitgen_state) { + return bitgen_state->next_uint32(bitgen_state->state); } - -static NPY_INLINE double next_standard_exponential(bitgen_t *bitgen_state) { - return -log(1.0 - next_double(bitgen_state)); +static NPY_INLINE uint64_t next_uint64(bitgen_t *bitgen_state) { + return bitgen_state->next_uint64(bitgen_state->state); } -double random_standard_exponential(bitgen_t *bitgen_state) { - return next_standard_exponential(bitgen_state); +static NPY_INLINE float next_float(bitgen_t *bitgen_state) { + return (next_uint32(bitgen_state) >> 9) * (1.0f / 8388608.0f); } -void random_standard_exponential_fill(bitgen_t *bitgen_state, npy_intp cnt, - double *out) { - npy_intp i; - for (i = 0; i < cnt; i++) { - out[i] = next_standard_exponential(bitgen_state); - } +/* Random generators for external use */ +float random_standard_uniform_f(bitgen_t *bitgen_state) { + return next_float(bitgen_state); } -float random_standard_exponential_f(bitgen_t *bitgen_state) { - return -logf(1.0f - next_float(bitgen_state)); +double random_standard_uniform(bitgen_t *bitgen_state) { + return next_double(bitgen_state); } -void random_double_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) { +void random_standard_uniform_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) { npy_intp i; for (i = 0; i < cnt; i++) { out[i] = next_double(bitgen_state); } } -#if 0 -double random_gauss(bitgen_t *bitgen_state) { - if (bitgen_state->has_gauss) { - const double temp = bitgen_state->gauss; - bitgen_state->has_gauss = false; - bitgen_state->gauss = 0.0; - return temp; - } else { - double f, x1, x2, r2; - - do { - x1 = 2.0 * next_double(bitgen_state) - 1.0; - x2 = 2.0 * next_double(bitgen_state) - 1.0; - r2 = x1 * x1 + x2 * x2; - } while (r2 >= 1.0 || r2 == 0.0); - /* Polar method, a more efficient version of the Box-Muller approach. */ - f = sqrt(-2.0 * log(r2) / r2); - /* Keep for next call */ - bitgen_state->gauss = f * x1; - bitgen_state->has_gauss = true; - return f * x2; +void random_standard_uniform_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) { + npy_intp i; + for (i = 0; i < cnt; i++) { + out[i] = next_float(bitgen_state); } } -float random_gauss_f(bitgen_t *bitgen_state) { - if (bitgen_state->has_gauss_f) { - const float temp = bitgen_state->gauss_f; - bitgen_state->has_gauss_f = false; - bitgen_state->gauss_f = 0.0f; - return temp; - } else { - float f, x1, x2, r2; - - do { - x1 = 2.0f * next_float(bitgen_state) - 1.0f; - x2 = 2.0f * next_float(bitgen_state) - 1.0f; - r2 = x1 * x1 + x2 * x2; - } while (r2 >= 1.0 || r2 == 0.0); +double random_standard_exponential(bitgen_t *bitgen_state) { + return -log(1.0 - next_double(bitgen_state)); +} - /* Polar method, a more efficient version of the Box-Muller approach. 
*/ - f = sqrtf(-2.0f * logf(r2) / r2); - /* Keep for next call */ - bitgen_state->gauss_f = f * x1; - bitgen_state->has_gauss_f = true; - return f * x2; +void random_standard_exponential_fill(bitgen_t * bitgen_state, npy_intp cnt, double * out) +{ + npy_intp i; + for (i = 0; i < cnt; i++) { + out[i] = random_standard_exponential(bitgen_state); } } -#endif - -static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state); static double standard_exponential_zig_unlikely(bitgen_t *bitgen_state, uint8_t idx, double x) { @@ -101,11 +63,11 @@ static double standard_exponential_zig_unlikely(bitgen_t *bitgen_state, exp(-x)) { return x; } else { - return standard_exponential_zig(bitgen_state); + return random_standard_exponential_zig(bitgen_state); } } -static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state) { +double random_standard_exponential_zig(bitgen_t *bitgen_state) { uint64_t ri; uint8_t idx; double x; @@ -120,20 +82,26 @@ static NPY_INLINE double standard_exponential_zig(bitgen_t *bitgen_state) { return standard_exponential_zig_unlikely(bitgen_state, idx, x); } -double random_standard_exponential_zig(bitgen_t *bitgen_state) { - return standard_exponential_zig(bitgen_state); +void random_standard_exponential_zig_fill(bitgen_t * bitgen_state, npy_intp cnt, double * out) +{ + npy_intp i; + for (i = 0; i < cnt; i++) { + out[i] = random_standard_exponential_zig(bitgen_state); + } +} + +float random_standard_exponential_f(bitgen_t *bitgen_state) { + return -logf(1.0f - next_float(bitgen_state)); } -void random_standard_exponential_zig_fill(bitgen_t *bitgen_state, npy_intp cnt, - double *out) { +void random_standard_exponential_fill_f(bitgen_t * bitgen_state, npy_intp cnt, float * out) +{ npy_intp i; for (i = 0; i < cnt; i++) { - out[i] = standard_exponential_zig(bitgen_state); + out[i] = random_standard_exponential_f(bitgen_state); } } -static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state); - static float standard_exponential_zig_unlikely_f(bitgen_t *bitgen_state, uint8_t idx, float x) { if (idx == 0) { @@ -144,11 +112,11 @@ static float standard_exponential_zig_unlikely_f(bitgen_t *bitgen_state, expf(-x)) { return x; } else { - return standard_exponential_zig_f(bitgen_state); + return random_standard_exponential_zig_f(bitgen_state); } } -static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state) { +float random_standard_exponential_zig_f(bitgen_t *bitgen_state) { uint32_t ri; uint8_t idx; float x; @@ -163,11 +131,15 @@ static NPY_INLINE float standard_exponential_zig_f(bitgen_t *bitgen_state) { return standard_exponential_zig_unlikely_f(bitgen_state, idx, x); } -float random_standard_exponential_zig_f(bitgen_t *bitgen_state) { - return standard_exponential_zig_f(bitgen_state); +void random_standard_exponential_zig_fill_f(bitgen_t * bitgen_state, npy_intp cnt, float * out) +{ + npy_intp i; + for (i = 0; i < cnt; i++) { + out[i] = random_standard_exponential_zig_f(bitgen_state); + } } -static NPY_INLINE double next_gauss_zig(bitgen_t *bitgen_state) { +double random_standard_normal(bitgen_t *bitgen_state) { uint64_t r; int sign; uint64_t rabs; @@ -202,18 +174,14 @@ static NPY_INLINE double next_gauss_zig(bitgen_t *bitgen_state) { } } -double random_gauss_zig(bitgen_t *bitgen_state) { - return next_gauss_zig(bitgen_state); -} - -void random_gauss_zig_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) { +void random_standard_normal_fill(bitgen_t *bitgen_state, npy_intp cnt, double *out) { npy_intp i; for (i = 0; i < cnt; i++) { - 
out[i] = next_gauss_zig(bitgen_state); + out[i] = random_standard_normal(bitgen_state); } } -float random_gauss_zig_f(bitgen_t *bitgen_state) { +float random_standard_normal_f(bitgen_t *bitgen_state) { uint32_t r; int sign; uint32_t rabs; @@ -247,101 +215,14 @@ float random_gauss_zig_f(bitgen_t *bitgen_state) { } } -/* -static NPY_INLINE double standard_gamma(bitgen_t *bitgen_state, double shape) { - double b, c; - double U, V, X, Y; - - if (shape == 1.0) { - return random_standard_exponential(bitgen_state); - } else if (shape < 1.0) { - for (;;) { - U = next_double(bitgen_state); - V = random_standard_exponential(bitgen_state); - if (U <= 1.0 - shape) { - X = pow(U, 1. / shape); - if (X <= V) { - return X; - } - } else { - Y = -log((1 - U) / shape); - X = pow(1.0 - shape + shape * Y, 1. / shape); - if (X <= (V + Y)) { - return X; - } - } - } - } else { - b = shape - 1. / 3.; - c = 1. / sqrt(9 * b); - for (;;) { - do { - X = random_gauss(bitgen_state); - V = 1.0 + c * X; - } while (V <= 0.0); - - V = V * V * V; - U = next_double(bitgen_state); - if (U < 1.0 - 0.0331 * (X * X) * (X * X)) - return (b * V); - if (log(U) < 0.5 * X * X + b * (1. - V + log(V))) - return (b * V); - } - } -} - -static NPY_INLINE float standard_gamma_float(bitgen_t *bitgen_state, float -shape) { float b, c; float U, V, X, Y; - - if (shape == 1.0f) { - return random_standard_exponential_f(bitgen_state); - } else if (shape < 1.0f) { - for (;;) { - U = next_float(bitgen_state); - V = random_standard_exponential_f(bitgen_state); - if (U <= 1.0f - shape) { - X = powf(U, 1.0f / shape); - if (X <= V) { - return X; - } - } else { - Y = -logf((1.0f - U) / shape); - X = powf(1.0f - shape + shape * Y, 1.0f / shape); - if (X <= (V + Y)) { - return X; - } - } - } - } else { - b = shape - 1.0f / 3.0f; - c = 1.0f / sqrtf(9.0f * b); - for (;;) { - do { - X = random_gauss_f(bitgen_state); - V = 1.0f + c * X; - } while (V <= 0.0f); - - V = V * V * V; - U = next_float(bitgen_state); - if (U < 1.0f - 0.0331f * (X * X) * (X * X)) - return (b * V); - if (logf(U) < 0.5f * X * X + b * (1.0f - V + logf(V))) - return (b * V); - } +void random_standard_normal_fill_f(bitgen_t *bitgen_state, npy_intp cnt, float *out) { + npy_intp i; + for (i = 0; i < cnt; i++) { + out[i] = random_standard_normal_f(bitgen_state); } } - -double random_standard_gamma(bitgen_t *bitgen_state, double shape) { - return standard_gamma(bitgen_state, shape); -} - -float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) { - return standard_gamma_float(bitgen_state, shape); -} -*/ - -static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state, +double random_standard_gamma(bitgen_t *bitgen_state, double shape) { double b, c; double U, V, X, Y; @@ -372,7 +253,7 @@ static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state, c = 1. 
/ sqrt(9 * b); for (;;) { do { - X = random_gauss_zig(bitgen_state); + X = random_standard_normal(bitgen_state); V = 1.0 + c * X; } while (V <= 0.0); @@ -387,7 +268,7 @@ static NPY_INLINE double standard_gamma_zig(bitgen_t *bitgen_state, } } -static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state, +float random_standard_gamma_f(bitgen_t *bitgen_state, float shape) { float b, c; float U, V, X, Y; @@ -418,7 +299,7 @@ static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state, c = 1.0f / sqrtf(9.0f * b); for (;;) { do { - X = random_gauss_zig_f(bitgen_state); + X = random_standard_normal_f(bitgen_state); V = 1.0f + c * X; } while (V <= 0.0f); @@ -433,14 +314,6 @@ static NPY_INLINE float standard_gamma_zig_f(bitgen_t *bitgen_state, } } -double random_standard_gamma_zig(bitgen_t *bitgen_state, double shape) { - return standard_gamma_zig(bitgen_state, shape); -} - -float random_standard_gamma_zig_f(bitgen_t *bitgen_state, float shape) { - return standard_gamma_zig_f(bitgen_state, shape); -} - int64_t random_positive_int64(bitgen_t *bitgen_state) { return next_uint64(bitgen_state) >> 1; } @@ -470,10 +343,10 @@ uint64_t random_uint(bitgen_t *bitgen_state) { * algorithm comes from SPECFUN by Shanjie Zhang and Jianming Jin and their * book "Computation of Special Functions", 1996, John Wiley & Sons, Inc. * - * If loggam(k+1) is being used to compute log(k!) for an integer k, consider + * If random_loggam(k+1) is being used to compute log(k!) for an integer k, consider * using logfactorial(k) instead. */ -double loggam(double x) { +double random_loggam(double x) { double x0, x2, xp, gl, gl0; RAND_INT_TYPE k, n; @@ -513,12 +386,12 @@ double random_normal(bitgen_t *bitgen_state, double loc, double scale) { } */ -double random_normal_zig(bitgen_t *bitgen_state, double loc, double scale) { - return loc + scale * random_gauss_zig(bitgen_state); +double random_normal(bitgen_t *bitgen_state, double loc, double scale) { + return loc + scale * random_standard_normal(bitgen_state); } double random_exponential(bitgen_t *bitgen_state, double scale) { - return scale * standard_exponential_zig(bitgen_state); + return scale * random_standard_exponential_zig(bitgen_state); } double random_uniform(bitgen_t *bitgen_state, double lower, double range) { @@ -526,11 +399,11 @@ double random_uniform(bitgen_t *bitgen_state, double lower, double range) { } double random_gamma(bitgen_t *bitgen_state, double shape, double scale) { - return scale * random_standard_gamma_zig(bitgen_state, shape); + return scale * random_standard_gamma(bitgen_state, shape); } -float random_gamma_float(bitgen_t *bitgen_state, float shape, float scale) { - return scale * random_standard_gamma_zig_f(bitgen_state, shape); +float random_gamma_f(bitgen_t *bitgen_state, float shape, float scale) { + return scale * random_standard_gamma_f(bitgen_state, shape); } double random_beta(bitgen_t *bitgen_state, double a, double b) { @@ -562,14 +435,14 @@ double random_beta(bitgen_t *bitgen_state, double a, double b) { } } } else { - Ga = random_standard_gamma_zig(bitgen_state, a); - Gb = random_standard_gamma_zig(bitgen_state, b); + Ga = random_standard_gamma(bitgen_state, a); + Gb = random_standard_gamma(bitgen_state, b); return Ga / (Ga + Gb); } } double random_chisquare(bitgen_t *bitgen_state, double df) { - return 2.0 * random_standard_gamma_zig(bitgen_state, df / 2.0); + return 2.0 * random_standard_gamma(bitgen_state, df / 2.0); } double random_f(bitgen_t *bitgen_state, double dfnum, double dfden) { @@ -578,22 +451,22 @@ double 
random_f(bitgen_t *bitgen_state, double dfnum, double dfden) { } double random_standard_cauchy(bitgen_t *bitgen_state) { - return random_gauss_zig(bitgen_state) / random_gauss_zig(bitgen_state); + return random_standard_normal(bitgen_state) / random_standard_normal(bitgen_state); } double random_pareto(bitgen_t *bitgen_state, double a) { - return exp(standard_exponential_zig(bitgen_state) / a) - 1; + return exp(random_standard_exponential_zig(bitgen_state) / a) - 1; } double random_weibull(bitgen_t *bitgen_state, double a) { if (a == 0.0) { return 0.0; } - return pow(standard_exponential_zig(bitgen_state), 1. / a); + return pow(random_standard_exponential_zig(bitgen_state), 1. / a); } double random_power(bitgen_t *bitgen_state, double a) { - return pow(1 - exp(-standard_exponential_zig(bitgen_state)), 1. / a); + return pow(1 - exp(-random_standard_exponential_zig(bitgen_state)), 1. / a); } double random_laplace(bitgen_t *bitgen_state, double loc, double scale) { @@ -634,7 +507,7 @@ double random_logistic(bitgen_t *bitgen_state, double loc, double scale) { } double random_lognormal(bitgen_t *bitgen_state, double mean, double sigma) { - return exp(random_normal_zig(bitgen_state, mean, sigma)); + return exp(random_normal(bitgen_state, mean, sigma)); } double random_rayleigh(bitgen_t *bitgen_state, double mode) { @@ -644,8 +517,8 @@ double random_rayleigh(bitgen_t *bitgen_state, double mode) { double random_standard_t(bitgen_t *bitgen_state, double df) { double num, denom; - num = random_gauss_zig(bitgen_state); - denom = random_standard_gamma_zig(bitgen_state, df / 2); + num = random_standard_normal(bitgen_state); + denom = random_standard_gamma(bitgen_state, df / 2); return sqrt(df / 2) * num / sqrt(denom); } @@ -699,7 +572,7 @@ static RAND_INT_TYPE random_poisson_ptrs(bitgen_t *bitgen_state, double lam) { /* log(V) == log(0.0) ok here */ /* if U==0.0 so that us==0.0, log is ok since always returns */ if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= - (-lam + k * loglam - loggam(k + 1))) { + (-lam + k * loglam - random_loggam(k + 1))) { return k; } } @@ -901,8 +774,8 @@ RAND_INT_TYPE random_binomial_inversion(bitgen_t *bitgen_state, RAND_INT_TYPE n, return X; } -RAND_INT_TYPE random_binomial(bitgen_t *bitgen_state, double p, RAND_INT_TYPE n, - binomial_t *binomial) { +int64_t random_binomial(bitgen_t *bitgen_state, double p, int64_t n, + binomial_t *binomial) { double q; if ((n == 0LL) || (p == 0.0f)) @@ -934,7 +807,7 @@ double random_noncentral_chisquare(bitgen_t *bitgen_state, double df, } if (1 < df) { const double Chi2 = random_chisquare(bitgen_state, df - 1); - const double n = random_gauss_zig(bitgen_state) + sqrt(nonc); + const double n = random_standard_normal(bitgen_state) + sqrt(nonc); return Chi2 + n * n; } else { const RAND_INT_TYPE i = random_poisson(bitgen_state, nonc / 2.0); @@ -953,7 +826,7 @@ double random_wald(bitgen_t *bitgen_state, double mean, double scale) { double mu_2l; mu_2l = mean / (2 * scale); - Y = random_gauss_zig(bitgen_state); + Y = random_standard_normal(bitgen_state); Y = mean * Y * Y; X = mean + mu_2l * (Y - sqrt(4 * scale * Y + Y * Y)); U = next_double(bitgen_state); @@ -1092,8 +965,8 @@ RAND_INT_TYPE random_zipf(bitgen_t *bitgen_state, double a) { while (1) { double T, U, V, X; - U = 1.0 - random_double(bitgen_state); - V = random_double(bitgen_state); + U = 1.0 - next_double(bitgen_state); + V = next_double(bitgen_state); X = floor(pow(U, -1.0 / am1)); /* * The real result may be above what can be represented in a signed @@ -1478,7 +1351,7 @@ 
uint64_t random_bounded_uint64(bitgen_t *bitgen_state, uint64_t off, uint64_t rng, uint64_t mask, bool use_masked) { if (rng == 0) { return off; - } else if (rng < 0xFFFFFFFFUL) { + } else if (rng <= 0xFFFFFFFFUL) { /* Call 32-bit generator if range in 32-bit. */ if (use_masked) { return off + buffered_bounded_masked_uint32(bitgen_state, rng, mask, NULL, @@ -1592,7 +1465,7 @@ void random_bounded_uint64_fill(bitgen_t *bitgen_state, uint64_t off, for (i = 0; i < cnt; i++) { out[i] = off; } - } else if (rng < 0xFFFFFFFFUL) { + } else if (rng <= 0xFFFFFFFFUL) { uint32_t buf = 0; int bcnt = 0; diff --git a/numpy/random/src/distributions/random_hypergeometric.c b/numpy/random/src/distributions/random_hypergeometric.c index 59a3a4b9b..da5ea9c68 100644 --- a/numpy/random/src/distributions/random_hypergeometric.c +++ b/numpy/random/src/distributions/random_hypergeometric.c @@ -1,6 +1,6 @@ -#include <stdint.h> -#include "distributions.h" +#include "include/distributions.h" #include "logfactorial.h" +#include <stdint.h> /* * Generate a sample from the hypergeometric distribution. @@ -188,8 +188,8 @@ static int64_t hypergeometric_hrua(bitgen_t *bitgen_state, while (1) { double U, V, X, T; double gp; - U = random_double(bitgen_state); - V = random_double(bitgen_state); // "U star" in Stadlober (1989) + U = next_double(bitgen_state); + V = next_double(bitgen_state); // "U star" in Stadlober (1989) X = a + h*(V - 0.5) / U; // fast rejection: diff --git a/numpy/random/src/distributions/random_mvhg_count.c b/numpy/random/src/distributions/random_mvhg_count.c new file mode 100644 index 000000000..9c0cc045d --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_count.c @@ -0,0 +1,131 @@ +#include <stdint.h> +#include <stdlib.h> +#include <stdbool.h> + +#include "include/distributions.h" + +/* + * random_mvhg_count + * + * Draw variates from the multivariate hypergeometric distribution-- + * the "count" algorithm. + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * The "count" algorithm for drawing one variate is roughly equivalent to the + * following numpy code: + * + * choices = np.repeat(np.arange(len(colors)), colors) + * selection = np.random.choice(choices, nsample, replace=False) + * variate = np.bincount(selection, minlength=len(colors)) + * + * This function uses a temporary array with length sum(colors). 
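To make the "count" method described in the comment above concrete, here is a minimal NumPy sketch of drawing one variate. It is only a rough Python analogue of the C routine random_mvhg_count that follows, not the implementation itself; the name mvhg_count_sketch and the use of numpy.random.default_rng are choices made for this illustration, and argument validation (colors non-negative, 0 <= nsample <= sum(colors)) is assumed to have been done by the caller, as in the C code.

import numpy as np

def mvhg_count_sketch(colors, nsample, seed=None):
    # One variate via the "count" method: label every item in the urn by its
    # color index, draw nsample items without replacement, then tally colors.
    rng = np.random.default_rng(seed)
    colors = np.asarray(colors)
    # e.g. colors = [3, 2, 5] gives choices = [0 0 0 1 1 2 2 2 2 2]
    choices = np.repeat(np.arange(len(colors)), colors)
    selection = rng.choice(choices, nsample, replace=False)
    return np.bincount(selection, minlength=len(colors))

# Example: mvhg_count_sketch([3, 2, 5], 4) might return array([1, 1, 2]);
# the entries always sum to nsample and never exceed the corresponding color.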
+ * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product total * sizeof(size_t) does not exceed SIZE_MAX + * * the product num_variates * num_colors does not overflow + */ + +int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + size_t *choices; + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. + return 0; + } + + choices = malloc(total * (sizeof *choices)); + if (choices == NULL) { + return -1; + } + + /* + * If colors contains, for example, [3 2 5], then choices + * will contain [0 0 0 1 1 2 2 2 2 2]. + */ + for (size_t i = 0, k = 0; i < num_colors; ++i) { + for (int64_t j = 0; j < colors[i]; ++j) { + choices[k] = i; + ++k; + } + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + /* + * Fisher-Yates shuffle, but only loop through the first + * `nsample` entries of `choices`. After the loop, + * choices[:nsample] contains a random sample from + * the full array. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + size_t tmp, k; + // Note: nsample is not greater than total, so there is no danger + // of integer underflow in `(size_t) total - j - 1`. + k = j + (size_t) random_interval(bitgen_state, + (size_t) total - j - 1); + tmp = choices[k]; + choices[k] = choices[j]; + choices[j] = tmp; + } + /* + * Count the number of occurrences of each value in choices[:nsample]. + * The result, stored in variates[i:i+num_colors], is the sample from + * the multivariate hypergeometric distribution. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + variates[i + choices[j]] += 1; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } + + free(choices); + + return 0; +} diff --git a/numpy/random/src/distributions/random_mvhg_marginals.c b/numpy/random/src/distributions/random_mvhg_marginals.c new file mode 100644 index 000000000..301a4acad --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_marginals.c @@ -0,0 +1,138 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> +#include <math.h> + +#include "include/distributions.h" +#include "logfactorial.h" + + +/* + * random_mvhg_marginals + * + * Draw samples from the multivariate hypergeometric distribution-- + * the "marginals" algorithm. + * + * This version generates the sample by iteratively calling + * hypergeometric() (the univariate hypergeometric distribution). + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. The function assumes + * num_colors > 0. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value.
+ * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * Here's an example that demonstrates the idea of this algorithm. + * + * Suppose the urn contains red, green, blue and yellow marbles. + * Let nred be the number of red marbles, and define the quantities for + * the other colors similarly. The total number of marbles is + * + * total = nred + ngreen + nblue + nyellow. + * + * To generate a sample using rk_hypergeometric: + * + * red_sample = hypergeometric(ngood=nred, nbad=total - nred, + * nsample=nsample) + * + * This gives us the number of red marbles in the sample. The number of + * marbles in the sample that are *not* red is nsample - red_sample. + * To figure out the distribution of those marbles, we again use + * rk_hypergeometric: + * + * green_sample = hypergeometric(ngood=ngreen, + * nbad=total - nred - ngreen, + * nsample=nsample - red_sample) + * + * Similarly, + * + * blue_sample = hypergeometric( + * ngood=nblue, + * nbad=total - nred - ngreen - nblue, + * nsample=nsample - red_sample - green_sample) + * + * Finally, + * + * yellow_sample = total - (red_sample + green_sample + blue_sample). + * + * The above sequence of steps is implemented as a loop for an arbitrary + * number of colors in the innermost loop in the code below. `remaining` + * is the value passed to `nbad`; it is `total - colors[0]` in the first + * call to random_hypergeometric(), and then decreases by `colors[j]` in + * each iteration. `num_to_sample` is the `nsample` argument. It + * starts at this function's `nsample` input, and is decreased by the + * result of the call to random_hypergeometric() in each iteration. + * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product num_variates * num_colors does not overflow + */ + +void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. 
+ return; + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + int64_t num_to_sample = nsample; + int64_t remaining = total; + for (size_t j = 0; (num_to_sample > 0) && (j + 1 < num_colors); ++j) { + int64_t r; + remaining -= colors[j]; + r = random_hypergeometric(bitgen_state, + colors[j], remaining, num_to_sample); + variates[i + j] = r; + num_to_sample -= r; + } + + if (num_to_sample > 0) { + variates[i + num_colors - 1] = num_to_sample; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } +} diff --git a/numpy/random/src/entropy/entropy.c b/numpy/random/src/entropy/entropy.c deleted file mode 100644 index eaca37a9c..000000000 --- a/numpy/random/src/entropy/entropy.c +++ /dev/null @@ -1,114 +0,0 @@ -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "entropy.h" -#ifdef _WIN32 -/* Windows */ -#include <sys/timeb.h> -#include <time.h> -#include <windows.h> - -#include <wincrypt.h> -#else -/* Unix */ -#include <sys/time.h> -#include <time.h> -#include <unistd.h> -#include <fcntl.h> -#endif - -bool entropy_getbytes(void *dest, size_t size) { -#ifndef _WIN32 - - int fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) - return false; - ssize_t sz = read(fd, dest, size); - if ((sz < 0) || ((size_t)sz < size)) - return false; - return close(fd) == 0; - -#else - - HCRYPTPROV hCryptProv; - BOOL done; - - if (!CryptAcquireContext(&hCryptProv, NULL, NULL, PROV_RSA_FULL, - CRYPT_VERIFYCONTEXT) || - !hCryptProv) { - return true; - } - done = CryptGenRandom(hCryptProv, (DWORD)size, (unsigned char *)dest); - CryptReleaseContext(hCryptProv, 0); - if (!done) { - return false; - } - - return true; -#endif -} - -/* Thomas Wang 32/64 bits integer hash function */ -uint32_t entropy_hash_32(uint32_t key) { - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; -} - -uint64_t entropy_hash_64(uint64_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - -uint32_t entropy_randombytes(void) { - -#ifndef _WIN32 - struct timeval tv; - gettimeofday(&tv, NULL); - return entropy_hash_32(getpid()) ^ entropy_hash_32(tv.tv_sec) ^ - entropy_hash_32(tv.tv_usec) ^ entropy_hash_32(clock()); -#else - uint32_t out = 0; - int64_t counter; - struct _timeb tv; - _ftime_s(&tv); - out = entropy_hash_32(GetCurrentProcessId()) ^ - entropy_hash_32((uint32_t)tv.time) ^ entropy_hash_32(tv.millitm) ^ - entropy_hash_32(clock()); - if (QueryPerformanceCounter((LARGE_INTEGER *)&counter) != 0) - out ^= entropy_hash_32((uint32_t)(counter & 0xffffffff)); - return out; -#endif -} - -bool entropy_fallback_getbytes(void *dest, size_t size) { - int hashes = (int)size; - uint32_t *hash = malloc(hashes * sizeof(uint32_t)); - int i; - for (i = 0; i < hashes; i++) { - hash[i] = entropy_randombytes(); - } - memcpy(dest, (void *)hash, size); - free(hash); - return true; -} - -void entropy_fill(void *dest, size_t size) { - bool success; - success = entropy_getbytes(dest, size); - if (!success) { - entropy_fallback_getbytes(dest, size); - } -} diff --git 
a/numpy/random/src/entropy/entropy.h b/numpy/random/src/entropy/entropy.h deleted file mode 100644 index f00caf61d..000000000 --- a/numpy/random/src/entropy/entropy.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _RANDOMDGEN__ENTROPY_H_ -#define _RANDOMDGEN__ENTROPY_H_ - -#include <stddef.h> -#include <stdbool.h> -#include <stdint.h> - -extern void entropy_fill(void *dest, size_t size); - -extern bool entropy_getbytes(void *dest, size_t size); - -extern bool entropy_fallback_getbytes(void *dest, size_t size); - -#endif diff --git a/numpy/random/src/legacy/legacy-distributions.c b/numpy/random/src/legacy/legacy-distributions.c index 4741a0352..fd067fe8d 100644 --- a/numpy/random/src/legacy/legacy-distributions.c +++ b/numpy/random/src/legacy/legacy-distributions.c @@ -1,4 +1,4 @@ -#include "legacy-distributions.h" +#include "include/legacy-distributions.h" static NPY_INLINE double legacy_double(aug_bitgen_t *aug_state) { @@ -215,6 +215,37 @@ double legacy_exponential(aug_bitgen_t *aug_state, double scale) { } +static RAND_INT_TYPE legacy_random_binomial_original(bitgen_t *bitgen_state, + double p, + RAND_INT_TYPE n, + binomial_t *binomial) { + double q; + + if (p <= 0.5) { + if (p * n <= 30.0) { + return random_binomial_inversion(bitgen_state, n, p, binomial); + } else { + return random_binomial_btpe(bitgen_state, n, p, binomial); + } + } else { + q = 1.0 - p; + if (q * n <= 30.0) { + return n - random_binomial_inversion(bitgen_state, n, q, binomial); + } else { + return n - random_binomial_btpe(bitgen_state, n, q, binomial); + } + } +} + + +int64_t legacy_random_binomial(bitgen_t *bitgen_state, double p, + int64_t n, binomial_t *binomial) { + return (int64_t) legacy_random_binomial_original(bitgen_state, p, + (RAND_INT_TYPE) n, + binomial); +} + + static RAND_INT_TYPE random_hypergeometric_hyp(bitgen_t *bitgen_state, RAND_INT_TYPE good, RAND_INT_TYPE bad, @@ -263,8 +294,8 @@ static RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state, d7 = sqrt((double)(popsize - m) * sample * d4 * d5 / (popsize - 1) + 0.5); d8 = D1 * d7 + D2; d9 = (RAND_INT_TYPE)floor((double)(m + 1) * (mingoodbad + 1) / (popsize + 2)); - d10 = (loggam(d9 + 1) + loggam(mingoodbad - d9 + 1) + loggam(m - d9 + 1) + - loggam(maxgoodbad - m + d9 + 1)); + d10 = (random_loggam(d9 + 1) + random_loggam(mingoodbad - d9 + 1) + + random_loggam(m - d9 + 1) + random_loggam(maxgoodbad - m + d9 + 1)); d11 = MIN(MIN(m, mingoodbad) + 1.0, floor(d6 + 16 * d7)); /* 16 for 16-decimal-digit precision in D1 and D2 */ @@ -278,8 +309,8 @@ static RAND_INT_TYPE random_hypergeometric_hrua(bitgen_t *bitgen_state, continue; Z = (RAND_INT_TYPE)floor(W); - T = d10 - (loggam(Z + 1) + loggam(mingoodbad - Z + 1) + loggam(m - Z + 1) + - loggam(maxgoodbad - m + Z + 1)); + T = d10 - (random_loggam(Z + 1) + random_loggam(mingoodbad - Z + 1) + + random_loggam(m - Z + 1) + random_loggam(maxgoodbad - m + Z + 1)); /* fast acceptance: */ if ((X * (4.0 - X) - 3.0) <= T) diff --git a/numpy/random/src/philox/philox.h b/numpy/random/src/philox/philox.h index 309d89eae..c72424a97 100644 --- a/numpy/random/src/philox/philox.h +++ b/numpy/random/src/philox/philox.h @@ -1,8 +1,8 @@ #ifndef _RANDOMDGEN__PHILOX_H_ #define _RANDOMDGEN__PHILOX_H_ -#include <inttypes.h> #include "numpy/npy_common.h" +#include <inttypes.h> #define PHILOX_BUFFER_SIZE 4L diff --git a/numpy/random/src/sfc64/sfc64.h b/numpy/random/src/sfc64/sfc64.h index 6674ae69c..75c4118d3 100644 --- a/numpy/random/src/sfc64/sfc64.h +++ b/numpy/random/src/sfc64/sfc64.h @@ -1,11 +1,11 @@ #ifndef 
_RANDOMDGEN__SFC64_H_ #define _RANDOMDGEN__SFC64_H_ +#include "numpy/npy_common.h" #include <inttypes.h> #ifdef _WIN32 #include <stdlib.h> #endif -#include "numpy/npy_common.h" typedef struct s_sfc64_state { uint64_t s[4]; diff --git a/numpy/random/tests/test_direct.py b/numpy/random/tests/test_direct.py index 0f57c4bd4..34d7bd278 100644 --- a/numpy/random/tests/test_direct.py +++ b/numpy/random/tests/test_direct.py @@ -10,7 +10,7 @@ from numpy.random import ( Generator, MT19937, PCG64, Philox, RandomState, SeedSequence, SFC64, default_rng ) -from numpy.random.common import interface +from numpy.random._common import interface try: import cffi # noqa: F401 @@ -120,7 +120,7 @@ def gauss_from_uint(x, n, bits): return gauss[:n] def test_seedsequence(): - from numpy.random.bit_generator import (ISeedSequence, + from numpy.random._bit_generator import (ISeedSequence, ISpawnableSeedSequence, SeedlessSeedSequence) diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py index 853d86fba..526275dda 100644 --- a/numpy/random/tests/test_generator_mt19937.py +++ b/numpy/random/tests/test_generator_mt19937.py @@ -4,7 +4,7 @@ import pytest import numpy as np from numpy.testing import ( - assert_, assert_raises, assert_equal, + assert_, assert_raises, assert_equal, assert_allclose, assert_warns, assert_no_warnings, assert_array_equal, assert_array_almost_equal, suppress_warnings) @@ -115,6 +115,140 @@ class TestMultinomial(object): assert_array_equal(non_contig, contig) +class TestMultivariateHypergeometric(object): + + def setup(self): + self.seed = 8675309 + + def test_argument_validation(self): + # Error cases... + + # `colors` must be a 1-d sequence + assert_raises(ValueError, random.multivariate_hypergeometric, + 10, 4) + + # Negative nsample + assert_raises(ValueError, random.multivariate_hypergeometric, + [2, 3, 4], -1) + + # Negative color + assert_raises(ValueError, random.multivariate_hypergeometric, + [-1, 2, 3], 2) + + # nsample exceeds sum(colors) + assert_raises(ValueError, random.multivariate_hypergeometric, + [2, 3, 4], 10) + + # nsample exceeds sum(colors) (edge case of empty colors) + assert_raises(ValueError, random.multivariate_hypergeometric, + [], 1) + + # Validation errors associated with very large values in colors. + assert_raises(ValueError, random.multivariate_hypergeometric, + [999999999, 101], 5, 1, 'marginals') + + int64_info = np.iinfo(np.int64) + max_int64 = int64_info.max + max_int64_index = max_int64 // int64_info.dtype.itemsize + assert_raises(ValueError, random.multivariate_hypergeometric, + [max_int64_index - 100, 101], 5, 1, 'count') + + @pytest.mark.parametrize('method', ['count', 'marginals']) + def test_edge_cases(self, method): + # Set the seed, but in fact, all the results in this test are + # deterministic, so we don't really need this. 
+ random = Generator(MT19937(self.seed)) + + x = random.multivariate_hypergeometric([0, 0, 0], 0, method=method) + assert_array_equal(x, [0, 0, 0]) + + x = random.multivariate_hypergeometric([], 0, method=method) + assert_array_equal(x, []) + + x = random.multivariate_hypergeometric([], 0, size=1, method=method) + assert_array_equal(x, np.empty((1, 0), dtype=np.int64)) + + x = random.multivariate_hypergeometric([1, 2, 3], 0, method=method) + assert_array_equal(x, [0, 0, 0]) + + x = random.multivariate_hypergeometric([9, 0, 0], 3, method=method) + assert_array_equal(x, [3, 0, 0]) + + colors = [1, 1, 0, 1, 1] + x = random.multivariate_hypergeometric(colors, sum(colors), + method=method) + assert_array_equal(x, colors) + + x = random.multivariate_hypergeometric([3, 4, 5], 12, size=3, + method=method) + assert_array_equal(x, [[3, 4, 5]]*3) + + # Cases for nsample: + # nsample < 10 + # 10 <= nsample < colors.sum()/2 + # colors.sum()/2 < nsample < colors.sum() - 10 + # colors.sum() - 10 < nsample < colors.sum() + @pytest.mark.parametrize('nsample', [8, 25, 45, 55]) + @pytest.mark.parametrize('method', ['count', 'marginals']) + @pytest.mark.parametrize('size', [5, (2, 3), 150000]) + def test_typical_cases(self, nsample, method, size): + random = Generator(MT19937(self.seed)) + + colors = np.array([10, 5, 20, 25]) + sample = random.multivariate_hypergeometric(colors, nsample, size, + method=method) + if isinstance(size, int): + expected_shape = (size,) + colors.shape + else: + expected_shape = size + colors.shape + assert_equal(sample.shape, expected_shape) + assert_((sample >= 0).all()) + assert_((sample <= colors).all()) + assert_array_equal(sample.sum(axis=-1), + np.full(size, fill_value=nsample, dtype=int)) + if isinstance(size, int) and size >= 100000: + # This sample is large enough to compare its mean to + # the expected values. 
+            assert_allclose(sample.mean(axis=0),
+                            nsample * colors / colors.sum(),
+                            rtol=1e-3, atol=0.005)
+
+    def test_repeatability1(self):
+        random = Generator(MT19937(self.seed))
+        sample = random.multivariate_hypergeometric([3, 4, 5], 5, size=5,
+                                                    method='count')
+        expected = np.array([[2, 1, 2],
+                             [2, 1, 2],
+                             [1, 1, 3],
+                             [2, 0, 3],
+                             [2, 1, 2]])
+        assert_array_equal(sample, expected)
+
+    def test_repeatability2(self):
+        random = Generator(MT19937(self.seed))
+        sample = random.multivariate_hypergeometric([20, 30, 50], 50,
+                                                    size=5,
+                                                    method='marginals')
+        expected = np.array([[ 9, 17, 24],
+                             [ 7, 13, 30],
+                             [ 9, 15, 26],
+                             [ 9, 17, 24],
+                             [12, 14, 24]])
+        assert_array_equal(sample, expected)
+
+    def test_repeatability3(self):
+        random = Generator(MT19937(self.seed))
+        sample = random.multivariate_hypergeometric([20, 30, 50], 12,
+                                                    size=5,
+                                                    method='marginals')
+        expected = np.array([[2, 3, 7],
+                             [5, 3, 4],
+                             [2, 5, 5],
+                             [5, 3, 4],
+                             [1, 5, 6]])
+        assert_array_equal(sample, expected)
+
+
 class TestSetState(object):
     def setup(self):
         self.seed = 1234567890
@@ -732,6 +866,29 @@ class TestRandomDist(object):
         desired = conv([4, 1, 9, 8, 0, 5, 3, 6, 2, 7])
         assert_array_equal(actual, desired)
 
+    def test_shuffle_custom_axis(self):
+        random = Generator(MT19937(self.seed))
+        actual = np.arange(16).reshape((4, 4))
+        random.shuffle(actual, axis=1)
+        desired = np.array([[ 0,  3,  1,  2],
+                            [ 4,  7,  5,  6],
+                            [ 8, 11,  9, 10],
+                            [12, 15, 13, 14]])
+        assert_array_equal(actual, desired)
+        random = Generator(MT19937(self.seed))
+        actual = np.arange(16).reshape((4, 4))
+        random.shuffle(actual, axis=-1)
+        assert_array_equal(actual, desired)
+
+    def test_shuffle_axis_nonsquare(self):
+        y1 = np.arange(20).reshape(2, 10)
+        y2 = y1.copy()
+        random = Generator(MT19937(self.seed))
+        random.shuffle(y1, axis=1)
+        random = Generator(MT19937(self.seed))
+        random.shuffle(y2.T)
+        assert_array_equal(y1, y2)
+
     def test_shuffle_masked(self):
         # gh-3263
         a = np.ma.masked_values(np.reshape(range(20), (5, 4)) % 3 - 1, -1)
@@ -746,6 +903,16 @@ class TestRandomDist(object):
         assert_equal(
             sorted(b.data[~b.mask]), sorted(b_orig.data[~b_orig.mask]))
 
+    def test_shuffle_exceptions(self):
+        random = Generator(MT19937(self.seed))
+        arr = np.arange(10)
+        assert_raises(np.AxisError, random.shuffle, arr, 1)
+        arr = np.arange(9).reshape((3, 3))
+        assert_raises(np.AxisError, random.shuffle, arr, 3)
+        assert_raises(TypeError, random.shuffle, arr, slice(1, 2, None))
+        arr = [[1, 2, 3], [4, 5, 6]]
+        assert_raises(NotImplementedError, random.shuffle, arr, 1)
+
     def test_permutation(self):
         random = Generator(MT19937(self.seed))
         alist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
@@ -771,6 +938,27 @@ class TestRandomDist(object):
         actual = random.permutation(integer_val)
         assert_array_equal(actual, desired)
 
+    def test_permutation_custom_axis(self):
+        a = np.arange(16).reshape((4, 4))
+        desired = np.array([[ 0,  3,  1,  2],
+                            [ 4,  7,  5,  6],
+                            [ 8, 11,  9, 10],
+                            [12, 15, 13, 14]])
+        random = Generator(MT19937(self.seed))
+        actual = random.permutation(a, axis=1)
+        assert_array_equal(actual, desired)
+        random = Generator(MT19937(self.seed))
+        actual = random.permutation(a, axis=-1)
+        assert_array_equal(actual, desired)
+
+    def test_permutation_exceptions(self):
+        random = Generator(MT19937(self.seed))
+        arr = np.arange(10)
+        assert_raises(np.AxisError, random.permutation, arr, 1)
+        arr = np.arange(9).reshape((3, 3))
+        assert_raises(np.AxisError, random.permutation, arr, 3)
+        assert_raises(TypeError, random.permutation, arr, slice(1, 2, None))
+
     def test_beta(self):
         random = Generator(MT19937(self.seed))
         actual = random.beta(.1, .9, size=(3, 2))
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index a0edc5c23..5131f1839 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -11,7 +11,8 @@ from numpy.testing import (
     suppress_warnings
     )
 
-from numpy.random import MT19937, PCG64, mtrand as random
+from numpy.random import MT19937, PCG64
+from numpy import random
 
 INT_FUNCS = {'binomial': (100.0, 0.6),
              'geometric': (.5,),
diff --git a/numpy/random/tests/test_randomstate_regression.py b/numpy/random/tests/test_randomstate_regression.py
index 29870534a..bdc2214b6 100644
--- a/numpy/random/tests/test_randomstate_regression.py
+++ b/numpy/random/tests/test_randomstate_regression.py
@@ -8,7 +8,7 @@ from numpy.testing import (
 from numpy.compat import long
 import numpy as np
 
-from numpy.random import mtrand as random
+from numpy import random
 
 
 class TestRegression(object):
@@ -181,3 +181,30 @@ class TestRegression(object):
         assert c.dtype == np.dtype(int)
         c = np.random.choice(10, replace=False, size=2)
         assert c.dtype == np.dtype(int)
+
+    @pytest.mark.skipif(np.iinfo('l').max < 2**32,
+                        reason='Cannot test with 32-bit C long')
+    def test_randint_117(self):
+        # GH 14189
+        random.seed(0)
+        expected = np.array([2357136044, 2546248239, 3071714933, 3626093760,
+                             2588848963, 3684848379, 2340255427, 3638918503,
+                             1819583497, 2678185683], dtype='int64')
+        actual = random.randint(2**32, size=10)
+        assert_array_equal(actual, expected)
+
+    def test_p_zero_stream(self):
+        # Regression test for gh-14522. Ensure that future versions
+        # generate the same variates as version 1.16.
+        np.random.seed(12345)
+        assert_array_equal(random.binomial(1, [0, 0.25, 0.5, 0.75, 1]),
+                           [0, 0, 0, 1, 1])
+
+    def test_n_zero_stream(self):
+        # Regression test for gh-14522. Ensure that future versions
+        # generate the same variates as version 1.16.
+        np.random.seed(8675309)
+        expected = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                             [3, 4, 2, 3, 3, 1, 5, 3, 1, 3]])
+        assert_array_equal(random.binomial([[0], [10]], 0.25, size=(2, 10)),
+                           expected)
diff --git a/numpy/random/tests/test_seed_sequence.py b/numpy/random/tests/test_seed_sequence.py
index 8d6d604a2..fe23680ed 100644
--- a/numpy/random/tests/test_seed_sequence.py
+++ b/numpy/random/tests/test_seed_sequence.py
@@ -1,7 +1,7 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 
-from numpy.random.bit_generator import SeedSequence
+from numpy.random import SeedSequence
 
 
 def test_reference_data():
diff --git a/numpy/random/tests/test_smoke.py b/numpy/random/tests/test_smoke.py
index 84d261e5e..6e641b5f4 100644
--- a/numpy/random/tests/test_smoke.py
+++ b/numpy/random/tests/test_smoke.py
@@ -5,7 +5,7 @@ from functools import partial
 import numpy as np
 import pytest
 from numpy.testing import assert_equal, assert_, assert_array_equal
-from numpy.random import (Generator, MT19937, PCG64, Philox, SFC64, entropy)
+from numpy.random import (Generator, MT19937, PCG64, Philox, SFC64)
 
 
 @pytest.fixture(scope='module',
                 params=(np.bool, np.int8, np.int16, np.int32, np.int64,
@@ -806,23 +806,3 @@ class TestDefaultRNG(RNG):
             np.random.default_rng(-1)
         with pytest.raises(ValueError):
             np.random.default_rng([12345, -1])
-
-
-class TestEntropy(object):
-    def test_entropy(self):
-        e1 = entropy.random_entropy()
-        e2 = entropy.random_entropy()
-        assert_((e1 != e2))
-        e1 = entropy.random_entropy(10)
-        e2 = entropy.random_entropy(10)
-        assert_((e1 != e2).all())
-        e1 = entropy.random_entropy(10, source='system')
-        e2 = entropy.random_entropy(10, source='system')
-        assert_((e1 != e2).all())
-
-    def test_fallback(self):
-        e1 = entropy.random_entropy(source='fallback')
-        time.sleep(0.1)
-        e2 = entropy.random_entropy(source='fallback')
-        assert_((e1 != e2))
-
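
Not part of the patch: a brief, illustrative sketch of the Generator features the added tests above exercise, namely multivariate_hypergeometric with its 'count'/'marginals' methods and the axis argument to shuffle/permutation. The seed matches the test class; the specific draws below are only examples, not asserted reference values.

    # Minimal usage sketch, assuming a NumPy build that includes these APIs.
    import numpy as np
    from numpy.random import Generator, MT19937

    rng = Generator(MT19937(8675309))

    # Draw 12 items without replacement from an urn with 10, 5, 20 and 25
    # marbles of four colors; the per-color counts always sum to nsample.
    counts = rng.multivariate_hypergeometric([10, 5, 20, 25], 12,
                                             method='count')
    assert counts.sum() == 12

    # Shuffle the columns of a 2-d array in place, or get a column-permuted
    # copy; rows keep their own values, only the order along `axis` changes.
    a = np.arange(16).reshape(4, 4)
    rng.shuffle(a, axis=1)
    b = rng.permutation(np.arange(16).reshape(4, 4), axis=-1)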