diff options
Diffstat (limited to 'numpy/random/_bit_generator.pyx')
-rw-r--r-- | numpy/random/_bit_generator.pyx | 626 |
1 files changed, 626 insertions, 0 deletions
diff --git a/numpy/random/_bit_generator.pyx b/numpy/random/_bit_generator.pyx new file mode 100644 index 000000000..21d21e6bb --- /dev/null +++ b/numpy/random/_bit_generator.pyx @@ -0,0 +1,626 @@ +""" +BitGenerator base class and SeedSequence used to seed the BitGenerators. + +SeedSequence is derived from Melissa E. O'Neill's C++11 `std::seed_seq` +implementation, as it has a lot of nice properties that we want. + +https://gist.github.com/imneme/540829265469e673d045 +http://www.pcg-random.org/posts/developing-a-seed_seq-alternative.html + +The MIT License (MIT) + +Copyright (c) 2015 Melissa E. O'Neill +Copyright (c) 2019 NumPy Developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import abc +import sys +from itertools import cycle +import re + +try: + from secrets import randbits +except ImportError: + # secrets unavailable on python 3.5 and before + from random import SystemRandom + randbits = SystemRandom().getrandbits + +try: + from threading import Lock +except ImportError: + from dummy_threading import Lock + +from cpython.pycapsule cimport PyCapsule_New + +import numpy as np +cimport numpy as np + +from ._common cimport (random_raw, benchmark, prepare_ctypes, prepare_cffi) + +__all__ = ['SeedSequence', 'BitGenerator'] + +np.import_array() + +DECIMAL_RE = re.compile(r'[0-9]+') + +cdef uint32_t DEFAULT_POOL_SIZE = 4 # Appears also in docstring for pool_size +cdef uint32_t INIT_A = 0x43b0d7e5 +cdef uint32_t MULT_A = 0x931e8875 +cdef uint32_t INIT_B = 0x8b51f9dd +cdef uint32_t MULT_B = 0x58f38ded +cdef uint32_t MIX_MULT_L = 0xca01f9dd +cdef uint32_t MIX_MULT_R = 0x4973f715 +cdef uint32_t XSHIFT = np.dtype(np.uint32).itemsize * 8 // 2 +cdef uint32_t MASK32 = 0xFFFFFFFF + +def _int_to_uint32_array(n): + arr = [] + if n < 0: + raise ValueError("expected non-negative integer") + if n == 0: + arr.append(np.uint32(n)) + if isinstance(n, np.unsignedinteger): + # Cannot do n & MASK32, convert to python int + n = int(n) + while n > 0: + arr.append(np.uint32(n & MASK32)) + n //= (2**32) + return np.array(arr, dtype=np.uint32) + +def _coerce_to_uint32_array(x): + """ Coerce an input to a uint32 array. + + If a `uint32` array, pass it through directly. + If a non-negative integer, then break it up into `uint32` words, lowest + bits first. + If a string starting with "0x", then interpret as a hex integer, as above. + If a string of decimal digits, interpret as a decimal integer, as above. + If a sequence of ints or strings, interpret each element as above and + concatenate. + + Note that the handling of `int64` or `uint64` arrays are not just + straightforward views as `uint32` arrays. If an element is small enough to + fit into a `uint32`, then it will only take up one `uint32` element in the + output. This is to make sure that the interpretation of a sequence of + integers is the same regardless of numpy's default integer type, which + differs on different platforms. + + Parameters + ---------- + x : int, str, sequence of int or str + + Returns + ------- + seed_array : uint32 array + + Examples + -------- + >>> import numpy as np + >>> from numpy.random._bit_generator import _coerce_to_uint32_array + >>> _coerce_to_uint32_array(12345) + array([12345], dtype=uint32) + >>> _coerce_to_uint32_array('12345') + array([12345], dtype=uint32) + >>> _coerce_to_uint32_array('0x12345') + array([74565], dtype=uint32) + >>> _coerce_to_uint32_array([12345, '67890']) + array([12345, 67890], dtype=uint32) + >>> _coerce_to_uint32_array(np.array([12345, 67890], dtype=np.uint32)) + array([12345, 67890], dtype=uint32) + >>> _coerce_to_uint32_array(np.array([12345, 67890], dtype=np.int64)) + array([12345, 67890], dtype=uint32) + >>> _coerce_to_uint32_array([12345, 0x10deadbeef, 67890, 0xdeadbeef]) + array([ 12345, 3735928559, 16, 67890, 3735928559], + dtype=uint32) + >>> _coerce_to_uint32_array(1234567890123456789012345678901234567890) + array([3460238034, 2898026390, 3235640248, 2697535605, 3], + dtype=uint32) + """ + if isinstance(x, np.ndarray) and x.dtype == np.dtype(np.uint32): + return x.copy() + elif isinstance(x, str): + if x.startswith('0x'): + x = int(x, base=16) + elif DECIMAL_RE.match(x): + x = int(x) + else: + raise ValueError("unrecognized seed string") + if isinstance(x, (int, np.integer)): + return _int_to_uint32_array(x) + elif isinstance(x, (float, np.inexact)): + raise TypeError('seed must be integer') + else: + if len(x) == 0: + return np.array([], dtype=np.uint32) + # Should be a sequence of interpretable-as-ints. Convert each one to + # a uint32 array and concatenate. + subseqs = [_coerce_to_uint32_array(v) for v in x] + return np.concatenate(subseqs) + + +cdef uint32_t hashmix(uint32_t value, uint32_t * hash_const): + # We are modifying the multiplier as we go along, so it is input-output + value ^= hash_const[0] + hash_const[0] *= MULT_A + value *= hash_const[0] + value ^= value >> XSHIFT + return value + +cdef uint32_t mix(uint32_t x, uint32_t y): + cdef uint32_t result = (MIX_MULT_L * x - MIX_MULT_R * y) + result ^= result >> XSHIFT + return result + + +class ISeedSequence(abc.ABC): + """ + Abstract base class for seed sequences. + + ``BitGenerator`` implementations should treat any object that adheres to + this interface as a seed sequence. + + See Also + -------- + SeedSequence, SeedlessSeedSequence + """ + + @abc.abstractmethod + def generate_state(self, n_words, dtype=np.uint32): + """ + generate_state(n_words, dtype=np.uint32) + + Return the requested number of words for PRNG seeding. + + A BitGenerator should call this method in its constructor with + an appropriate `n_words` parameter to properly seed itself. + + Parameters + ---------- + n_words : int + dtype : np.uint32 or np.uint64, optional + The size of each word. This should only be either `uint32` or + `uint64`. Strings (`'uint32'`, `'uint64'`) are fine. Note that + requesting `uint64` will draw twice as many bits as `uint32` for + the same `n_words`. This is a convenience for `BitGenerator`s that + express their states as `uint64` arrays. + + Returns + ------- + state : uint32 or uint64 array, shape=(n_words,) + """ + + +class ISpawnableSeedSequence(ISeedSequence): + """ + Abstract base class for seed sequences that can spawn. + """ + + @abc.abstractmethod + def spawn(self, n_children): + """ + spawn(n_children) + + Spawn a number of child `SeedSequence` s by extending the + `spawn_key`. + + Parameters + ---------- + n_children : int + + Returns + ------- + seqs : list of `SeedSequence` s + """ + + +cdef class SeedlessSeedSequence(): + """ + A seed sequence for BitGenerators with no need for seed state. + + See Also + -------- + SeedSequence, ISeedSequence + """ + + def generate_state(self, n_words, dtype=np.uint32): + raise NotImplementedError('seedless SeedSequences cannot generate state') + + def spawn(self, n_children): + return [self] * n_children + + +# We cannot directly subclass a `cdef class` type from an `ABC` in Cython, so +# we must register it after the fact. +ISpawnableSeedSequence.register(SeedlessSeedSequence) + + +cdef class SeedSequence(): + """ + SeedSequence(entropy=None, *, spawn_key=(), pool_size=4) + + SeedSequence mixes sources of entropy in a reproducible way to set the + initial state for independent and very probably non-overlapping + BitGenerators. + + Once the SeedSequence is instantiated, you can call the `generate_state` + method to get an appropriately sized seed. Calling `spawn(n) <spawn>` will + create ``n`` SeedSequences that can be used to seed independent + BitGenerators, i.e. for different threads. + + Parameters + ---------- + entropy : {None, int, sequence[int]}, optional + The entropy for creating a `SeedSequence`. + spawn_key : {(), sequence[int]}, optional + A third source of entropy, used internally when calling + `SeedSequence.spawn` + pool_size : {int}, optional + Size of the pooled entropy to store. Default is 4 to give a 128-bit + entropy pool. 8 (for 256 bits) is another reasonable choice if working + with larger PRNGs, but there is very little to be gained by selecting + another value. + n_children_spawned : {int}, optional + The number of children already spawned. Only pass this if + reconstructing a `SeedSequence` from a serialized form. + + Notes + ----- + + Best practice for achieving reproducible bit streams is to use + the default ``None`` for the initial entropy, and then use + `SeedSequence.entropy` to log/pickle the `entropy` for reproducibility: + + >>> sq1 = np.random.SeedSequence() + >>> sq1.entropy + 243799254704924441050048792905230269161 # random + >>> sq2 = np.random.SeedSequence(sq1.entropy) + >>> np.all(sq1.generate_state(10) == sq2.generate_state(10)) + True + """ + + def __init__(self, entropy=None, *, spawn_key=(), + pool_size=DEFAULT_POOL_SIZE, n_children_spawned=0): + if pool_size < DEFAULT_POOL_SIZE: + raise ValueError("The size of the entropy pool should be at least " + f"{DEFAULT_POOL_SIZE}") + if entropy is None: + entropy = randbits(pool_size * 32) + elif not isinstance(entropy, (int, np.integer, list, tuple, range, + np.ndarray)): + raise TypeError('SeedSequence expects int or sequence of ints for ' + 'entropy not {}'.format(entropy)) + self.entropy = entropy + self.spawn_key = tuple(spawn_key) + self.pool_size = pool_size + self.n_children_spawned = n_children_spawned + + self.pool = np.zeros(pool_size, dtype=np.uint32) + self.mix_entropy(self.pool, self.get_assembled_entropy()) + + def __repr__(self): + lines = [ + f'{type(self).__name__}(', + f' entropy={self.entropy!r},', + ] + # Omit some entries if they are left as the defaults in order to + # simplify things. + if self.spawn_key: + lines.append(f' spawn_key={self.spawn_key!r},') + if self.pool_size != DEFAULT_POOL_SIZE: + lines.append(f' pool_size={self.pool_size!r},') + if self.n_children_spawned != 0: + lines.append(f' n_children_spawned={self.n_children_spawned!r},') + lines.append(')') + text = '\n'.join(lines) + return text + + @property + def state(self): + return {k:getattr(self, k) for k in + ['entropy', 'spawn_key', 'pool_size', + 'n_children_spawned'] + if getattr(self, k) is not None} + + cdef mix_entropy(self, np.ndarray[np.npy_uint32, ndim=1] mixer, + np.ndarray[np.npy_uint32, ndim=1] entropy_array): + """ Mix in the given entropy to mixer. + + Parameters + ---------- + mixer : 1D uint32 array, modified in-place + entropy_array : 1D uint32 array + """ + cdef uint32_t hash_const[1] + hash_const[0] = INIT_A + + # Add in the entropy up to the pool size. + for i in range(len(mixer)): + if i < len(entropy_array): + mixer[i] = hashmix(entropy_array[i], hash_const) + else: + # Our pool size is bigger than our entropy, so just keep + # running the hash out. + mixer[i] = hashmix(0, hash_const) + + # Mix all bits together so late bits can affect earlier bits. + for i_src in range(len(mixer)): + for i_dst in range(len(mixer)): + if i_src != i_dst: + mixer[i_dst] = mix(mixer[i_dst], + hashmix(mixer[i_src], hash_const)) + + # Add any remaining entropy, mixing each new entropy word with each + # pool word. + for i_src in range(len(mixer), len(entropy_array)): + for i_dst in range(len(mixer)): + mixer[i_dst] = mix(mixer[i_dst], + hashmix(entropy_array[i_src], hash_const)) + + cdef get_assembled_entropy(self): + """ Convert and assemble all entropy sources into a uniform uint32 + array. + + Returns + ------- + entropy_array : 1D uint32 array + """ + # Convert run-entropy, program-entropy, and the spawn key into uint32 + # arrays and concatenate them. + + # We MUST have at least some run-entropy. The others are optional. + assert self.entropy is not None + run_entropy = _coerce_to_uint32_array(self.entropy) + spawn_entropy = _coerce_to_uint32_array(self.spawn_key) + entropy_array = np.concatenate([run_entropy, spawn_entropy]) + return entropy_array + + @np.errstate(over='ignore') + def generate_state(self, n_words, dtype=np.uint32): + """ + generate_state(n_words, dtype=np.uint32) + + Return the requested number of words for PRNG seeding. + + A BitGenerator should call this method in its constructor with + an appropriate `n_words` parameter to properly seed itself. + + Parameters + ---------- + n_words : int + dtype : np.uint32 or np.uint64, optional + The size of each word. This should only be either `uint32` or + `uint64`. Strings (`'uint32'`, `'uint64'`) are fine. Note that + requesting `uint64` will draw twice as many bits as `uint32` for + the same `n_words`. This is a convenience for `BitGenerator`s that + express their states as `uint64` arrays. + + Returns + ------- + state : uint32 or uint64 array, shape=(n_words,) + """ + cdef uint32_t hash_const = INIT_B + cdef uint32_t data_val + + out_dtype = np.dtype(dtype) + if out_dtype == np.dtype(np.uint32): + pass + elif out_dtype == np.dtype(np.uint64): + n_words *= 2 + else: + raise ValueError("only support uint32 or uint64") + state = np.zeros(n_words, dtype=np.uint32) + src_cycle = cycle(self.pool) + for i_dst in range(n_words): + data_val = next(src_cycle) + data_val ^= hash_const + hash_const *= MULT_B + data_val *= hash_const + data_val ^= data_val >> XSHIFT + state[i_dst] = data_val + if out_dtype == np.dtype(np.uint64): + # For consistency across different endiannesses, view first as + # little-endian then convert the values to the native endianness. + state = state.astype('<u4').view('<u8').astype(np.uint64) + return state + + def spawn(self, n_children): + """ + spawn(n_children) + + Spawn a number of child `SeedSequence` s by extending the + `spawn_key`. + + Parameters + ---------- + n_children : int + + Returns + ------- + seqs : list of `SeedSequence` s + """ + cdef uint32_t i + + seqs = [] + for i in range(self.n_children_spawned, + self.n_children_spawned + n_children): + seqs.append(type(self)( + self.entropy, + spawn_key=self.spawn_key + (i,), + pool_size=self.pool_size, + )) + self.n_children_spawned += n_children + return seqs + + +ISpawnableSeedSequence.register(SeedSequence) + + +cdef class BitGenerator(): + """ + BitGenerator(seed=None) + + Base Class for generic BitGenerators, which provide a stream + of random bits based on different algorithms. Must be overridden. + + Parameters + ---------- + seed : {None, int, array_like[ints], SeedSequence}, optional + A seed to initialize the `BitGenerator`. If None, then fresh, + unpredictable entropy will be pulled from the OS. If an ``int`` or + ``array_like[ints]`` is passed, then it will be passed to + ~`numpy.random.SeedSequence` to derive the initial `BitGenerator` state. + One may also pass in a `SeedSequence` instance. + + Attributes + ---------- + lock : threading.Lock + Lock instance that is shared so that the same BitGenerator can + be used in multiple Generators without corrupting the state. Code that + generates values from a bit generator should hold the bit generator's + lock. + + See Also + ------- + SeedSequence + """ + + def __init__(self, seed=None): + self.lock = Lock() + self._bitgen.state = <void *>0 + if type(self) is BitGenerator: + raise NotImplementedError('BitGenerator is a base class and cannot be instantized') + + self._ctypes = None + self._cffi = None + + cdef const char *name = "BitGenerator" + self.capsule = PyCapsule_New(<void *>&self._bitgen, name, NULL) + if not isinstance(seed, ISeedSequence): + seed = SeedSequence(seed) + self._seed_seq = seed + + # Pickling support: + def __getstate__(self): + return self.state + + def __setstate__(self, state): + self.state = state + + def __reduce__(self): + from ._pickle import __bit_generator_ctor + return __bit_generator_ctor, (self.state['bit_generator'],), self.state + + @property + def state(self): + """ + Get or set the PRNG state + + The base BitGenerator.state must be overridden by a subclass + + Returns + ------- + state : dict + Dictionary containing the information required to describe the + state of the PRNG + """ + raise NotImplementedError('Not implemented in base BitGenerator') + + @state.setter + def state(self, value): + raise NotImplementedError('Not implemented in base BitGenerator') + + def random_raw(self, size=None, output=True): + """ + random_raw(self, size=None) + + Return randoms as generated by the underlying BitGenerator + + Parameters + ---------- + size : int or tuple of ints, optional + Output shape. If the given shape is, e.g., ``(m, n, k)``, then + ``m * n * k`` samples are drawn. Default is None, in which case a + single value is returned. + output : bool, optional + Output values. Used for performance testing since the generated + values are not returned. + + Returns + ------- + out : uint or ndarray + Drawn samples. + + Notes + ----- + This method directly exposes the the raw underlying pseudo-random + number generator. All values are returned as unsigned 64-bit + values irrespective of the number of bits produced by the PRNG. + + See the class docstring for the number of bits returned. + """ + return random_raw(&self._bitgen, self.lock, size, output) + + def _benchmark(self, Py_ssize_t cnt, method=u'uint64'): + '''Used in tests''' + return benchmark(&self._bitgen, self.lock, cnt, method) + + @property + def ctypes(self): + """ + ctypes interface + + Returns + ------- + interface : namedtuple + Named tuple containing ctypes wrapper + + * state_address - Memory address of the state struct + * state - pointer to the state struct + * next_uint64 - function pointer to produce 64 bit integers + * next_uint32 - function pointer to produce 32 bit integers + * next_double - function pointer to produce doubles + * bitgen - pointer to the bit generator struct + """ + if self._ctypes is None: + self._ctypes = prepare_ctypes(&self._bitgen) + + return self._ctypes + + @property + def cffi(self): + """ + CFFI interface + + Returns + ------- + interface : namedtuple + Named tuple containing CFFI wrapper + + * state_address - Memory address of the state struct + * state - pointer to the state struct + * next_uint64 - function pointer to produce 64 bit integers + * next_uint32 - function pointer to produce 32 bit integers + * next_double - function pointer to produce doubles + * bitgen - pointer to the bit generator struct + """ + if self._cffi is None: + self._cffi = prepare_cffi(&self._bitgen) + return self._cffi |