-rw-r--r--  numpy/random/src/pcg64/pcg64.h | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/numpy/random/src/pcg64/pcg64.h b/numpy/random/src/pcg64/pcg64.h
index 6691a18fc..90a83fd5e 100644
--- a/numpy/random/src/pcg64/pcg64.h
+++ b/numpy/random/src/pcg64/pcg64.h
@@ -229,17 +229,6 @@ static inline void pcg_cm_step_r(pcg_state_setseq_128 *rng) {
 #endif
 }
 
-static inline uint64_t pcg_output_cm_128_64(pcg128_t state) {
-  uint64_t hi = state.high;
-  uint64_t lo = state.low;
-
-  lo |= 1;
-  hi ^= hi >> 32;
-  hi *= 0xda942042e4dd58b5ULL;
-  hi ^= hi >> 48;
-  hi *= lo;
-  return hi;
-}
 static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstate,
                                     pcg128_t initseq) {
   rng->state = PCG_128BIT_CONSTANT(0ULL, 0ULL);
@@ -253,9 +242,35 @@ static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstat
 }
 
 static inline uint64_t pcg_cm_random_r(pcg_state_setseq_128* rng) {
-  uint64_t ret = pcg_output_cm_128_64(rng->state);
-  pcg_cm_step_r(rng);
-  return ret;
+  /* Lots of manual inlining to help out certain compilers to generate
+   * performant code. */
+  uint64_t hi = rng->state.high;
+  uint64_t lo = rng->state.low;
+
+  /* Run the DXSM output function on the pre-iterated state. */
+  lo |= 1;
+  hi ^= hi >> 32;
+  hi *= 0xda942042e4dd58b5ULL;
+  hi ^= hi >> 48;
+  hi *= lo;
+
+  /* Run the CM step. */
+#if defined _WIN32 && _MSC_VER >= 1900 && _M_AMD64
+  uint64_t h1;
+  pcg128_t product;
+
+  /* Manually inline the multiplication and addition using intrinsics */
+  h1 = rng->state.high * PCG_CHEAP_MULTIPLIER_128;
+  product.low =
+      _umul128(rng->state.low, PCG_CHEAP_MULTIPLIER_128, &(product.high));
+  product.high += h1;
+  _addcarry_u64(_addcarry_u64(0, product.low, rng->inc.low, &(rng->state.low)),
+                product.high, rng->inc.high, &(rng->state.high));
+#else
+  rng->state = pcg128_add(pcg128_mult_64(rng->state, PCG_CHEAP_MULTIPLIER_128),
+                          rng->inc);
+#endif
+  return hi;
 }
 
 #else /* PCG_EMULATED_128BIT_MATH */
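For readers who don't want to unwind the manual inlining by hand, the following is a minimal standalone sketch of the DXSM ("double xorshift multiply") output step that the patch splices into pcg_cm_random_r. The multiplier constant and shift amounts are copied from the diff; the dxsm_output name, the main() driver, and the sample inputs are illustrative additions, not numpy code.

#include <stdint.h>
#include <stdio.h>

/* DXSM output step as inlined by the patch: xorshift-multiply the high
 * 64 bits of the 128-bit state, then multiply by the low 64 bits after
 * forcing them odd. */
static uint64_t dxsm_output(uint64_t hi, uint64_t lo) {
  lo |= 1;                      /* low half must be odd */
  hi ^= hi >> 32;               /* first xorshift */
  hi *= 0xda942042e4dd58b5ULL;  /* multiply by the DXSM constant */
  hi ^= hi >> 48;               /* second xorshift */
  hi *= lo;                     /* final multiply by the odd low half */
  return hi;
}

int main(void) {
  /* Arbitrary sample state words, not a real PCG64 state. */
  uint64_t out = dxsm_output(0x0123456789abcdefULL, 0xfedcba9876543210ULL);
  printf("%016llx\n", (unsigned long long)out);
  return 0;
}

Note that the patch, like the deleted pcg_output_cm_128_64 it replaces, applies this to the pre-iterated state and only afterwards advances it, so the returned value reflects the state before the CM step.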

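The Windows branch of the new CM step leans on the _umul128 and _addcarry_u64 intrinsics, while the #else branch defers to the pcg128_mult_64 and pcg128_add helpers. To make the arithmetic concrete, here is a portable sketch of the same update, state = state * k + inc (mod 2^128), in plain C. The u128 struct, the cm_step name, and the sample values in main() are illustrative; treating the multiplier as a 64-bit constant is an assumption implied by the diff's 64-bit h1 multiply.

#include <stdint.h>
#include <stdio.h>

/* 128-bit value as two 64-bit halves, mirroring the emulated pcg128_t
 * layout that the intrinsics path manipulates. */
typedef struct { uint64_t high; uint64_t low; } u128;

/* Portable equivalent of the intrinsics path: state = state * k + inc. */
static u128 cm_step(u128 state, u128 inc, uint64_t k) {
  /* High 64 bits of state.low * k, computed in 32-bit limbs; this is
   * what _umul128 returns through its out-parameter. */
  uint64_t x0 = state.low & 0xffffffffULL, x1 = state.low >> 32;
  uint64_t k0 = k & 0xffffffffULL, k1 = k >> 32;
  uint64_t t = x1 * k0 + ((x0 * k0) >> 32);
  uint64_t mid = (t & 0xffffffffULL) + x0 * k1;

  u128 product;
  product.low = state.low * k; /* low 64 bits wrap naturally */
  product.high = x1 * k1 + (t >> 32) + (mid >> 32);
  /* Only the low 64 bits of state.high * k survive mod 2^128; this is
   * the h1 term in the diff. */
  product.high += state.high * k;

  /* 128-bit add of the increment, propagating the carry out of the low
   * word, as the chained _addcarry_u64 calls do. */
  u128 next;
  next.low = product.low + inc.low;
  next.high = product.high + inc.high + (next.low < product.low);
  return next;
}

int main(void) {
  /* Arbitrary sample state and (odd) increment, not real PCG64 values;
   * the DXSM constant from the diff stands in for PCG_CHEAP_MULTIPLIER_128. */
  u128 state = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  u128 inc = {0ULL, 1ULL};
  u128 next = cm_step(state, inc, 0xda942042e4dd58b5ULL);
  printf("%016llx%016llx\n", (unsigned long long)next.high,
         (unsigned long long)next.low);
  return 0;
}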