diff options
author | Kevin Sheppard <kevin.k.sheppard@gmail.com> | 2019-05-29 12:54:55 +0100 |
---|---|---|
committer | Kevin Sheppard <kevin.k.sheppard@gmail.com> | 2019-05-31 01:09:34 +0800 |
commit | 3e01f891d8e207e0dfe351c1e3d082682435a89b (patch) | |
tree | 1e9e64fee36994e14945b2ecc4b0086eb8beb6b5 /numpy | |
parent | 9c44a2dc63d5e63b7d36c57a8bea84c3d117130e (diff) | |
download | numpy-3e01f891d8e207e0dfe351c1e3d082682435a89b.tar.gz |
PERF: Use intrinsics in Win64-PCG64
Use intrinsics to speed up PCG64 on Windows 64
Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/random/src/pcg64/pcg64.h | 32 |
1 file changed, 25 insertions, 7 deletions
diff --git a/numpy/random/src/pcg64/pcg64.h b/numpy/random/src/pcg64/pcg64.h index 0c263bb7b..20d64f8ab 100644 --- a/numpy/random/src/pcg64/pcg64.h +++ b/numpy/random/src/pcg64/pcg64.h @@ -72,6 +72,9 @@ typedef struct { uint64_t low; } pcg128_t; +#define PCG_DEFAULT_MULTIPLIER_HIGH 2549297995355413924ULL +#define PCG_DEFAULT_MULTIPLIER_LOW 4865540595714422341ULL + static inline pcg128_t PCG_128BIT_CONSTANT(uint64_t high, uint64_t low) { pcg128_t result; result.high = high; @@ -90,7 +93,7 @@ typedef struct { } pcg_state_setseq_128; #define PCG_DEFAULT_MULTIPLIER_128 \ - PCG_128BIT_CONSTANT(2549297995355413924ULL, 4865540595714422341ULL) + PCG_128BIT_CONSTANT(PCG_DEFAULT_MULTIPLIER_HIGH, PCG_DEFAULT_MULTIPLIER_LOW) #define PCG_DEFAULT_INCREMENT_128 \ PCG_128BIT_CONSTANT(6364136223846793005ULL, 1442695040888963407ULL) #define PCG_STATE_SETSEQ_128_INITIALIZER \ @@ -172,6 +175,27 @@ static inline void pcg_setseq_128_srandom_r(pcg_state_setseq_128 *rng, pcg_setseq_128_step_r(rng); } +static inline uint64_t +pcg_setseq_128_xsl_rr_64_random_r(pcg_state_setseq_128 *rng) { +#if defined _WIN32 && _MSC_VER >= 1900 && _M_AMD64 + uint64_t h1; + pcg128_t product; + + /* Manually inline the multiplication and addition using intrinsics */ + h1 = rng->state.high * PCG_DEFAULT_MULTIPLIER_LOW + + rng->state.low * PCG_DEFAULT_MULTIPLIER_HIGH; + product.low = + _umul128(rng->state.low, PCG_DEFAULT_MULTIPLIER_LOW, &(product.high)); + product.high += h1; + _addcarry_u64(_addcarry_u64(0, product.low, rng->inc.low, &(rng->state.low)), + product.high, rng->inc.high, &(rng->state.high)); + return _rotr64(rng->state.high ^ rng->state.low, rng->state.high >> 58u); +#else + pcg_setseq_128_step_r(rng); + return pcg_output_xsl_rr_128_64(rng->state); +#endif +} + #else /* PCG_EMULATED_128BIT_MATH */ static inline void pcg_setseq_128_step_r(pcg_state_setseq_128 *rng) { @@ -196,12 +220,6 @@ static inline void pcg_setseq_128_srandom_r(pcg_state_setseq_128 *rng, #endif /* PCG_EMULATED_128BIT_MATH */ 
static inline uint64_t -pcg_setseq_128_xsl_rr_64_random_r(pcg_state_setseq_128 *rng) { - pcg_setseq_128_step_r(rng); - return pcg_output_xsl_rr_128_64(rng->state); -} - -static inline uint64_t pcg_setseq_128_xsl_rr_64_boundedrand_r(pcg_state_setseq_128 *rng, uint64_t bound) { uint64_t threshold = -bound % bound; |