author     Robert Kern <robert.kern@gmail.com>  2021-05-07 15:22:28 -0400
committer  Robert Kern <robert.kern@gmail.com>  2021-05-07 15:22:28 -0400
commit     9bb6a0c202e8c72b3b388ef35312ad8fda391e80 (patch)
tree       31ae41556c2e2dae8047d6c76650f37d06be1620
parent     56c5baff2364975d340b8b2edd41f8b8be1c0cc0 (diff)
download   numpy-9bb6a0c202e8c72b3b388ef35312ad8fda391e80.tar.gz
ENH: manually inline code for performance.
-rw-r--r--  numpy/random/src/pcg64/pcg64.h  |  43
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/numpy/random/src/pcg64/pcg64.h b/numpy/random/src/pcg64/pcg64.h
index 6691a18fc..90a83fd5e 100644
--- a/numpy/random/src/pcg64/pcg64.h
+++ b/numpy/random/src/pcg64/pcg64.h
@@ -229,17 +229,6 @@ static inline void pcg_cm_step_r(pcg_state_setseq_128 *rng) {
 #endif
 }
 
-static inline uint64_t pcg_output_cm_128_64(pcg128_t state) {
-  uint64_t hi = state.high;
-  uint64_t lo = state.low;
-
-  lo |= 1;
-  hi ^= hi >> 32;
-  hi *= 0xda942042e4dd58b5ULL;
-  hi ^= hi >> 48;
-  hi *= lo;
-  return hi;
-}
 
 static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstate, pcg128_t initseq) {
   rng->state = PCG_128BIT_CONSTANT(0ULL, 0ULL);
@@ -253,9 +242,35 @@ static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstat
 
 static inline uint64_t pcg_cm_random_r(pcg_state_setseq_128* rng)
 {
-  uint64_t ret = pcg_output_cm_128_64(rng->state);
-  pcg_cm_step_r(rng);
-  return ret;
+  /* Lots of manual inlining to help out certain compilers to generate
+   * performant code. */
+  uint64_t hi = rng->state.high;
+  uint64_t lo = rng->state.low;
+
+  /* Run the DXSM output function on the pre-iterated state. */
+  lo |= 1;
+  hi ^= hi >> 32;
+  hi *= 0xda942042e4dd58b5ULL;
+  hi ^= hi >> 48;
+  hi *= lo;
+
+  /* Run the CM step. */
+#if defined _WIN32 && _MSC_VER >= 1900 && _M_AMD64
+  uint64_t h1;
+  pcg128_t product;
+
+  /* Manually inline the multiplication and addition using intrinsics */
+  h1 = rng->state.high * PCG_CHEAP_MULTIPLIER_128;
+  product.low =
+      _umul128(rng->state.low, PCG_CHEAP_MULTIPLIER_128, &(product.high));
+  product.high += h1;
+  _addcarry_u64(_addcarry_u64(0, product.low, rng->inc.low, &(rng->state.low)),
+                product.high, rng->inc.high, &(rng->state.high));
+#else
+  rng->state = pcg128_add(pcg128_mult_64(rng->state, PCG_CHEAP_MULTIPLIER_128),
+                          rng->inc);
+#endif
+  return hi;
 }
 
 #else /* PCG_EMULATED_128BIT_MATH */
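
To see the combined routine outside the diff, here is a minimal, self-contained sketch of what the inlined pcg_cm_random_r computes: the DXSM output of the pre-iterated state, followed by the cheap-multiplier state update. It uses GCC/Clang's unsigned __int128 rather than the header's pcg128_t, and it assumes PCG_CHEAP_MULTIPLIER_128 equals the constant that appears in the output function; the seed and increment in main() are arbitrary demo values, not anything taken from NumPy.

    /* Standalone sketch (not the NumPy header): the DXSM output and
     * cheap-multiplier step that the patch inlines into pcg_cm_random_r,
     * written against GCC/Clang's unsigned __int128 for brevity. */
    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned __int128 u128;

    /* The diff's output function multiplies by this constant; the sketch
     * assumes PCG_CHEAP_MULTIPLIER_128 is the same value. */
    #define CHEAP_MULTIPLIER 0xda942042e4dd58b5ULL

    typedef struct {
        u128 state;
        u128 inc;   /* must be odd */
    } pcg_cm_sketch;

    /* DXSM output function: identical arithmetic to the block inlined above. */
    static uint64_t dxsm_output(u128 state) {
        uint64_t hi = (uint64_t)(state >> 64);
        uint64_t lo = (uint64_t)state;

        lo |= 1;
        hi ^= hi >> 32;
        hi *= CHEAP_MULTIPLIER;
        hi ^= hi >> 48;
        hi *= lo;
        return hi;
    }

    /* Output the pre-iterated state, then advance it by the cheap-multiplier
     * step (the #else branch of the diff, here with native 128-bit math). */
    static uint64_t sketch_random(pcg_cm_sketch *rng) {
        uint64_t out = dxsm_output(rng->state);
        rng->state = rng->state * CHEAP_MULTIPLIER + rng->inc;
        return out;
    }

    int main(void) {
        pcg_cm_sketch rng = {
            ((u128)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL, /* demo seed */
            ((u128)0ULL << 64) | 0x2bULL,                                /* demo odd increment */
        };
        for (int i = 0; i < 4; i++)
            printf("%016llx\n", (unsigned long long)sketch_random(&rng));
        return 0;
    }

Note that the output is computed from the state before the step, which is exactly why the patch reads rng->state.high/low into hi/lo first and only then advances the state.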
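
The MSVC branch of the patch hand-inlines the pcg128_mult_64 and pcg128_add helpers with the _umul128 and _addcarry_u64 intrinsics. The sketch below spells out the same state update, state = state * mult + inc, in portable C using only 64-bit operations, to make explicit what those intrinsics compute; the struct layout and function names here are invented for the example and are not the ones in pcg64.h.

    #include <stdint.h>
    #include <stdio.h>

    /* 128-bit value emulated as two 64-bit halves (illustrative layout). */
    typedef struct { uint64_t high, low; } u128_emul;

    /* 64x64 -> 128 multiply: the portable counterpart of MSVC's _umul128. */
    static void mul_64_64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
        uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
        uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;

        uint64_t p0 = a_lo * b_lo;   /* bits   0..63  */
        uint64_t p1 = a_lo * b_hi;   /* bits  32..95  */
        uint64_t p2 = a_hi * b_lo;   /* bits  32..95  */
        uint64_t p3 = a_hi * b_hi;   /* bits  64..127 */

        /* Carry out of the low 64 bits of the partial-product sum. */
        uint64_t carry = ((p0 >> 32) + (uint32_t)p1 + (uint32_t)p2) >> 32;
        *lo = p0 + (p1 << 32) + (p2 << 32);
        *hi = p3 + (p1 >> 32) + (p2 >> 32) + carry;
    }

    /* state = state * mult + inc with a 64-bit multiplier: the operation that
     * pcg128_mult_64 followed by pcg128_add performs, and that the MSVC branch
     * expresses with _umul128 and a chained _addcarry_u64. */
    static void cm_step_emulated(u128_emul *state, u128_emul inc, uint64_t mult) {
        uint64_t prod_hi, prod_lo;

        mul_64_64(state->low, mult, &prod_hi, &prod_lo);
        prod_hi += state->high * mult;   /* high half wraps modulo 2^64 */

        /* 128-bit add: propagate the carry out of the low word, as the nested
         * _addcarry_u64 calls do in the diff. */
        uint64_t new_lo = prod_lo + inc.low;
        uint64_t carry = (new_lo < prod_lo);
        state->low = new_lo;
        state->high = prod_hi + inc.high + carry;
    }

    int main(void) {
        /* Arbitrary demo values; a PCG increment must be odd. */
        u128_emul state = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        u128_emul inc = { 0x0ULL, 0x2bULL };
        cm_step_emulated(&state, inc, 0xda942042e4dd58b5ULL);
        printf("%016llx%016llx\n",
               (unsigned long long)state.high, (unsigned long long)state.low);
        return 0;
    }

Because the multiplier fits in 64 bits, only one full 64x64 multiply is needed; the high-half product state.high * mult can simply wrap, which is what both the intrinsic path and this portable sketch rely on.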