-rw-r--r--  numpy/random/src/pcg64/pcg64.h | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/numpy/random/src/pcg64/pcg64.h b/numpy/random/src/pcg64/pcg64.h
index 6691a18fc..90a83fd5e 100644
--- a/numpy/random/src/pcg64/pcg64.h
+++ b/numpy/random/src/pcg64/pcg64.h
@@ -229,17 +229,6 @@ static inline void pcg_cm_step_r(pcg_state_setseq_128 *rng) {
 #endif
 }
 
-static inline uint64_t pcg_output_cm_128_64(pcg128_t state) {
-  uint64_t hi = state.high;
-  uint64_t lo = state.low;
-
-  lo |= 1;
-  hi ^= hi >> 32;
-  hi *= 0xda942042e4dd58b5ULL;
-  hi ^= hi >> 48;
-  hi *= lo;
-  return hi;
-}
 static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstate,
                                     pcg128_t initseq) {
   rng->state = PCG_128BIT_CONSTANT(0ULL, 0ULL);
@@ -253,9 +242,35 @@ static inline void pcg_cm_srandom_r(pcg_state_setseq_128 *rng, pcg128_t initstat
 }
 
 static inline uint64_t pcg_cm_random_r(pcg_state_setseq_128* rng) {
-  uint64_t ret = pcg_output_cm_128_64(rng->state);
-  pcg_cm_step_r(rng);
-  return ret;
+  /* Lots of manual inlining to help out certain compilers to generate
+   * performant code. */
+  uint64_t hi = rng->state.high;
+  uint64_t lo = rng->state.low;
+
+  /* Run the DXSM output function on the pre-iterated state. */
+  lo |= 1;
+  hi ^= hi >> 32;
+  hi *= 0xda942042e4dd58b5ULL;
+  hi ^= hi >> 48;
+  hi *= lo;
+
+  /* Run the CM step. */
+#if defined _WIN32 && _MSC_VER >= 1900 && _M_AMD64
+  uint64_t h1;
+  pcg128_t product;
+
+  /* Manually inline the multiplication and addition using intrinsics */
+  h1 = rng->state.high * PCG_CHEAP_MULTIPLIER_128;
+  product.low =
+      _umul128(rng->state.low, PCG_CHEAP_MULTIPLIER_128, &(product.high));
+  product.high += h1;
+  _addcarry_u64(_addcarry_u64(0, product.low, rng->inc.low, &(rng->state.low)),
+                product.high, rng->inc.high, &(rng->state.high));
+#else
+  rng->state = pcg128_add(pcg128_mult_64(rng->state, PCG_CHEAP_MULTIPLIER_128),
+                          rng->inc);
+#endif
+  return hi;
 }
 
 #else /* PCG_EMULATED_128BIT_MATH */
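For readers who don't want to unwind the manual inlining by hand, the following is a minimal standalone sketch of the DXSM ("double xorshift multiply") output step that the patch splices into pcg_cm_random_r. The multiplier constant and shift amounts are copied from the diff; the dxsm_output name, the main() driver, and the sample inputs are illustrative additions, not numpy code.

#include <stdint.h>
#include <stdio.h>

/* DXSM output step as inlined by the patch: xorshift-multiply the high
 * 64 bits of the 128-bit state, then multiply by the low 64 bits after
 * forcing them odd. */
static uint64_t dxsm_output(uint64_t hi, uint64_t lo) {
  lo |= 1;                      /* low half must be odd */
  hi ^= hi >> 32;               /* first xorshift */
  hi *= 0xda942042e4dd58b5ULL;  /* multiply by the DXSM constant */
  hi ^= hi >> 48;               /* second xorshift */
  hi *= lo;                     /* final multiply by the odd low half */
  return hi;
}

int main(void) {
  /* Arbitrary sample state words, not a real PCG64 state. */
  uint64_t out = dxsm_output(0x0123456789abcdefULL, 0xfedcba9876543210ULL);
  printf("%016llx\n", (unsigned long long)out);
  return 0;
}

Note that the patch, like the deleted pcg_output_cm_128_64 it replaces, applies this to the pre-iterated state and only afterwards advances it, so the returned value reflects the state before the CM step.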

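The Windows branch of the new CM step leans on the _umul128 and _addcarry_u64 intrinsics, while the #else branch defers to the pcg128_mult_64 and pcg128_add helpers. To make the arithmetic concrete, here is a portable sketch of the same update, state = state * k + inc (mod 2^128), in plain C. The u128 struct, the cm_step name, and the sample values in main() are illustrative; treating the multiplier as a 64-bit constant is an assumption implied by the diff's 64-bit h1 multiply.

#include <stdint.h>
#include <stdio.h>

/* 128-bit value as two 64-bit halves, mirroring the emulated pcg128_t
 * layout that the intrinsics path manipulates. */
typedef struct { uint64_t high; uint64_t low; } u128;

/* Portable equivalent of the intrinsics path: state = state * k + inc. */
static u128 cm_step(u128 state, u128 inc, uint64_t k) {
  /* High 64 bits of state.low * k, computed in 32-bit limbs; this is
   * what _umul128 returns through its out-parameter. */
  uint64_t x0 = state.low & 0xffffffffULL, x1 = state.low >> 32;
  uint64_t k0 = k & 0xffffffffULL, k1 = k >> 32;
  uint64_t t = x1 * k0 + ((x0 * k0) >> 32);
  uint64_t mid = (t & 0xffffffffULL) + x0 * k1;

  u128 product;
  product.low = state.low * k; /* low 64 bits wrap naturally */
  product.high = x1 * k1 + (t >> 32) + (mid >> 32);
  /* Only the low 64 bits of state.high * k survive mod 2^128; this is
   * the h1 term in the diff. */
  product.high += state.high * k;

  /* 128-bit add of the increment, propagating the carry out of the low
   * word, as the chained _addcarry_u64 calls do. */
  u128 next;
  next.low = product.low + inc.low;
  next.high = product.high + inc.high + (next.low < product.low);
  return next;
}

int main(void) {
  /* Arbitrary sample state and (odd) increment, not real PCG64 values;
   * the DXSM constant from the diff stands in for PCG_CHEAP_MULTIPLIER_128. */
  u128 state = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  u128 inc = {0ULL, 1ULL};
  u128 next = cm_step(state, inc, 0xda942042e4dd58b5ULL);
  printf("%016llx%016llx\n", (unsigned long long)next.high,
         (unsigned long long)next.low);
  return 0;
}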