Diffstat (limited to 'numpy')
-rw-r--r--    numpy/random/_generator.pyx    42
-rw-r--r--    numpy/random/mtrand.pyx        42
2 files changed, 54 insertions, 30 deletions
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index a7d98e2ed..1903dce37 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -1732,33 +1732,45 @@ cdef class Generator:
... 7515, 8230, 8770])
Does their energy intake deviate systematically from the recommended
- value of 7725 kJ?
+ value of 7725 kJ? Our null hypothesis will be the absence of deviation,
+ and the alternative hypothesis will be the presence of an effect that
+ could be either positive or negative, hence making our test two-tailed.
+
+ Because we are estimating the mean and we have N=11 values in our sample,
+ we have N-1=10 degrees of freedom. We set our significance level to 5% and
+ compute the t statistic using the empirical mean and empirical standard
+ deviation of our intake. We use a ddof of 1 to base the computation of our
+ empirical standard deviation on an unbiased estimate of the variance (note:
+ the final estimate is not unbiased due to the concave nature of the square
+ root).
- We have 10 degrees of freedom, so is the sample mean within 95% of the
- recommended value?
-
- >>> s = np.random.default_rng().standard_t(10, size=100000)
>>> np.mean(intake)
6753.636363636364
>>> intake.std(ddof=1)
1142.1232221373727
+ >>> t = (np.mean(intake)-7725)/(intake.std(ddof=1)/np.sqrt(len(intake)))
+ >>> t
+ -2.8207540608310198
- Calculate the t statistic, setting the ddof parameter to the unbiased
- value so the divisor in the standard deviation will be degrees of
- freedom, N-1.
+ We draw 1000000 samples from Student's t distribution with the same 10
+ degrees of freedom, to serve as the reference distribution for our statistic.
- >>> t = (np.mean(intake)-7725)/(intake.std(ddof=1)/np.sqrt(len(intake)))
>>> import matplotlib.pyplot as plt
+ >>> s = np.random.default_rng().standard_t(10, size=1000000)
>>> h = plt.hist(s, bins=100, density=True)
- For a one-sided t-test, how far out in the distribution does the t
- statistic appear?
+ Does our t statistic land in one of the two critical regions located in
+ the tails of the distribution?
+
+ >>> np.sum(np.abs(t) < np.abs(s)) / float(len(s))
+ 0.018318  # random < 0.05, statistic is in critical region
- >>> np.sum(s<t) / float(len(s))
- 0.0090699999999999999 # random
+ The p-value for this two-tailed test is about 1.83%, which is lower
+ than the predetermined 5% significance threshold.
- So the p-value is about 0.009, which says the null hypothesis has a
- probability of about 99% of being true.
+ Therefore, the probability of observing values as extreme as our intake
+ conditional on the null hypothesis being true is too low, and we reject
+ the null hypothesis of no deviation.
"""
return cont(&random_standard_t, &self._bitgen, size, self.lock, 1,
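The added docstring text above walks through a two-tailed one-sample t-test. As a minimal, self-contained sketch of that procedure (the full 11-value intake sample is taken from the surrounding docstring context, of which only the continuation line is visible in this hunk), the same computation can be written as a plain script:

    import numpy as np

    # Daily energy intake sample (kJ) from the docstring example; 11 values,
    # hence N - 1 = 10 degrees of freedom.
    intake = np.array([5260., 5470, 5640, 6180, 6390, 6515, 6805,
                       7515, 7515, 8230, 8770])

    # t statistic; ddof=1 bases the empirical standard deviation on the
    # unbiased variance estimate, as described in the docstring.
    t = (np.mean(intake) - 7725) / (intake.std(ddof=1) / np.sqrt(len(intake)))

    # Monte Carlo two-tailed p-value: the fraction of draws from the t(10)
    # reference distribution that are at least as extreme as |t|.
    rng = np.random.default_rng()
    s = rng.standard_t(10, size=1_000_000)
    p = np.sum(np.abs(t) < np.abs(s)) / float(len(s))

    print(t)  # about -2.82
    print(p)  # about 0.018, below the 0.05 significance level

Comparing absolute values of both the statistic and the simulated draws is what makes the estimate two-tailed; the removed doctest only counted draws below t, which is the one-sided version.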
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index df8d7e380..683705a05 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -2147,33 +2147,45 @@ cdef class RandomState:
... 7515, 8230, 8770])
Does their energy intake deviate systematically from the recommended
- value of 7725 kJ?
+ value of 7725 kJ? Our null hypothesis will be the absence of deviation,
+ and the alternative hypothesis will be the presence of an effect that
+ could be either positive or negative, hence making our test two-tailed.
+
+ Because we are estimating the mean and we have N=11 values in our sample,
+ we have N-1=10 degrees of freedom. We set our significance level to 5% and
+ compute the t statistic using the empirical mean and empirical standard
+ deviation of our intake. We use a ddof of 1 to base the computation of our
+ empirical standard deviation on an unbiased estimate of the variance (note:
+ the final estimate is not unbiased due to the concave nature of the square
+ root).
- We have 10 degrees of freedom, so is the sample mean within 95% of the
- recommended value?
-
- >>> s = np.random.standard_t(10, size=100000)
>>> np.mean(intake)
6753.636363636364
>>> intake.std(ddof=1)
1142.1232221373727
+ >>> t = (np.mean(intake)-7725)/(intake.std(ddof=1)/np.sqrt(len(intake)))
+ >>> t
+ -2.8207540608310198
- Calculate the t statistic, setting the ddof parameter to the unbiased
- value so the divisor in the standard deviation will be degrees of
- freedom, N-1.
+ We draw 1000000 samples from Student's t distribution with the same 10
+ degrees of freedom, to serve as the reference distribution for our statistic.
- >>> t = (np.mean(intake)-7725)/(intake.std(ddof=1)/np.sqrt(len(intake)))
>>> import matplotlib.pyplot as plt
+ >>> s = np.random.standard_t(10, size=1000000)
>>> h = plt.hist(s, bins=100, density=True)
- For a one-sided t-test, how far out in the distribution does the t
- statistic appear?
+ Does our t statistic land in one of the two critical regions located in
+ the tails of the distribution?
+
+ >>> np.sum(np.abs(t) < np.abs(s)) / float(len(s))
+ 0.018318  # random < 0.05, statistic is in critical region
- >>> np.sum(s<t) / float(len(s))
- 0.0090699999999999999 #random
+ The p-value for this two-tailed test is about 1.83%, which is lower
+ than the predetermined 5% significance threshold.
- So the p-value is about 0.009, which says the null hypothesis has a
- probability of about 99% of being true.
+ Therefore, the probability of observing values as extreme as our intake
+ conditional on the null hypothesis being true is too low, and we reject
+ the null hypothesis of no deviation.
"""
return cont(&legacy_standard_t, &self._aug_state, size, self.lock, 1,
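The mtrand.pyx hunk mirrors the Generator docstring for the legacy RandomState interface. As a hedged cross-check, the Monte Carlo p-value can be compared with the closed-form Student's t tail probability; scipy is not used anywhere in this patch and appears below only as an illustrative assumption:

    import numpy as np
    from scipy import stats  # not part of this patch; comparison only

    # Same 11-value intake sample as in the docstring example.
    intake = np.array([5260., 5470, 5640, 6180, 6390, 6515, 6805,
                       7515, 7515, 8230, 8770])
    t = (np.mean(intake) - 7725) / (intake.std(ddof=1) / np.sqrt(len(intake)))

    # Monte Carlo estimate with the legacy global np.random interface.
    s = np.random.standard_t(10, size=1_000_000)
    p_mc = np.sum(np.abs(t) < np.abs(s)) / float(len(s))

    # Analytic two-tailed p-value from the t(10) survival function.
    p_exact = 2 * stats.t.sf(np.abs(t), df=10)

    print(p_mc, p_exact)  # both about 0.018

The Monte Carlo estimate and the exact tail probability should agree closely, which is a quick way to confirm that the quoted doctest output of roughly 0.018 is plausible.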