# topic 16
# This will be remarkably similar to topic 15, so
# much so that it would be worth comparing the two scripts.
# First, set up the situation. We have a population
# with an unknown standard deviation.
#
source("../gnrnd5.R")
source("../gnrnd4.R")
gnrnd5( 179453199903, 873001418)
big_pop <- L1
# I do not know the standard deviation of big_pop
# Now, someone says that they believe that the
# mean of big_pop is 200. Thus our null hypothesis
# is that the true mean is 200. We want to see if that
# could be correct. We will get a sample of
# size 43 and we will look at the sample mean.
# We are willing to be wrong in telling the
# person that they are wrong 2.5% of the time!
########################
## The critical value approach. We know that
## where the standard deviation of big_pop is sigma then
## samples of size 43 will have a standard error
## of the mean be sigma/sqrt(43) and that those
## values will be normally distributed with the
## same mean as the population, a value we assume
## to be 200. But we do not know the value of sigma.
## So we can approximate the distribution of the
## mean of samples of size 43 with a Student's-t
## with 42 degrees of freedom.
## This is a two-tailed test because
## a sample mean that is either too low or too high
## would indicate that 200 is not the mean.
## Therefore, find the value, in a Student's-t with
## 42 degrees for freedom that has 0.025/2 as the
## P(Xt), where we use the sample
## standard deviation rather than the population
## standard deviation. Because of this we need to
## take the sample first in order to find the sample
## standard deviation.
# Get our sample
# the first time we do this let us get the
# same sample each time
gnrnd4(768734201, 200000001)
L1
# take those as the index values of our random sample
our_samp <- big_pop[ L1 ]
our_samp
# find the mean of our sample
x_bar <- mean( our_samp )
x_bar
# find the standard deviation of the sample
s_x <- sd( our_samp )
s_x
# Now we can get our critical values
# long way
low_t <- qt(0.025/2,42)
low_t
low_val <- 200 + low_t*s_x/sqrt(43) # + because t is < 0
low_val
high_t <- qt( 0.025/2,42, lower.tail=FALSE)
high_t # this was silly because we know it is -low_t
high_val <- 200 + high_t*s_x/sqrt(43)
high_val
## so now compare the mean of the sample
## to our critical values.
x_bar
## In this case the sample mean is greater than
## our critical high. Therefore reject, at the
## 0.025 level of significance, the
## null hypothesis that the true mean is 200
## in favor of the alternative hypothesis that
## the true mean is not 200.
#############
############# the attained significance approach
## how strange would it be to get a mean of
## 207.0581 for a sample of size 43 if the true
## mean is 200? Because we do not have the
## flexibility in pt() that we have in pnorm()
## we will have to normalize our value.
#
pt( (207.0581 - 200 )/(s_x/sqrt(43)), 42,
lower.tail=FALSE)
#
## but we would need to double that to account for
## values that extreme or more extreme on the low
## side
0.005047642*2
# is that probability less than our 2.5% ?
# Yes, therefore, reject the null hypothesis
# in favor of the alternative.
####################### Now use the function to
####################### do the same thing
source("../hypo_unknown.R")
hypoth_test_unknown( 200,0, 0.025,
43, x_bar, s_x )
#######################################
#######################################
# Now we want to repeat this process
# but each time we want a different sample
# of size 35
L1 <- sample( big_pop, 43 )
x_bar <- mean( L1 )
s_x <- sd( L1 )
hypoth_test_unknown( 200, 0, 0.025,
43, x_bar, s_x )
#### perform lines 114-118 again and again
### now, since we have the population let us peek
# at the true mean
mean( big_pop )
####### Try our samples again, but this time test
## the null hypothesis that the true mean
## is 207.0618, and do the test at the 0.05
## level of significance.
L1 <- sample( big_pop, 43 )
x_bar <- mean( L1 )
s_x <- sd( L1 )
hypoth_test_unknown( 207.0618, 0, 0.05,
43, x_bar, s_x )
#### perform lines 128-132 again and again,
#### and we should see a Type I error about
#### 5% of the time.
### we can actually do this 1000 times and see how
### times we reject the null hypothesis even
### though it is true.
L2 <- 1:1000
for( i in 1:1000) {
L1 <- sample( big_pop, 43 )
x_bar <- mean( L1 )
s_x <- sd( L1 )
answer <- hypoth_test_unknown( 207.0618, 0, 0.05,
43, x_bar, s_x )
L2[i] <- answer[14]
}
table( L2 )