# topic 21
# First, set up the situation. We have two populations
# with a known standard deviations.
#
source("../pop_sd.R")
source("../gnrnd5.R")
source("../gnrnd4.R")
gnrnd5( 162353499902, 873001383)
pop_one <- L1
# I can tell you that I know the standard deviation
# of pop_one is 16.7252
sigma_one = 16.7252
gnrnd5( 123353499902, 1073001283)
pop_two <- L1
# I can tell you that I know the standard deviation
# of pop_two is 20.81073
sigma_two <- 20.81073
################################################
### Confidence Interval ###
################################################
# We want to construct a 95% confidence interval
# for the difference of the means, mu_one - mu_two.
# To do this we will take random samples of both,
# the size of the sample from pop_one will be 28
# and the size of the sample from pop_two will
# be 34.
n1 <- 28
n2 <- 34
# for now, we will get the samples by generating the
# appropriate random index numbers for the two
# populations.
# get the sample and sample statistics from pop_one
gnrnd4( 547892701, 500000001)
L1
index_one <- L1
samp_one <- pop_one[ index_one ]
samp_one
x_bar_one <- mean( samp_one )
x_bar_one
# get the sample and sample statistics from pop_two
gnrnd4( 763893301, 500000001)
L1
index_two <- L1
samp_two <- pop_two[ index_two ]
samp_two
x_bar_two <- mean( samp_two )
x_bar_two
#### Therefore we can find the difference of the sample means
diff_of_means <- x_bar_one - x_bar_two
diff_of_means
#### This is an instance where we know the standard deviation
#### of the underlying populations. Therefore, the distribution
#### of the difference of the sample means will be N( mu, sigma)
#### where mu is mu_one - mu_two, and sigma is
#### sqrt( sigma_one^2/n1 + sigma_two^2/n2) for samples
#### of size n1 and n2, respectively.
## we took our samples, (see above) so we can compute
##
sd_of_diff <- sqrt( sigma_one^2/n1 +sigma_two^2/n2 )
sd_of_diff
# for a 95% confidence interval, we are missing 5% total,
# and we need to split this in half, a 2.5% for the
# top and bottom.
#
# get the z value, from a N(0.1), with 2.5% above it
z <- qnorm( 0.025, lower.tail=FALSE)
z
# then our margin of error is
MOE <- z*sd_of_diff
MOE
# so our confidence interval goes from
diff_of_means - MOE
# to
diff_of_means + MOE
# That pattern of commands would be the same for each
# instance of such a problem. We have a function
# that does this. It is called ci_2known().
source("../ci_2known.R")
ci_2known(sigma_one, 28, x_bar_one,
sigma_two, 34, x_bar_two, 0.95)
################################################
### Hypothesis test ###
################################################
# Now, someone says that they believe that the
# difference of the means from pop_one and pop_two
# mu_1 - mu_2 = 0, that the means are the same.
# Thus our null hypothesis, H0, is that the difference is 0.
# Our alternative hypothesis, H1, is that the
# difference of the the means is greater than 0, or
# that mu_one > mu_two.
# We want to see if we can reject H0 in favor of H1.
# We will get a sample from each
# population, n1=28 and n2=34.
# We are willing to be wrong in telling the
# person that they are wrong 2.5% of the time!
########################
## The critical value approach. We know that two
## samples of size 28 and 34, respectively, will have
## a standard error of the difference of the means
## be sqrt( sigma_one^2/28 + sigma_two^2/35 )
# #### we computed this above, but let us do it again ###
sd_of_diff <- sqrt( sigma_one^2/28 + sigma_two^2/34 )
sd_of_diff
## Furthermore, the difference of the means will have a
## Normal distribution.
z_val <- qnorm( 0.025, lower.tail=FALSE )
z_val
# Because the null hypothesis is that the difference is 0
# The critical high value will be 0+z_val*sd_of_diff or
z_val*sd_of_diff # our critical high value
# we compare the difference of the means to that critical value
diff_of_means
# that difference is greater than our critical high. Therefore,
# we would reject H0 in favor of H1.
######################
## The attained significance approach asks, if H0 is true,
## then how strange would it be to get a difference of the
## means at this value or at any stranger value. But
## that is just a pnorm command
pnorm( diff_of_means, mean=0, sd=sd_of_diff,
lower.tail=FALSE)
#
# That attained significance is less than our given
# level of significance, 0.025, so we would reject H0
# in favor of H1.
#########################
## Of course, for any similar problem we would be doing
## the same steps, with the additional complication of
## having to pay attention to the alternative. If H1 is
## mu_one < mu_two, which is the same as mu_one-mu_two<0,
## then we would get a critical low value and our
## pnorm would use the lower tail. If, H1 is
## mu_one != mu_two, which is the same as
## mu_one-mu_two != 0, then we would need to get both a
## critical low and a critical high, splitting the area
## of significance between the low and high. In the
## same situation, once we get our attained value on
## one side, high or low, we would need to double it to
## account for values that are just as extreme or more
## extreme on the other side.
##
## Rather than think out all of these issues each time
## we run into this problem, we might as well capture
## all of this in a function, hypoth_2test_known().
## LEt us load and run that for the values in our example.
source( "../hypo_2known.R")
hypoth_2test_known( sigma_one, n1, x_bar_one,
sigma_two, n2, x_bar_two,
1, 0.025 )
######################################################
######################################################
## Of course, we really could have found the means
## of the two populations.
mu_one <- mean( pop_one )
mu_one
mu_two <- mean( pop_two )
mu_two
real_diff <- mu_one - mu_two
real_diff
# we will run through 10000 cases where we
# get samples of size 28 and 34 and we will
# use those samples,specifically the sample
# means, to generate a 95% confidence interval
# for the difference of the population means.
# In the process we will see how many of those
# intervals contain the true difference.
L1 <- 1:10000
for( i in 1:10000 ) {
samp_one <- sample( pop_one, 28 )
samp_two <- sample( pop_two, 34 )
this_interval <-
ci_2known( sigma_one, 28, mean( samp_one),
sigma_two, 34, mean( samp_two),
0.95 )
if( this_interval[1]