# topic 22
# This will be remarkably similar to the work in
# topic 21, so much so that it is worth examining the
# two to see the changes.
# First, set up the situation. We have two populations
# with unknown standard deviations.
#
# Load the helper generators.  Each gnrnd*() call creates its
# output in the global variable L1 (see ../gnrnd5.R and
# ../gnrnd4.R), so we copy L1 immediately after every call.
source("../gnrnd5.R")
source("../gnrnd4.R")
# Generate the two populations from their key values.
gnrnd5( 156823499902, 933002363)
pop_one <- L1   # capture population one before L1 is overwritten
gnrnd5( 177843499903, 1143001693)
pop_two <- L1   # capture population two
################################################
### Confidence Interval ###
################################################
# We want to construct a 95% confidence interval
# for the difference of the means, mu_one - mu_two.
# To do this we will take random samples of both,
# the size of the sample from pop_one will be 38
# and the size of the sample from pop_two will
# be 43.
n1 <- 38   # size of the sample taken from pop_one
n2 <- 43   # size of the sample taken from pop_two
# for now, we will get the samples by generating the
# appropriate random index numbers for the two
# populations.
# get the sample and sample statistics from pop_one
# gnrnd4() creates the index values in the global L1
# (see ../gnrnd4.R).
gnrnd4( 982043701, 500000001)
L1
index_one <- L1
samp_one <- pop_one[ index_one ]
samp_one
x_bar_one <- mean( samp_one )
x_bar_one
# get the sample and sample statistics from pop_two
# Generate the random index values for pop_two into L1,
# pull the sample, and compute its mean.
gnrnd4( 338064201, 500000001)
L1
index_two <- L1
samp_two <- pop_two[ index_two ]
samp_two
x_bar_two <- mean( samp_two )
x_bar_two
#### Therefore we can find the difference of the sample means
# The point estimate of mu_one - mu_two.
diff_of_means <- x_bar_one - x_bar_two
diff_of_means
#### This is an instance where we do not know the standard deviation
#### of the underlying populations. Therefore, we will use
#### the Student's-t distribution as the model for the distribution
#### of the difference of the sample means. As part of
#### this we will find the standard error by using the
#### standard deviations of the samples:
#### sd_of_diff =
#### sqrt( sd(samp_one)^2/n1 + sd(samp_two)^2/n2) for samples
#### of size n1 and n2, respectively.
## we took our samples, (see above) so we can compute
##
# Standard error of the difference of the sample means, using the
# sample standard deviations in place of the unknown sigmas.
se_part_one <- sd( samp_one )^2 / n1
se_part_two <- sd( samp_two )^2 / n2
sd_of_diff <- sqrt( se_part_one + se_part_two )
sd_of_diff
# for a 95% confidence interval, we are missing 5% total,
# and we need to split this in half, a 2.5% for the
# top and bottom. However, since this is a Student's-t
# distribution we need to determine the degrees of
# freedom.
######################### quick and dirty
# The quick and dirty way is to use one less than the
# size of the smaller sample. In our case, that would
# be 38-1 or 37 degrees of freedom.
#
# get the t value, from a Student's-t with 37 degrees
# of freedom and with 2.5% above it
# Quick-and-dirty 95% CI: Student's-t with min(n1,n2)-1 = 37
# degrees of freedom and 2.5% in the upper tail.
# Use the name t_val (as the later sections do) rather than t,
# so that we do not mask the base transpose function t().
t_val <- qt( 0.025, 37, lower.tail=FALSE)
t_val
# then our margin of error is
MOE <- t_val*sd_of_diff
MOE
# so our confidence interval goes from
diff_of_means - MOE
# to
diff_of_means + MOE
# That pattern of commands would be the same for each
# instance of such a problem. We have a function
# that does this. It is called ci_2unknown().
# Load and run the project helper that performs all of the CI
# steps above in one call (see ../ci_2unknown.R); the last
# argument is the confidence level.
source("../ci_2unknown.R")
ci_2unknown(sd(samp_one), n1, x_bar_one,
sd(samp_two), n2, x_bar_two, 0.95)
#### we find the values that we computed above under the
#### Simp Low, Simp High Simp MOE DF Simp and t Simp
#### labels.
#### but we recall that there was a more complex way to
#### determine the degrees of freedom. We can do that here:
#compute the degrees of freedom
# Welch-Satterthwaite computed degrees of freedom, built from the
# two per-sample variance contributions.
contrib_one <- sd( samp_one )^2 / n1
contrib_two <- sd( samp_two )^2 / n2
contrib_one
contrib_two
df_numer <- ( contrib_one + contrib_two )^2
df_denom <- contrib_one^2/(n1-1) + contrib_two^2/(n2-1)
# full computed degrees of freedom
df <- df_numer / df_denom
df
# Now, with our new degrees of freedom we can repeat
# our earlier steps.
#
# get the t value, from a Student's-t with our computed df
# degrees of freedom and with 2.5% above it
# Repeat the CI, now with the computed (Welch) degrees of freedom.
# Use the name t_val (as the later sections do) rather than t,
# so that we do not mask the base transpose function t().
t_val <- qt( 0.025, df, lower.tail=FALSE)
t_val
# then our margin of error is
MOE <- t_val*sd_of_diff
MOE
# so our confidence interval goes from
diff_of_means - MOE
# to
diff_of_means + MOE
# If we run the function again we can compare
# these "full" or "computed" results with those
# from the function.
# Re-run the helper; compare its "full"/"computed" columns with
# the values we just computed by hand.
ci_2unknown(sd(samp_one), n1, x_bar_one,
sd(samp_two), n2, x_bar_two, 0.95)
################################################
### Hypothesis test ###
################################################
# Now, someone says that they believe that the
# difference of the means from pop_one and pop_two
# mu_1 - mu_2 = 0, that the means are the same.
# Thus our null hypothesis, H0, is that the difference is 0.
# Our alternative hypothesis, H1, is that the
# difference of the means is greater than 0, or
# that mu_one > mu_two.
# We want to see if we can reject H0 in favor of H1.
# We will get a sample from each
# population, n1=38 and n2=43.
# We are willing to be wrong in telling the
# person that they are wrong 2.5% of the time!
########################
## The critical value approach. We know that two
## samples of size 38 and 43, respectively, will have
## a standard error of the difference of the means
## be sqrt( sigma_one^2/38 + sigma_two^2/43 ) and that
## would be a normal distribution. Unfortunately,
## we do know neither sigma_one nor sigma_two. The
## best we can do is to use sd(samp_one) and
## sd( samp_two ) to approximate those and in that
## case we need to use a Student's-t distribution
## rather than a normal distribution.
# #### we computed this above, but let us do it again ###
# Recompute the standard error of the difference of the means
# from the two sample variances.
samp_vars <- c( sd(samp_one)^2, sd(samp_two)^2 )
sd_of_diff <- sqrt( sum( samp_vars / c(n1, n2) ) )
sd_of_diff
## Now to use the Student's-t we need to decide on
## the degrees of freedom.
######################### quick and dirty
# The quick and dirty way is to use one less than the
# size of the smaller sample. In our case, that would
# be 38-1 or 37 degrees of freedom.
#
# get the t value, from a Student's-t with 37 degrees
# of freedom and with 2.5% above it
t_val <- qt( 0.025, 37, lower.tail=FALSE)   # upper 2.5% cut-off
t_val
# Because the null hypothesis is that the difference is 0
# The critical high value will be 0+t_val*sd_of_diff or
t_val*sd_of_diff # our critical high value
# we compare the difference of the means to that critical value
diff_of_means
# that difference is not greater than our critical high. Therefore,
# we would not reject H0 in favor of H1.
######################
## The attained significance approach asks, if H0 is true,
## then how strange would it be to get a difference of the
## means at this value or at any stranger value. But
## that is just a pt command. However, because pt()
## does not allow us to specify the standard deviation we
## need to normalize our value
# Standardize the observed difference (the H0 mean is 0) so that
# we can use pt() directly.
test_val <- diff_of_means / sd_of_diff
test_val
# attained significance: P( T >= test_val ) with 37 degrees of freedom
pt( test_val, 37, lower.tail=FALSE)
#
# That attained significance is not less than our given
# level of significance, 0.025, so we would not reject H0
# in favor of H1.
#################################
## All of that was done using the "quick and dirty"
## determination of the degrees of freedom. We have
## the computed degrees of freedom which we did
## above but we will do it again here:
# Recompute the Welch-Satterthwaite degrees of freedom from the
# per-sample variance contributions.
piece_one <- sd( samp_one )^2 / n1
piece_two <- sd( samp_two )^2 / n2
piece_one
piece_two
denom_df <- piece_one^2/(n1-1) + piece_two^2/(n2-1)
# full computed degrees of freedom
df <- ( piece_one + piece_two )^2 / denom_df
df
# Now, with our new degrees of freedom we can repeat
# our earlier steps.
### the critical value approach
#
# get the t value, from a Student's-t with df degrees
# of freedom and with 2.5% above it
t_val <- qt( 0.025, df, lower.tail=FALSE)   # upper 2.5% cut-off, computed df
t_val
# Because the null hypothesis is that the difference is 0
# The critical high value will be 0+t_val*sd_of_diff or
t_val*sd_of_diff # our critical high value
# we compare the difference of the means to that critical value
diff_of_means
# that difference is not greater than our critical high. Therefore,
# we would not reject H0 in favor of H1.
######################
## The attained significance approach asks, if H0 is true,
## then how strange would it be to get a difference of the
## means at this value or at any stranger value. But
## that is just a pt command. However, because pt()
## does not allow us to specify the standard deviation we
## need to normalize our value
# Standardize the observed difference (the H0 mean is 0) so that
# we can use pt() directly, this time with the computed df.
test_val <- diff_of_means / sd_of_diff
test_val
# attained significance: P( T >= test_val ) with computed df
pt( test_val, df, lower.tail=FALSE)
#
# That attained significance is not less than our given
# level of significance, 0.025, so we would not reject H0
# in favor of H1.
#########################
## Of course, for any similar problem we would be doing
## the same steps, with the additional complication of
## having to pay attention to the alternative. If H1 is
## mu_one < mu_two, which is the same as mu_one-mu_two<0,
## then we would get a critical low value and our
## pnorm would use the lower tail. If, H1 is
## mu_one != mu_two, which is the same as
## mu_one-mu_two != 0, then we would need to get both a
## critical low and a critical high, splitting the area
## of significance between the low and high. In the
## same situation, once we get our attained value on
## one side, high or low, we would need to double it to
## account for values that are just as extreme or more
## extreme on the other side.
##
## Rather than think out all of these issues each time
## we run into this problem, we might as well capture
## all of this in a function, hypoth_2test_unknown().
## Let us load and run that for the values in our example.
# Load and run the project helper (see ../hypo_2unknown.R).
# The 1 presumably selects the H1: mu_one > mu_two alternative and
# 0.025 is the significance level -- confirm against the helper.
source( "../hypo_2unknown.R")
hypoth_2test_unknown( sd( samp_one), n1, x_bar_one,
sd( samp_two), n2, x_bar_two,
1, 0.025 )
######################################################
######################################################
## Of course, we really could have found the means
## of the two populations.
# pop_sd() is a project helper (see ../pop_sd.R), presumably the
# population (divide-by-N) standard deviation -- confirm there.
source("../pop_sd.R")
mu_one <- mean( pop_one )
mu_one
mu_two <- mean( pop_two )
mu_two
# The true difference of the population means, which the
# confidence intervals above were estimating.
real_diff <- mu_one - mu_two
real_diff
# and for that matter we could find the
# two population standard deviations
sigma_one <- pop_sd( pop_one )
sigma_one
sigma_two <- pop_sd( pop_two)
sigma_two
# we will run through 10000 cases where we
# get samples of size 38 and 43 and we will
# use those samples,specifically the sample
# means, to generate a 95% confidence interval
# for the difference of the population means.
# In the process we will see how many of those
# intervals contain the true difference.
L1 <- 1:10000
for( i in 1:10000 ) {
samp_one <- sample( pop_one, 38 )
samp_two <- sample( pop_two, 43 )
this_interval <-
ci_2unknown( sd(samp_one), 38, mean( samp_one),
sd(samp_two), 43, mean( samp_two),
0.95 )
if( this_interval[1]