# topic 24
# This will be remarkably similar to the work in
# topics 21 and 22, so much so that it is worth examining the
# two to see the changes.
# First, set up the situation. We have two populations,
# each having a certain proportion if items with the
# characteristic "2"
#
source("../gnrnd5.R")
source("../gnrnd4.R")
gnrnd5( 56823499907, 475685)
pop_one <- L1
# look at the first 100 items in pop_one
head( pop_one, 100)
gnrnd5( 77843499907, 847655)
pop_two <- L1
# look at the first 100 items in pop_two
head( pop_two, 100)
################################################
### Confidence Interval ###
################################################
# We want to construct a 95% confidence interval
# for the difference of the proportion of 2's, p_one - p_two.
# To do this we will take random samples of both,
# the size of the sample from pop_one will be 94
# and the size of the sample from pop_two will
# be 87.
n1 <- 94
n2 <- 87
# for now, we will get the samples by generating the
# appropriate random index numbers for the two
# populations.
# get the sample and sample statistics from pop_one
gnrnd4( 982049301, 500000001)
L1
index_one <- L1
samp_one <- pop_one[ index_one ]
samp_one
# now find the number of 2's in that sample
freqs_one <- table( samp_one )
freqs_one
phat_one <- freqs_one[2]/n1
names( phat_one)<-"phat_one"
phat_one
# get the sample and sample statistics from pop_two
gnrnd4( 387068601, 500000001)
L1
index_two <- L1
samp_two <- pop_two[ index_two ]
samp_two
# now find the number of 2's in that sample
freqs_two <- table( samp_two )
freqs_two
phat_two <- freqs_two[2]/n2
names( phat_two)<-"phat_two"
phat_two
#### Therefore we can find the difference of the sample
# proportions
diff_of_props <- phat_one - phat_two
diff_of_props
#### We want to use the normal approximation for the distribution
#### of the difference of the sample proportions. To do this
#### we want to be sure that we are not sampling more than
#### 5% of the population and that we have at least 10 successes
#### (items with the characteristic) and 10 failures (items
#### without the characteristic) in our samples. We meet
#### both conditions.
#### The the standard deviation of the difference of the two
####sample proportions is
#
std_err <- sqrt( phat_one*(1-phat_one)/n1 +
phat_two*(1-phat_two)/n2)
std_err
#### for a 95% confidence interval we need the z-value that
#### has 2.5% of the area under the standard normal curve
#### above that value
#
z_val <- qnorm( 0.025, lower.tail=FALSE)
z_val
#
# so our 95% confidence interval is
low_end <- diff_of_props - z_val*std_err
low_end
# and
high_end <- diff_of_props + z_val*std_err
high_end
#
# That pattern of commands would be the same for each
# instance of such a problem. We have a function
# that does this. It is called ci_2popproportion().
source("../ci_2popproport.R")
ci_2popproportion( n1, freqs_one[2], n2, freqs_two[2], 0.95)
# Now, it turns out that we could actually find the
# proportion of 2's in each population and,
# therefore, find the true difference in
# the proportions
#
# find the number of 2's in the first population
pop_freqs_1 <- table( pop_one )
pop_freqs_1
proport_one <- pop_freqs_1[2] / 5000
proport_one
#
# find the number of 2's in the second population
pop_freqs_2 <- table( pop_two )
pop_freqs_2
proport_two <- pop_freqs_2[2] / 5000
proport_two
# so the true difference is
true_diff <- proport_one - proport_two
true_diff
# and that is indeed inside our 95% confidence
# interval.
#
# Of course, that was for just one set of samples.
# We will do this same thing for 10000 samples and
# see how many confidence intervals contain the
# true difference in proportions
L3 <- 1:10000
for (i in 1:10000) {
samp_one <- sample( pop_one, 94)
freqs_one <- table(samp_one)
samp_two <- sample( pop_two, 87)
freqs_two <- table( samp_two)
this_ci <- ci_2popproportion(94, freqs_one[2],
87, freqs_two[2], 0.95)
if( this_ci[1] <= true_diff &
true_diff <= this_ci[2] ) {
L3[i] = "hit"}
else {
L3[i] = "missed"}
}
# see how we did
table( L3 )
################################################
### Hypothesis test ###
################################################
# Now, someone says that they believe that the
# difference of the proportions of 2's
# from pop_one and pop_two,
# phat_1 - phat_2 = 0. That is, the proportions are the same.
# Thus our null hypothesis, H0, is that the difference
# is 0.
# Our alternative hypothesis, H1, is that the
# difference of the proportions is greater than 0, or
# that phat_one > phat_two.
# We want to see if we can reject H0 in favor of H1.
# We will get a sample from each
# population, n1=94 and n2=87.
# We are willing to be wrong in telling the
# person that they are wrong 2.5% of the time!
########################
## The critical value approach. We know that two
## samples of size 94 and 87, respectively.
## The distribution of the difference in the sample
## proportions will be normal, with the mean being
## the true difference in proportions in the
## two populations. Using our null hypothesis
## that difference should be 0. However, we need
## to find the standard deviation of the distribution
## of our difference in the sample proportions.
## To do this we will use our pooled proportion.
#
# Let us go back and take our original samples again
# get the sample and sample statistics from pop_one
gnrnd4( 982049301, 500000001)
L1
index_one <- L1
samp_one <- pop_one[ index_one ]
samp_one
# now find the number of 2's in that sample
freqs_one <- table( samp_one )
freqs_one
freqs_one[2]
# get the sample and sample statistics from pop_two
gnrnd4( 387068601, 500000001)
L1
index_two <- L1
samp_two <- pop_two[ index_two ]
samp_two
# now find the number of 2's in that sample
freqs_two <- table( samp_two )
freqs_two
freqs_two[2]
#
# then our pooled proportion is
pooled_prop <- ( freqs_one[2]+freqs_two[2])/(n1+n2)
pooled_prop
# therefore we can compute the desired standard error
std_err <- sqrt(pooled_prop*(1-pooled_prop)*(1/n1+1/n2))
std_err
#
# Our desired level of significance is 2.5%.
# Also, our alternative hypothesis was the one-sided
# test, greater than.
# Therefore, our critical high value will be our
# H0 + z_0.025*std_err
# Find our z_0.025
z_high <- qnorm( 0.025, lower.tail=FALSE )
z_high
# Since our H0 is 0 this means that our critical high
# value is
0 + z_high*std_err # the critical high value
#
######################
## The attained significance approach asks, if H0 is true,
## then how strange would it be to get a difference of the
## proportions at this value or at any stranger value. But
## that is just a pnorm command.
this_diff <- freqs_one[2]/n1 - freqs_two[2]/n2
this_diff
pnorm( this_diff, mean=0, sd=std_err, lower.tail=FALSE)
#
# That attained significance is not less than our given
# level of significance, 0.025, so we would not reject H0
# in favor of H1.
#########################
## Of course, for any similar problem we would be doing
## the same steps, with the additional complication of
## having to pay attention to the alternative.
##
## We might as well capture
## all of this in a function, hypoth_2test_prop.
## Let us load and run that for the values
## in our example.
source("../hypo_2popproport.R")
hypoth_2test_prop(freqs_one[2],n1,
freqs_two[2],n2, 1, 0.025)
######################################################
######################################################
######################################
### Now let us go back to our hypothesis test. We
### know that the true difference of these means is
## not zero. In fact, we know the real difference
true_diff
#
# That is a difference but not a very large one.
# In fact, that difference is so small that
# it will be really hard to find two samples that
# would allow us to reject the null hypothesis that
# the proportions are the same. We will try to take
# two samples 1000 times and see how often we reject
# the null hypothesis. We will make that a bit easier
# to do by changing the alternative to a less than;
# after all, we know that the proportion in the
# first population is indeed less than the proportion
# in the second population.
#
L3 <- 1:1000
for (i in 1:1000) {
samp_one <- sample( pop_one, 94)
freqs_one <- table(samp_one)
samp_two <- sample( pop_two, 87)
freqs_two <- table( samp_two)
this_test <- hypoth_2test_prop(freqs_one[2],n1,
freqs_two[2],n2, -1, 0.025)
L3[i] <- "do not reject"
if( this_test[15] == "Reject" ) {
L3[i] <- "reject"
}
}
# see how we did
table( L3 )
#
# We will change the second population so that
# it has a higher proportion of 2's
gnrnd5( 77843499907, 547955)
pop_two <- L1
# find the number of 2's in the second population
pop_freqs_2 <- table( pop_two )
pop_freqs_2
proport_two <- pop_freqs_2[2] / 5000
proport_two
# so the true difference is
true_diff <- proport_one - proport_two
true_diff
L3 <- 1:1000
for (i in 1:1000) {
samp_one <- sample( pop_one, 94)
freqs_one <- table(samp_one)
samp_two <- sample( pop_two, 87)
freqs_two <- table( samp_two)
this_test <- hypoth_2test_prop(freqs_one[2],n1,
freqs_two[2],n2,
-1, 0.025)
L3[i] <- "do not reject"
if( this_test[15] == "Reject" ) {
L3[i] <- "reject"
}
}
# see how we did
table( L3 )