# topic 11 g
# Load the functions we will need in this script
source( "../assess_normality.R")
source( "../pop_sd.R")
#
# Here we are looking to answer questions about
# probabilities for proportions. So a typical
# problem might be:
#
# We know the true proportion of some characteristic
# in a population is 65%. That is, for the whole
# population, 65% of all things in the population
# have this characteristic. Then, if we take a
# sample of size 88 from this population, what is
# the probability that the proportion of the
# sample that has that characteristic is less
# than 55%?
# As long as we have the time, let us create a
# large population that has 65% of the population
# having the characteristic and 35% not having
# it.
# 6500 items coded 1 (have the characteristic) followed
# by 3500 items coded 2 (do not have it)
big_pop <- rep(c(1, 2), times = c(6500, 3500))
#
# Some students do not like having the values in
# big_pop sit in this tidy order, i.e., 6500 ones
# and then 3500 twos. For those students we can
# shuffle the items in big_pop.
source("../shuffle.R")
big_pop <- shuffle(big_pop)
# look at the first 30 items
head(big_pop, n = 30)
#
# Now, take 10,000 samples, each of size 88,
# from that population and record the
# proportion of the sample that has the
# characteristic in L1
# L1 holds one sample proportion per simulated sample.
# Preallocate it as a numeric vector (rather than the
# integers 1:10000) so it never contains placeholder
# values that could be mistaken for results.
L1 <- numeric(10000)
for (i in seq_along(L1)) {
  # get a sample of size 88 (sample() draws without
  # replacement by default)
  L2 <- sample(big_pop, 88)
  # count the items in the sample that are a 1
  num_times <- sum(L2 == 1)
  # turn that count into the sample proportion
  this_proportion <- num_times / 88
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.65
# the standard deviation should be
# sqrt( p*(1-p)/n)
sqrt(0.65 * 0.35 / 88)
#
# So what we see is that we can use the normal
# distribution with mean=p and
# sd = sqrt(p*(1-p)/n) to answer questions
# about the probability associated with
# a known proportion.
#######################
## small diversion...why the drop in the
## histogram for the interval 0.66 to 0.68?
## let us look at the possible outcomes
# A sample of 88 items can contain 0, 1, ..., 88 items
# with the characteristic, so the possible sample
# proportions are k/88 for k = 0 through 88
# (89 values, including 0).
outcomes <- (0:88) / 88
outcomes
# then look at the number of possible
# outcomes in the 0.64 to 0.66, in 0.66 to 0.68,
# and in 0.68 to 0.70
outcomes[outcomes > 0.64 & outcomes <= 0.66]
outcomes[outcomes > 0.66 & outcomes <= 0.68]
outcomes[outcomes > 0.68 & outcomes <= 0.70]
# this explains the strange low value here
# and at a few other places.
# we would not see this if we took samples
# of size 100
#
# let us do that....
#########################
# Start from a fresh results vector so this experiment
# does not depend on whatever an earlier run left in L1.
L1 <- numeric(10000)
for (i in seq_along(L1)) {
  # get a sample of size 100
  L2 <- sample(big_pop, 100)
  # count the items in the sample that are a 1
  num_times <- sum(L2 == 1)
  # turn that count into the sample proportion
  this_proportion <- num_times / 100
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.65
# the standard deviation should be
# sqrt( p*(1-p)/n)
sqrt(0.65 * 0.35 / 100)
# Then go back to the original question.
#
# We know the true proportion of some characteristic
# in a population is 65%. That is, for the whole
# population, 65% of all things in the population
# have this characteristic. Then, if we take a
# sample of size 88 from this population, what is
# the probability that the proportion of the
# sample that has that characteristic is less
# than 55%?
# This is just the same as asking "For a
# normal distribution,
# N( 0.65, sqrt(0.65*(1-0.65)/88)), what is
# P(X < 0.55)?"
# But we know how to do that:
pnorm(q = 0.55, mean = 0.65, sd = sqrt(0.65 * 0.35 / 88))
# For a population with a characteristic that
# is known to be in 58% of the population, if
# we take a sample of size 37, what is the
# probability that the sample will show a
# proportion greater than 63%?
# (lower.tail = FALSE asks for the upper tail, P(X > q))
pnorm(
  q = 0.63,
  mean = 0.58,
  sd = sqrt(0.58 * (1 - 0.58) / 37),
  lower.tail = FALSE
)
# If we know that the proportion of people who
# will vote for candidate A in the next election
# is 53%, then in a sample of size 734 what is
# the probability that the proportion of voters
# for candidate A in the sample will be less than
# 49% or greater than 57%?
# (add the lower tail below 0.49 to the upper tail above 0.57)
pnorm(q = 0.49, mean = 0.53, sd = sqrt(0.53 * 0.47 / 734)) +
  pnorm(
    q = 0.57, mean = 0.53, sd = sqrt(0.53 * 0.47 / 734),
    lower.tail = FALSE
  )
############################################
############################################
# Is this normal approximation always good?
############################################
##
## Look at a new case, one where n*p<10
##
## consider the case where
## p=0.15, what if our sample size was 12?
## what would our experiment of 10,000
## samples look like?
# New population: 1500 items coded 1 (15%) and
# 8500 items coded 2 (85%)
big_pop <- c(rep(1, 1500), rep(2, 8500))
# Start from a fresh results vector so this experiment
# does not depend on whatever an earlier run left in L1.
L1 <- numeric(10000)
for (i in seq_along(L1)) {
  # get a sample of size 12
  L2 <- sample(big_pop, 12)
  # count the items in the sample that are a 1
  num_times <- sum(L2 == 1)
  # turn that count into the sample proportion
  this_proportion <- num_times / 12
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.15
# the standard deviation should be
# sqrt( p*(1-p)/n)
sqrt(0.15 * 0.85 / 12)
#
# Compare the approximation to the model
# for getting P(X<0.05)
# normal-model value for P(X < 0.05); keep it in a
# variable so the next step uses the computed value
# instead of a hand-copied constant
approx_p <- pnorm(0.05, mean = 0.15, sd = sqrt(0.15 * 0.85 / 12))
approx_p
# the simulated proportion that sits at that same
# cumulative probability; if the normal model fit
# well this would be close to 0.05
quantile(L1, approx_p)
# and, directly from the simulation, the fraction of
# the simulated sample proportions that are below 0.05
mean(L1 < 0.05)
# So we have a rule: if n*p>=10 and
# if n*(1-p)>=10 then we can use the
# normal approximation for the probabilities.
######################################
## We seem to do the same thing each time we
## run into this problem, that is, we use the
## population proportion as the mean and the
## expression sqrt( p*(1-p)/n) as the standard
## deviation. Could we put this into a
## function? Yes, look at pprop().
source("../pprop.R")
#
# Earlier we computed this probability with pnorm():
pnorm(q = 0.55, mean = 0.65, sd = sqrt(0.65 * 0.35 / 88))
## now try the same question with pprop(); the
## arguments here are the cutoff, then p, then n
## (presumably it wraps the pnorm() call above, so
## the two answers should agree -- confirm in pprop.R)
pprop(0.55, 0.65, 88)
# or, for the upper-tail question, we did
pnorm(
  q = 0.63,
  mean = 0.58,
  sd = sqrt(0.58 * (1 - 0.58) / 37),
  lower.tail = FALSE
)
# now try
pprop(0.63, 0.58, 37, lower.tail = FALSE)