# line 1 # Look at finding a confidence interval for # the difference of two means when the population # standard deviations are unknown. # # For this example get a 96% confidence interval for # the difference mu_1 - mu_2. # # We will start by generating two populations source("../gnrnd5.R") gnrnd5( 275552499901, 10025010176) L2 <- L1 head(L2,10) tail(L2,10) length(L2) # gnrnd5(277661599901, 10413010732) head(L1,14) tail(L1,14) length(L1) # # Now that we have our two populations # we need to take random samples of the # two populations # # ############################################ # ## Each time we do the following steps ## # ## we will get different samples and as ## # ## such we will get different confidence ## # ## intervals. ## # ############################################ n_1 <- 43 # get a sample of size 43 from L1 index_1 <- as.integer( runif( n_1, 1, 6001)) index_1 samp_1 <- L1[ index_1 ] samp_1 # n_2 <- 56 # get a sample of size 56 from L2 index_2 <- as.integer( runif( n_2, 1, 5001)) index_2 samp_2 <- L2[ index_2 ] samp_2 # # Then we need to find the mean of each sample xbar_1 = mean( samp_1 ) xbar_1 xbar_2 = mean( samp_2 ) xbar_2 # and we need to get the standard deviation of each sample sx1 <- sd( samp_1 ) sx1 sx2 <- sd( samp_2 ) sx2 # # So our best point estimate is pnt_est <- xbar_1 - xbar_2 pnt_est # # The distribution of the difference of the means will # be Student's-t with standard deviation equal to # sqrt( sx1^2/n_1 + sx2^2/n_2 ) sd_difference <- sqrt( sx1^2/n_1 + sx2^2/n_2) sd_difference # # The, for a 96% confidence interval we need to # have the 4% we are missing split evenly on both # tails. But, since this is a Student's-t distribution # the two t-values will just be opposites. We will # just find the upper value. # But for the Student's-t we also need the degrees of # freedom. # This simple way is to use one less than the smaller # sample size. df <- n_1 - 1 if( n_2 < n_1 ) { df <- n_2- 1 } df # t <- qt( 0.04/2, df, lower.tail=FALSE ) t # then our simple confidence interval has a low end of pnt_est - t*sd_difference # and a high end of pnt_est + t*sd_difference # # Alternatively, we have a much more complex way to # compute the degrees of freed and this more complex # approach gives us a higher degree of freedom and # therefore a lower t score so we get a smaller # confidence interval. # # compute the complex degrees of freedom d1 <- sx1^2 / n_1 d2 <- sx2^2 / n_2 d_f_complex <- ( d1 + d2)^2 / (d1^2/(n_1 - 1) + d2^2/(n_2 - 1)) d_f_complex # # t <- qt( 0.04/2, d_f_complex, lower.tail=FALSE ) t # then our complex confidence interval has a low end of pnt_est - t*sd_difference # and a high end of pnt_est + t*sd_difference # # # We could have taken the shorcut and used the # function that is provided. # source("../ci_2unknown.R") ci_2unknown( sx1, n_1, xbar_1, sx2, n_2, xbar_2, 0.96 ) # # ################################################ # ## Now, highlight and rerun, over and over, ## # ## lines 32-114, to get repeated samples and## # ## thus, repeated 93% confidence intervals. ## # ## While you do this be aware that the true ## # ## difference between the population means ## # ## is 8.09059. How often do your ## # ## intervals contain the true mean? ## # ################################################