# Topic: Goodness of Fit
#
# Overview: build a large population of the values 1-6 in known
# proportions, repeatedly draw samples of size 144, compute for each
# sample the statistic sum((observed - expected)^2 / expected), and
# then look at how those statistics are distributed next to the
# chi-squared density with 5 degrees of freedom.

# We will start by creating a large population composed of
# the values 1-6 with the distribution of those values having
# the ratios 5:9:3:8:7:4, i.e.,
# 5/36, 9/36, 3/36, 8/36, 7/36, 4/36
x <- c(rep(1, 5), rep(2, 9), rep(3, 3), rep(4, 8), rep(5, 7), rep(6, 4))
big_pop <- rep(x, 1000)

# Let us look at this large population
barplot(table(big_pop), ylim = c(0, 10000), yaxp = c(0, 10000, 10), las = 1)
abline(h = 0)
abline(h = seq(1000, 10000, 1000), lty = "dotted", col = "darkgray")

# we can do better than that with a frequency table
# (make_freq_table is defined in the sourced file, not in this script)
source("../make_freq_table.R")
make_freq_table(big_pop)

# Now, we want to take repeated samples of size 144
# and in each case we want to determine the sample frequency
# and from that compute the sum of the quotients of the
# squared differences between the observed and expected values
# divided by the expected values.
#
# first, let us get our expected values
H0 <- c(5, 9, 3, 8, 7, 4) / 36
H0
expected <- 144 * H0
expected

# Preallocate the results as doubles; the statistics are not integers.
computed_sums <- numeric(10000)
for (i in seq_along(computed_sums)) {
  our_samp <- sample(big_pop, 144)
  # Tabulate against all six possible values. A plain table(our_samp)
  # drops any value that happens not to occur in the sample, which
  # would leave fewer than six counts and break the subtraction below.
  samp_freq <- table(factor(our_samp, levels = 1:6))
  diffs <- samp_freq - expected
  diffs_squared <- diffs^2
  quotient <- diffs_squared / expected
  this_sum <- sum(quotient)
  computed_sums[i] <- this_sum
}

# Now, look at the distribution of those values
# hist() stops with an error if any value lies outside the breaks, and
# a statistic above 30 does occasionally occur. Extend the breaks far
# enough to cover the largest value; the displayed x-range stays 0-30.
hist_upper <- max(30, ceiling(max(computed_sums)))
hist(computed_sums,
  xlim = c(0, 30), xaxp = c(0, 30, 30),
  breaks = seq(0, hist_upper, 0.5),
  ylim = c(0, 800), yaxp = c(0, 800, 8), las = 2
)
abline(h = 0, v = 0)
abline(h = seq(100, 800, 100), lty = "dotted", col = "darkgray")

#
# Now compare that to the chi-square distribution with
# 5 degrees of freedom
x <- seq(0, 30, length.out = 400)
hx <- dchisq(x, 5)

# new = FALSE means the next plot starts a fresh figure instead of
# being drawn on top of the histogram.
par(new = FALSE)
# yaxp gives ticks every 0.02 over the plotted range so that they line
# up with the dotted horizontal grid lines drawn below.
plot(x, hx,
  type = "l", lty = 1,
  xlim = c(0, 30), xaxp = c(0, 30, 30),
  ylim = c(0, 0.16), yaxp = c(0, 0.16, 8),
  las = 2, cex.axis = 0.75, lwd = 2,
  xlab = "x", col = "darkred", ylab = "Density",
  main = "The chi-squared Distribution for 5 degrees of freedom"
)
abline(v = 0, h = 0, lty = 1, lwd = 2)
abline(h = seq(0.02, 0.2, 0.02), lty = 3, col = "darkgray")
abline(v = seq(1, 30, 1), lty = 3, col = "darkgray")