# Topic: Goodness of Fit
# We will start by creating a large population composed of
# the values 1-6, with the distribution of those values having
# the ratios 5:9:3:8:7:4, i.e.,
# 5/36, 9/36, 3/36, 8/36, 7/36, 4/36
# One "unit" of the population: each value 1..6 repeated according to its
# weight (5, 9, 3, 8, 7, 4), giving 36 entries in ascending order.
# The values are written as doubles so the result has the same type as
# building it from rep(1,5), rep(2,9), ...
x <- rep(c(1, 2, 3, 4, 5, 6), times = c(5, 9, 3, 8, 7, 4))
# Stack 1000 copies of that unit to get a population of 36000 values
# with exactly the same proportions.
big_pop <- rep(x, times = 1000)
# Picture the population: one bar per distinct value, with the bar height
# equal to the number of times that value occurs in big_pop.
# The y axis runs from 0 to 10000 with a tick every 1000 and
# horizontally printed labels (las = 1).
barplot(
  height = table(big_pop),
  ylim = c(0, 10000),
  yaxp = c(0, 10000, 10),
  las = 1
)
# Solid baseline at zero.
abline(h = 0)
# Dotted gray guide lines at every tick above the baseline.
abline(
  h = seq(from = 1000, to = 10000, by = 1000),
  lty = "dotted",
  col = "darkgray"
)
# we can do better than that with a frequency table
# Run the helper script found one directory above the current working
# directory; source() resolves this relative path against getwd(), so the
# script must be run from a directory whose parent holds make_freq_table.R.
# NOTE(review): make_freq_table() is presumably defined by that script -- confirm.
source("../make_freq_table.R")
# Pass the whole population to the helper. What it prints or returns is
# determined by the sourced script, which is not shown here.
make_freq_table( big_pop )
# Now, we want to take repeated samples of size 144.
# For each sample we tabulate the observed frequency of every value
# and then compute the sum, over all of the values, of
#   (observed - expected)^2 / expected
# where "expected" is the frequency we expect for that value.
#
# Expected values come first.
# H0 holds the hypothesized proportion of each of the six values,
# in the order 1, 2, ..., 6; the six proportions add up to 1.
H0 <- c(5 / 36, 9 / 36, 3 / 36, 8 / 36, 7 / 36, 4 / 36)
H0
# Scale the proportions by the sample size (144) to get the expected
# count of each value in one sample.
expected <- H0 * 144
expected
# Simulate the sampling distribution of the goodness-of-fit statistic.
# For each of 10000 repetitions: draw a sample of size 144 from big_pop,
# count how often each value occurs, and store
#   sum( (observed - expected)^2 / expected )
# in the matching slot of computed_sums.
#
# Preallocate one numeric slot per repetition.
computed_sums <- numeric(10000)
for( i in seq_along(computed_sums) ) {
  our_samp <- sample( big_pop, 144 )
  # Count against the full list of categories (1, 2, ..., one per entry of
  # expected). A plain table(our_samp) drops any value that happens not to
  # occur in the sample; the table would then be shorter than expected and
  # the subtraction below would fail. With fixed levels a missing value is
  # reported as a count of 0 instead.
  samp_freq <- table( our_samp = factor( our_samp,
                                         levels = seq_along(expected) ) )
  diffs <- samp_freq - expected          # observed - expected
  diffs_squared <- diffs^2
  quotient <- diffs_squared / expected   # each category's contribution
  this_sum <- sum( quotient )
  computed_sums[i] <- this_sum
}
# Now, look at the distribution of those values.
# Histogram of the simulated statistics with bins 0.5 wide. The visible
# window is fixed at 0..30 on the x axis (a tick at every integer) and
# 0..800 on the y axis (a tick every 100); las=2 prints the tick labels
# perpendicular to the axes.
#
# hist() stops with an error if any value lies outside the breaks, and a
# simulated statistic can occasionally exceed 30. The top break is therefore
# pushed up to the next multiple of 0.5 at or above the largest value (never
# below 30). Bars beyond 30 simply fall outside the xlim window; when every
# value is <= 30 the breaks are exactly seq(0,30,0.5).
hist( computed_sums,
      xlim=c(0,30), xaxp=c(0,30,30),
      breaks=seq(0, max(30, ceiling(2*max(computed_sums))/2), 0.5),
      ylim=c(0,800), yaxp=c(0,800,8),
      las=2)
# Solid axes through the origin.
abline(h=0, v=0 )
# Dotted gray guide lines at every y tick above zero.
abline( h=seq(100,800,100), lty="dotted",
        col="darkgray")
#
# Now compare that to the chi-square distribution with
# 5 degrees of freedom.
# Evaluate the density on 400 evenly spaced points covering 0..30.
# (length.out is spelled out rather than relying on partial matching
# of "length".)
x <- seq(0, 30, length.out=400)
hx <- dchisq(x,5)
# new=FALSE is the default setting: the next high-level plot starts a fresh
# figure, so the density curve is drawn on its own rather than on top of
# the histogram.
par( new=FALSE)
# The y axis covers 0..0.16 with 8 intervals, i.e. a tick every 0.02, so the
# ticks line up with the horizontal guide lines drawn below.
plot(x, hx, type="l", lty=1,
     xlim=c(0,30), xaxp=c(0,30,30),
     ylim=c(0,0.16), yaxp=c(0,0.16,8),
     las=2, cex.axis=0.75, lwd=2,
     xlab="x",
     col="darkred",
     ylab="Density",
     main="The chi-squared Distribution for 5 degrees of freedom")
# Heavy solid axes through the origin.
abline( v=0, h=0, lty=1, lwd=2)
# Dotted gray grid: horizontal lines every 0.02 (those above the plotted
# range are clipped) and vertical lines at every integer from 1 to 30.
abline( h=seq(0.02,0.2,0.02),
        lty=3, col="darkgray")
abline( v=seq(1,30,1), lty=3, col="darkgray")