# topic 11 f
# Load the functions we will need in this script
source( "../gnrnd5.R")
source( "../assess_normality.R")
source( "../pop_sd.R")
#
# Here we are looking to answer questions about
# whether or not a population is "normal", that is,
# does the population distribution look like the
# normal probability curve?
# The first thing to recognize is that no finite population
# will ever be exactly normal. The best we can do is to find
# populations that are approximately normal. For example,
# here is a population of 9500 values that is approximately
# normal with mean about 45 and standard deviation about 11.
gnrnd5(269834949904, 1100004500)
# look at the first 50 values
head( L1, 50)
# look at the last 50 values
tail( L1, 50 )
# find the mean and standard deviation
mean( L1 )
pop_sd( L1 )
# look at a historgram of the population
hist( L1 )
# look at a boxplot of the population
boxplot( L1, horizontal = TRUE)
# just to make our example a little easier, we can transform
# our population into a new one that has mean=45 and sd=11
L2 <- (L1 - mean(L1))/pop_sd(L1) # L2 will have mean=0
# sd=1
mean( L2 )
pop_sd( L2 )
L3 <- L2*11 + 45
# L3 should have mean=45 and sd=11
mean( L3 )
pop_sd( L3 )
# What are the things that we know about a true normal
# distribution?
# First, it should look like the normal distribution.
# Here is a plot of a N(45,11) distribution
######################################
## making this plot is beyond to ##
## requirements of this course. ##
######################################
x <- seq(1, 89, length=400)
hx <- dnorm(x, mean=45, sd=11)
plot(x, hx, type="l", lty=1,
xlim=c(0,90), xaxp=c(0,90,18),
ylim=c(0,0.05), yaxp=c(0,0.05,10),
las=2, cex.axis=0.75,
xlab="z value",
ylab="Density", main="N(45,11) Distribution")
abline( v=0, lty=1, lwd=2)
abline( h=0, lty=1, lwd=2)
abline( h=seq(0.005,0.05,0.005), lty=3, col="darkgray")
abline( v=seq(5,90,5), lty=3, col="darkgray")
#
# then add a red vertical line at the place where we
# have both the mean and the median
lines( c(45, 45),c(0, dnorm(45, mean=45, sd=11)), col="red", lw=2)
# going down and up one standard deviation from the mean
# would give us the x values of 45-11=34 and 45+11=56.
# Put green vertical liens there.
lines( c(34, 34),c(0, dnorm(34, mean=45, sd=11)),
col="green", lw=2)
lines( c(56, 56),c(0, dnorm(56, mean=45, sd=11)),
col="green", lw=2)
# going down and up two standard deviations from the mean
# would give us the x values of 45-2*11=23 and 45+2*11=67.
# Put blue vertical lines there.
lines( c(23, 23),c(0, dnorm(23, mean=45, sd=11)),
col="blue", lw=2)
lines( c(67, 67),c(0, dnorm(67, mean=45, sd=11)),
col="blue", lw=2)
# from what we know of the normal distribution the area under the
# curve to the left of the left blue line should be
pnorm( 23, mean=45, sd=11)
# this will also be the area to the right of the right blue line
pnorm( 67, mean=45, sd=11, lower.tail=FALSE)
# We want to compare our population, L3, to this kind of a
# distribution. To do this, we will first get a sorted
# listing of our population values
pop_sorted <- sort( L3 )
# then we should have about 2.27% of those values that are
# less than 23. There are 9500 values in our population
# so we can find 2.27% of that number
0.0277*9500
# and how many do we have that are less than or equal to 23?
less_than_23 <- pop_sorted[ pop_sorted<=23 ]
length( less_than_23 )
# we have 210 values but we expected 263. Not the same
# but not terribly off.
# let us see how many are greater than or equal to 67
greater_than_67 <- pop_sorted[ pop_sorted>=67 ]
length( greater_than_67 )
# we have 205 values but we expected 263. Not the same
# but not terribly off.
# We can do the same for the area to the left and right of
# the green lines
pnorm( 34, mean=45, sd=11)
#
pnorm( 56, mean=45, sd=11, lower.tail=FALSE)
#
# so we can compute how many we would expect out our 9500
0.1586553*9500
# and we can see how many of our values are in those ranges
less_than_34 <- pop_sorted[ pop_sorted<=34 ]
length( less_than_34 )
# that was ridiculously close to what we expect.
greater_than_56 <- pop_sorted[ pop_sorted>=56 ]
length( greater_than_56 )
# again, really close
# then, just for the sake of looking at things,
# what is the median of our population? We expect it
# to be the same as the mean.
median( L3 )
# that is really good.
# to get a little bit more detail, let us look at
# quantiles both in the ideal Normal case and in
# our population
our_points <- seq( 0.1, 0.9, 0.1)
our_points
# these two commands should give about the same results
qnorm( our_points, mean=45, sd=11)
quantile( pop_sorted, our_points )
# This looks really good.
# do this again, but at more percentage points
our_points <- seq( 0.05, 0.95, 0.05)
our_points
# these two commands should give about the same results
qnorm( our_points, mean=45, sd=11)
quantile( pop_sorted, our_points )
# This looks really good.
# we could do the same thing again, this time at percentage
# points 1%, 2%, 3%, ..., 99% but that is just a lot of points,
# many more than we want to compare. But we could do that
# as a graph.
our_points <- seq( 0.01, 0.99, 0.01)
our_points
# these two commands should give about the same results
x <- qnorm( our_points, mean=45, sd=11)
y <- quantile( pop_sorted, our_points )
plot( x,y )
# If our population is a Normal distribution then the
# plot should look like a straight line of points.
# This is, essentially, what our assess_normality
# function does.
assess_normality( L3 )
# The difference is that our previous plot had 99 points whereas
# the assess_normality plot has one point for each value in our
# population, in this case 9500 points.
# Of course, we could have looked at a box and whisker
# plot to see if the data looked normal
boxplot( L3, horizontal=TRUE)
# For a Normal distribution the two boxes should be the
# same size, and the length of whiskers should be about
# 3 times the width of each of the two boxes, and, if there
# are many items in the population we do expect to see
# some outliers.
# Alternatively, we could look at a histogram of the
# population.
hist( L3, main="L3 with the default breakpoint" )
# But remember that histograms can change their appearance
# if we change the break points.
hist( L3, breaks=seq(0,90,4.5),
main="L3 with interval width=4.5")
hist( L3, breaks=seq(0,90,3),
main="L3 with interval width=3")
hist( L3, breaks=seq(0,90,1.5),
main="L3 with interval width=1.5")
hist( L3, breaks=seq(0,90,1),
main="L3 with interval width=1")
#######################################
# Now look at a different population, this one is not
# normal.
gnrnd5( 294706944905, 800002500, 2300009000)
hist( L1 )
boxplot( L1, horizontal=TRUE)
assess_normality( L1 )