Yet another demo of Linear Regression


On your USB drive, create a new directory, copy model.R to that directory, rename the file in the new directory, double click on the file to open Rstudio. Then copy all of the text below the line and paste it into your Rstudio editor pane.
# We can try to use this information to introduce
#   linear regression and correlation
# First we want to generaate some values
source("../gnrnd4.R")
gnrnd4( 538190706, 7330120302, 1400003)
L1
L2
plot( L1, L2 )
# now make it look nicer
plot( L1, L2, 
      main="Intro to Linear Regression/Correlation",
      xlab="L1 -- the x values",
      ylab="L2 -- the y values",
      xlim=c(0,20), ylim=c(0,30),
      xaxp=c(0,20, 10), yaxp=c(0,30,6),
      las=1, pch=16, col="darkgreen")
abline(h=0, v=0)
abline( h=seq(5,30,5), v=seq(2,20,2),
        lty="dotted", col="gray")
#
# then, just from looking at the points one might
# guess that the line of best fit might be
#  y = (25/14)x + 5
# let us graph that line
abline( a=5, b=25/14, col="orange", lwd=2)
#
#  then let us find the y values associated, by that line,
#  with the x values that we have
new_y <- (25/14)*L1 + 5
new_y
# and then add those to the plot

points( L1, new_y, col="orange", pch=2)

# find the observed - expected values
diff_ome <- L2 - new_y
diff_ome
# then get the square of those values
diff_ome_2 <- diff_ome^2
diff_ome_2
# now, get the sum of those
sum( diff_ome_2)

#  then, we could try a second line, say 
#  y = (25/18)x+10
# let us graph that line
abline( a=10, b=25/18, col="blue", lwd=2)
#
#  then let us find the y values associated, by that line,
#  with the x values that we have
next_y <- (25/18)*L1 + 10
next_y
# and then add those to the plot

points( L1, next_y, col="blue", pch=5)

# find the observed - expected values
diff_next <- L2 - next_y
diff_next
# then get the square of those values
diff_next_2 <- diff_next^2
diff_next_2
# now, get the sum of those
sum( diff_next_2)

# now, let us find the best line of fit
best_line <- lm( L2~L1 )
best_line
# this turns out to be y = 1.538x + 7.214
# we can plot that
abline( best_line, col="red", lwd=2)
#  then let us find the y values associated, by that line,
#  with the x values that we have
best_y <- 1.538*L1 + 7.214
best_y
# and then add those to the plot

points( L1, best_y, col="red", pch=15)

# find the observed - expected values
diff_best <- L2 - best_y
diff_best
# then get the square of those values
diff_best_2 <- diff_best^2
diff_best_2
# now, get the sum of those
sum( diff_best_2)

# and while we are at it, let us get the correlation
#     coefficient
r_val <- cor( L1, L2 )
r_val
# and then the square of that value
r_val^2

#  Note that the values that we used for the supposed best
#  line were those displayed by the lm() function.
#  however, those were rounded to 3 places.  Let us pull out 
# just those values.
best_coeffs <- coefficients( best_line)
best_coeffs[1]
best_coeffs[2]
# those are better values, but even here they are
# rounded.  Note the following
options( digits=16 )
best_coeffs[1]
best_coeffs[2]
# then return the display to only 7 digits
options( digits=7)

#  let us do our computation again, this time with 
#  the full value of the coefficients
abline( a=best_coeffs[1], 
        b=best_coeffs[2], col="black", lwd=1)
best_y <- best_coeffs[2]*L1 + best_coeffs[1]
best_y
# and then add those to the plot

points( L1, best_y, col="black", pch=0)

# find the observed - expected values
diff_best <- L2 - best_y
diff_best
# then get the square of those values
diff_best_2 <- diff_best^2
diff_best_2
# now, get the sum of those
sum( diff_best_2)
# we see that having the extra digits really does not
# change our result by enough to see that difference in 
# default 7 digit display.

# at this point it makes sense to redraw our plot
# but with only the original points, the best fit 
# line and the expected values

plot( L1, L2, 
      main="Intro to Linear Regression/Correlation",
      xlab="L1 -- the x values",
      ylab="L2 -- the y values",
      xlim=c(0,20), ylim=c(0,30),
      xaxp=c(0,20, 10), yaxp=c(0,30,6),
      las=1, pch=16, col="darkgreen")
abline(h=0, v=0)
abline( h=seq(5,30,5), v=seq(2,20,2),
        lty="dotted", col="gray")
abline( a=best_coeffs[1], 
        b=best_coeffs[2], col="black", lwd=1)
points( L1, best_y, col="black", pch=0)

#   we could use our coefficients to generate 
#   new expected values for some given x-values
#
given_x <- c( 1, 5, 13, 23)
pred_y <- best_coeffs[1] + best_coeffs[2]*given_x
pred_y
points( given_x, pred_y, col="red", pch=8)

# notice that the point (23,42.59) is off the plot and
#  therefore not shown
#  
# the given_x values 1 and 23 are outside the
# range of the x-values in L1.  Therefore, the 
# predicted values for those two given_x values 
# are extrapolations.

# the given_x values 5 and 13 are inside the
# range of the x-values in L1.  Therefore, the 
# predicted values for those two given_x values 
# are intrapolations.

#  We could compute the residual values, 
#  the observed-expected values, directly from the 
#  observed values, L2, and the expected values, best_y
#
comp_resid <- L2 - best_y
L2
best_y
comp_resid
#
# but, we could have just pulled them out of
# our linear model, best_line
residuals( best_line )

#
# and, we can plot the L1 values vs. the residual values
plot( L1, comp_resid)

#################################
# do another, but bigger, problem
#
# generate the data
gnrnd4(6620562806, 3260530401, 23100023)
# look at it
L1
L2
# get a rough plot of it
plot(L1,L2)
# get a better plot of it
plot( L1, L2, 
      main="Scatter plot",
      xlab="L1 -- the x values",
      ylab="L2 -- the y values",
      xlim=c(-2,20), ylim=c(-20,70),
      xaxp=c(-2,20, 11), yaxp=c(-20,70,9),
      las=1, pch=16, col="darkgreen")

abline( h=seq(-20,70,10), v=seq(-2,20,2),
        lty="dotted", col="gray")
abline(h=0, v=0, col="black")
#
# get the correlation coefficient
r_val <- cor( L1, L2 )
r_val
# and the square of it
r_val^2
#
# then get our linear model
lm_hold <- lm( L2~L1 )
#display the coefficients
lm_hold
# plot the line
abline( lm_hold, col="blue")
# let us pull out the coefficients
c_hold <- coefficients( lm_hold )
c_hold[1]
c_hold[2]
# let us find the expected value for these values of x
given_x <- c( 0, 2, 11, 19)
pred_y <- c_hold[1] + c_hold[2]*given_x
pred_y
# just for fun, plot those points
points( given_x, pred_y, pch=2, col="red")
#
# we will retrieve the residual values, display
# them, and then plot them against the L1 values
res_hold <- residuals( lm_hold )
res_hold
plot( L1, res_hold )