# Introduction to linear regression and correlation.
# We generate paired (L1, L2) values, eyeball two candidate lines,
# compare their sums of squared residuals to the least-squares line,
# and finally look at the correlation coefficient.

# gnrnd4() is a course helper (sourced from a sibling directory) that
# creates the global vectors L1 and L2 from the seed values below.
source("../gnrnd4.R")
gnrnd4(538190706, 7330120302, 1400003)

# Display the generated values.
L1
L2

# Quick first look at the data.
plot(L1, L2)

# Now make the plot look nicer.
plot(L1, L2,
     main = "Intro to Linear Regression/Correlation",
     xlab = "L1 -- the x values",
     ylab = "L2 -- the y values",
     xlim = c(0, 20), ylim = c(0, 30),
     xaxp = c(0, 20, 10), yaxp = c(0, 30, 6),
     las = 1, pch = 16, col = "darkgreen")
abline(h = 0, v = 0)
abline(h = seq(5, 30, 5), v = seq(2, 20, 2),
       lty = "dotted", col = "gray")

# Just from looking at the points one might guess that the line of
# best fit could be  y = (25/14)x + 5.  Graph that guess.
abline(a = 5, b = 25 / 14, col = "orange", lwd = 2)

# Find the y values that this line associates with our x values,
# and add those expected points to the plot.
new_y <- (25 / 14) * L1 + 5
new_y
points(L1, new_y, col = "orange", pch = 2)

# Observed - expected values for the orange line...
diff_ome <- L2 - new_y
diff_ome
# ...their squares...
diff_ome_2 <- diff_ome^2
diff_ome_2
# ...and the sum of the squares (the quantity least squares minimizes).
sum(diff_ome_2)

# Try a second guessed line,  y = (25/18)x + 10,  and repeat.
abline(a = 10, b = 25 / 18, col = "blue", lwd = 2)

# Expected y values for the blue line, added to the plot.
next_y <- (25 / 18) * L1 + 10
next_y
points(L1, next_y, col = "blue", pch = 5)

# Observed - expected, squared, and summed for the blue line.
diff_next <- L2 - next_y
diff_next
diff_next_2 <- diff_next^2
diff_next_2
sum(diff_next_2)

# Now find the actual least-squares line of best fit.
best_line <- lm(L2 ~ L1)
best_line
# The printed coefficients round to  y = 1.538x + 7.214.  Plot it.
abline(best_line, col = "red", lwd = 2)

# Expected y values using those ROUNDED coefficients (we revisit this
# with full-precision coefficients below), added to the plot.
best_y <- 1.538 * L1 + 7.214
best_y
points(L1, best_y, col = "red", pch = 15)

# Observed - expected values for the (rounded) best-fit line.
diff_best <- L2 - best_y
diff_best
diff_best_2 <- diff_best^2
diff_best_2
sum(diff_best_2)

# While we are at it, get the correlation coefficient...
r_val <- cor(L1, L2)
r_val
# ...and its square (the coefficient of determination).
r_val^2

# The slope/intercept we typed in were the values displayed by lm(),
# rounded to 3 places.  Pull out the stored coefficients instead.
best_coeffs <- coefficients(best_line)
best_coeffs[1]
best_coeffs[2]

# Even those displays are rounded.  Temporarily widen the display to
# 16 digits to see more of the stored precision; options() returns the
# previous settings, so we restore exactly what was in effect before
# rather than assuming the default of 7 digits.
old_opts <- options(digits = 16)
best_coeffs[1]
best_coeffs[2]
options(old_opts)

# Redo the computation, this time with the full-precision coefficients.
abline(a = best_coeffs[1], b = best_coeffs[2], col = "black", lwd = 1)
best_y <- best_coeffs[2] * L1 + best_coeffs[1]
best_y
points(L1, best_y, col = "black", pch = 0)

# Observed - expected, squared, and summed once more.
diff_best <- L2 - best_y
diff_best
diff_best_2 <- diff_best^2
diff_best_2
sum(diff_best_2)
# The extra digits do not change the result enough to show up in the
# default 7-digit display.
# Redraw the plot showing only the original points, the best-fit line,
# and the expected values.
plot(L1, L2,
     main = "Intro to Linear Regression/Correlation",
     xlab = "L1 -- the x values",
     ylab = "L2 -- the y values",
     xlim = c(0, 20), ylim = c(0, 30),
     xaxp = c(0, 20, 10), yaxp = c(0, 30, 6),
     las = 1, pch = 16, col = "darkgreen")
abline(h = 0, v = 0)
abline(h = seq(5, 30, 5), v = seq(2, 20, 2),
       lty = "dotted", col = "gray")
abline(a = best_coeffs[1], b = best_coeffs[2], col = "black", lwd = 1)
points(L1, best_y, col = "black", pch = 0)

# Use the coefficients to generate predicted values for some chosen
# x values, and mark those predictions on the plot.
given_x <- c(1, 5, 13, 23)
pred_y <- best_coeffs[1] + best_coeffs[2] * given_x
pred_y
points(given_x, pred_y, col = "red", pch = 8)
# Note: the point (23, 42.59) lies beyond the plot limits, so it is
# not shown.
#
# The values 1 and 23 fall outside the range of the x values in L1,
# so their predictions are extrapolations; 5 and 13 fall inside that
# range, so their predictions are interpolations.

# Residuals (observed - expected) computed directly from the observed
# values L2 and the expected values best_y.
comp_resid <- L2 - best_y
L2
best_y
comp_resid

# The same residuals are available straight from the fitted model.
residuals(best_line)

# Plot the L1 values against the residuals.
plot(L1, comp_resid)

#################################
# A second, larger problem.

# Generate the data (gnrnd4 refills the global vectors L1 and L2).
gnrnd4(6620562806, 3260530401, 23100023)

# Inspect the new values.
L1
L2

# Rough plot first.
plot(L1, L2)

# Then a nicer one.
plot(L1, L2,
     main = "Scatter plot",
     xlab = "L1 -- the x values",
     ylab = "L2 -- the y values",
     xlim = c(-2, 20), ylim = c(-20, 70),
     xaxp = c(-2, 20, 11), yaxp = c(-20, 70, 9),
     las = 1, pch = 16, col = "darkgreen")
abline(h = seq(-20, 70, 10), v = seq(-2, 20, 2),
       lty = "dotted", col = "gray")
abline(h = 0, v = 0, col = "black")

# Correlation coefficient and its square.
r_val <- cor(L1, L2)
r_val
r_val^2

# Fit the linear model, display it, and draw the regression line.
lm_hold <- lm(L2 ~ L1)
lm_hold
abline(lm_hold, col = "blue")

# Extract the intercept and slope.
c_hold <- coefficients(lm_hold)
c_hold[1]
c_hold[2]

# Predicted values for a few chosen x values, plotted for fun.
given_x <- c(0, 2, 11, 19)
pred_y <- c_hold[1] + c_hold[2] * given_x
pred_y
points(given_x, pred_y, pch = 2, col = "red")

# Retrieve the residuals, display them, and plot them against L1.
res_hold <- residuals(lm_hold)
res_hold
plot(L1, res_hold)