# This is a script of the commands that we will use to
# introduce the ideas of linear regression.
#
# We will need to start with measures of two variables. In
# this example these will be the measures of two different
# amino acids in eight tissue samples.
#
amino_A <- c( 2, 7, 12, 9, 20, 6, 15, 10 )
amino_B <- c( 7, 7, 20, 16, 29, 11, 15, 10 )
#
# We could look at the two samples individually
mean( amino_A )
mean( amino_B )
summary( amino_A )
summary( amino_B )
sd( amino_A )
sd( amino_B )
#
# This was interesting, but not a real help in
# seeing how the levels of these two amino acids
# might relate.
#
# Just look at a scatter plot of the two variables.
# First, just a rough plot
plot( amino_A, amino_B )
# There is a "feeling" of an upward drift
# in the values that we plotted. That is,
# it seems that higher amino_A values are
# associated with higher amino_B values.
#
# Then we will do it again, but this time make
# it a bit better. (Note that students are not
# expected to create such fancy graphs.)
plot( amino_A, amino_B,
      main="Compare Levels of\nAmino Acids A and B",
      xlim=c(-2,22), ylim=c(-2,30),
      xaxp=c(-2,22,12), yaxp=c(-2,30,16),
      las=1, cex.axis=0.65, cex.lab=0.85,
      col="maroon", pch=19 )
abline( h=seq(-2,30,2), v=seq(-2,22,2),
        col="darkgray", lty="dotted" )
abline( h=0, v=0, col="black" )
#
# The "drift" is still there, as we would expect,
# but we have a better feel for it.
#
# In trying to describe the relation between
# amino_A and amino_B values, we want to capture
# that "drift". We would like a line that gives
# us a feeling for that drift. As a guess, we will
# look at the line connecting points (0,3) and (20,23).
abline( 3, 1, col="violet" )
#
# That line has the form y = 3 + 1*x, where 3 is the
# intercept on the y-axis, and 1 is the slope. And that
# line does look like it characterizes the "drift",
# the relationship.
#
# But what about a different line? How about the line
# y = 0.5 + 1.25*x
abline( 0.5, 1.25, col="green" )
#
# That also looks like it characterizes the drift.
# Which is the better characterization?
#
# We have a method for comparing these two. We will
# use the amino_A values to find specific points on the
# two proposed lines. These will be the values that
# we would "EXPECT" to find if the lines were strictly
# true. Thus, for y = 3 + 1*x, our points would be
expect_first <- 3 + 1*amino_A
expect_first
#
# For y = 0.5 + 1.25*x
expect_two <- 0.5 + 1.25*amino_A
expect_two
#
# The values that we actually observed are in amino_B.
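#
# A side note: since we will compute "expected" values for
# several candidate lines, we could bundle the computation
# into a tiny helper function. This is just a sketch; the
# name line_expect is our own invention, not a standard
# R function.
line_expect <- function( intercept, slope, x ) {
  # points on the line y = intercept + slope*x
  intercept + slope*x
}
# For example, line_expect( 3, 1, amino_A ) reproduces the
# values we stored in expect_first.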
#
# Just so we can see these expected values, we can add them
# to the plot.
# We will add the expect_first values to the plot as
# violet triangles.
points( amino_A, expect_first, pch=17, col="violet" )
#
# We will add the expect_two values to the plot as
# green squares.
points( amino_A, expect_two, pch=15, col="green" )
#
# We then compute the OBSERVED - EXPECTED
obs_minus_first <- amino_B - expect_first
obs_minus_first
# Then compute the squares of those values
obs_minus_first_squared <- obs_minus_first^2
obs_minus_first_squared
# And, finally, we find the sum of those squared differences
sum_sqr_first <- sum( obs_minus_first_squared )
sum_sqr_first
# the first model, y = 3 + 1*x, yields 112
#
# Then we do the same thing for the second equation,
# y = 0.5 + 1.25*x
# We then compute the OBSERVED - EXPECTED
obs_minus_two <- amino_B - expect_two
obs_minus_two
# Then compute the squares of those values
obs_minus_two_squared <- obs_minus_two^2
obs_minus_two_squared
# And, finally, we find the sum of those squared differences
sum_sqr_two <- sum( obs_minus_two_squared )
sum_sqr_two
# the second model, y = 0.5 + 1.25*x, yields 107.6875
# this is lower than the value from the first model,
# so the second model is the better model.
#
# But we want the best model. We want the best
# linear model for our data.
# The strange lm() function produces this
#
lm_hold <- lm( amino_B ~ amino_A )
# here is a quick look at it
lm_hold
# So the equation of the line for this model is
# y = 2.642 + 1.159*x or, in our case,
# amino_B = 2.642 + 1.159*amino_A
# here is a bigger view of the model
summary( lm_hold )
# it gives us more info, but it does include the
# two values for our coefficients.
# We can extract the coefficients, the intercept and
# the slope, from the model. Remember that our general
# form of the line is given as y = a + b*x, where a is the
# intercept and b is the slope.
coeffs <- coefficients( lm_hold )
coeffs
# and we can add this linear model to the graph
abline( lm_hold )
# While we are at it, we can find the expected values,
# based on the new linear model, for all of the
# values stored in amino_A
expect_lm <- coeffs[1] + coeffs[2]*amino_A
#
# And then we can plot those values
#
points( amino_A, expect_lm, pch=15 )
#
# We will follow the same process to evaluate this
# new model.
# We then compute the OBSERVED - EXPECTED
obs_minus_lm <- amino_B - expect_lm
obs_minus_lm
# Then compute the squares of those values
obs_minus_lm_squared <- obs_minus_lm^2
obs_minus_lm_squared
# And, finally, we find the sum of those squared differences
sum_sqr_lm <- sum( obs_minus_lm_squared )
sum_sqr_lm
#
# This result, 93.98287, is even lower than our earlier
# models. In fact,
# you will not find a line that has a lower sum of the
# squared differences between the observed and the
# expected values. This is the "line of best fit."
# With this regression equation we can predict new values
# based on suggested values for amino_A.
# What is the expected value, using the regression equation,
# for an amino_A value of 13?
coeffs[1] + coeffs[2]*13
# What is the expected value for the amino_A value 17.23?
coeffs[1] + coeffs[2]*17.23
# What is the expected value for every amino_A value
# from 3 to 21 in steps of 3?
coeffs[1] + coeffs[2]*seq(3,21,3)
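#
# A side note: R's built-in predict() function can do this
# same computation straight from the saved model. A minimal
# sketch of its use:
predict( lm_hold, newdata=data.frame( amino_A=c(13, 17.23) ) )
# This should match the values we just computed by hand
# from the coefficients.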
#
# All of the values for amino_A used in the last few lines
# were within the range of the amino_A values in our data,
# that is, they were between 2 and 20. Finding the expected
# amino_B values based upon amino_A values that are within
# the range of our data is called INTERPOLATION.
#
# Mathematically, there is nothing to stop us from using
# our regression equation to predict an amino_B value based
# on an amino_A value that is outside the range of our data.
# Doing so is called EXTRAPOLATION.
#
# For example, what is the predicted amino_B level, based on
# the regression equation, for the amino_A level 33?
coeffs[1] + coeffs[2]*33
# We get the answer 40.88178, but extrapolation is dangerous!
# Our data gives us confidence for amino_A values between
# 2 and 20. We have no reason to expect the linear trend to
# just keep going!
#
# Another aspect of this process is to look at the "residuals".
# The "residuals" are the OBSERVED - EXPECTED values for
# all of our observed values. We know that the observed
# values are stored in amino_B. The expected values,
# from our regression equation, were stored in expect_lm.
#
# Therefore, we could look at
amino_A
amino_B
expect_lm
amino_B - expect_lm
# Thus the residual value associated with 15
# is -5.023986.
#
# There is a faster way to get the residuals.
# Recall that we saved our linear model in lm_hold
residuals( lm_hold )
#
# This example used a small data set, just 8 pairs of values.
# Still, we want to be somewhat confident that the residuals
# do not form a pattern. So we look at a new plot of
# the amino_A values against the residual values
plot( amino_A, residuals( lm_hold ) )
#
# We will see this again in a more complex example later.
#
# Finally, we know that our regression equation is the best fit
# of any possible line, but how good is that fit?
# We answer this by computing the Correlation Coefficient.
cor( amino_A, amino_B )
# Correlation coefficients are always between -1 and 1, inclusive.
# Values close to either -1 or to 1 indicate a really good fit.
# Values closer to 0 indicate a poor fit. A value of 0.8704585
# indicates a good, though not a great, fit. The square of the
# correlation coefficient tells us the percent of the variation in
# the original data that is explained by the linear model.
cor( amino_A, amino_B )^2
# Thus our linear regression equation explains just over 75% of
# the variation seen in the actual data.
#
# As you might guess, a correlation coefficient of 1 or of -1
# would indicate a perfect fit.
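#
# A quick check on that last value: the summary( lm_hold )
# output we looked at earlier reported an R-squared value,
# and we can pull it out directly from the summary object:
summary( lm_hold )$r.squared
# This should agree with cor( amino_A, amino_B )^2.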
#
##################################################
##########  S E C O N D   E X A M P L E  #########
##################################################
#
# Here is a new problem with a larger data set.
#
# load the gnrnd4.R script
source("../gnrnd4.R")
# generate the data
gnrnd4( 1527362406, 3271480704, 19500013 )
L1
L2
# let us do a quick and dirty plot
plot( L1, L2 )
#
# Then, as an extra step, a slightly nicer plot
plot( L1, L2, main="Second Example",
      xlim=c(-2,22), ylim=c(-4,40),
      xaxp=c(-2,22,12), yaxp=c(-4,40,11),
      las=1 )
abline( v=seq(-2,22,2), h=seq(-4,40,4),
        lty="dotted", col="darkgray" )
abline( v=0, h=0 )
#
# find the linear model
lm_two <- lm( L2 ~ L1 )
# we could fix the graph to show the regression line
abline( lm_two, col="darkred" )
# get the coefficients and residuals
lm2_coeff <- coefficients( lm_two )
lm2_resid <- residuals( lm_two )
lm2_coeff
# so our equation is y = 37.59901 + (-1.792756)*x
# where x is the L1 variable and y is the L2 variable
# to find the predicted value when x=4.5
lm2_coeff[1] + lm2_coeff[2]*4.5
# to find the residual value for the case where
# x=11.7, we could recompute it
# the observed value associated with 11.7 is the second pair,
# (11.7, 15.2), so the residual value is
# 15.2 - expected value for 11.7
# or
15.2 - ( lm2_coeff[1] + lm2_coeff[2]*11.7 )
# or, again, recognizing that this is the second pair of
# values, we could just pull out the second residual
lm2_resid[ 2 ]
#
# while we are looking at residuals we should do a plot of
# the L1 values against their associated residual values
plot( L1, lm2_resid, xlim=c(-2,22) )
# From the plot we are happy to see that the residuals
# are all over the place.
#
# And we should find the correlation coefficient and its
# square
lm2_cc <- cor( L1, L2 )
lm2_cc
lm2_cc^2
# the correlation coefficient is -0.9769232,
# which means we have a really good fit, one that
# explains over 95% of the variation in the data.
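#
# To close, the pattern we have now followed twice could be
# bundled into one small function. This is just a sketch; the
# name quick_regression and the list it returns are our own
# invention, not part of base R.
quick_regression <- function( x, y ) {
  # fit the linear model and gather the pieces we keep using
  model <- lm( y ~ x )
  list( coefficients = coefficients( model ),
        residuals    = residuals( model ),
        r            = cor( x, y ),
        r_squared    = cor( x, y )^2 )
}
# For example, quick_regression( L1, L2 ) collects the values
# we found for this second example in one place.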