# This is a script of the commands that we will use to
# look at more examples of linear regressions.
#
# This first problem relates measures of height and shoulder
# width.  As it turns out, we can generate a data set for 32
# subjects where L1 holds the person's height in inches and
# L2 holds their shoulder width in inches.  Normally we would
# have found our 32 subjects and taken the measures, and then
# we would have had to get them into R somehow.  Here we can
# just use a special gnrnd4 command.
#
# Get gnrnd4 into the environment
source( "../gnrnd4.R")
# then run it -- this creates the global vectors L1 and L2
gnrnd4( 1473823106, 3240080104, 18300592 )
#
# and let us look at the two sets of measures
L1
L2
#
# then a quick and dirty plot
plot( L1, L2 )
#
# perhaps a slightly improved plot: titled, with fixed axis
# limits, labeled axes, and horizontal axis labels (las=1)
plot( L1, L2, main="Height vs. Shoulder Width",
      xlim=c(55,80), ylim=c(16,22),
      xlab="Height in inches",
      ylab="Shoulder width in inches", las=1)
# add a dotted grey background grid to the current plot
abline( h=seq(16,22,1),v=seq(55,80,5),
        col="darkgrey", lty="dotted")
#
# Then we can get our linear model: shoulder width (L2)
# as a function of height (L1)
#
tall_wide <- lm( L2 ~ L1 )
#
# first look at it -- printing shows the coefficients
tall_wide
#
# so our equation is
#      y = 1.626 + 0.255x
#
# or, using the real names
#
#      shoulder width = 1.626 + 0.255*(height)
#
# we can easily graph this -- abline() accepts an lm object
abline( tall_wide )
#
# we can find out how good our fit is to the data
# by finding the correlation coefficient
how_good <- cor( L1, L2)
how_good
how_good^2
# the correlation coefficient of 0.9839971 is really
# high, really close to 1, so this is a great fit.
# The square of the correlation coefficient, 0.9682503,
# indicates that our model explains over 96% of the
# variation in the data.
#
# We should look at the residuals.  We can pull them
# directly from our model
res_hold <- residuals( tall_wide )
plot( L1, res_hold )
# The values are scattered with no pattern, so that is good.
#
# If we want to find the residual value for the
# original data point (73.1, 20.7) it is important to
# note that this is item
# 28 in the list, not item
# 12 which also has 73.1 as its first coordinate.  The
# residual value of item #28 is
res_hold[28]
# we could have computed this as observed minus predicted:
#
20.7 - ( 1.626 + 0.255*73.1 )
#
# the difference in those two answers comes from the
# fact that 1.626 and 0.255 are rounded values.
# We can pull the more accurate values from the model
# and save them for later use as
tw_coeffs <- coefficients( tall_wide )
tw_coeffs
# Even those printed results are rounded, but they give
# more digits than we got before.  We can use the
# much more accurate internal values via the command
#
20.7 - ( tw_coeffs[1] + tw_coeffs[2]*73.1)
#
# Which produces the value that we saw from the
# residuals, res_hold[28].
#
# Finally, we can use our regression equation to
# predict shoulder width for a given height.
# For example, for a person who is 72 inches tall
# our model
tw_coeffs[1] + tw_coeffs[2]*72
#
# predicts their shoulder width to be 19.98526 inches.
#
# For a person who is 63.5 inches tall we predict
tw_coeffs[1] + tw_coeffs[2]*63.5
# their shoulder width to be 17.81782 inches.
#
# The last two predictions were cases of
# INTERPOLATION because the values for the
# independent variable, height, were within
# the range of values in our data.
summary( L1 )
#
# We could do the same computation, but for a value
# that is outside the range of L1, say for 36.2
# inches tall, and that would be an example
# of EXTRAPOLATION, a dangerous endeavor.
tw_coeffs[1] + tw_coeffs[2]*36.2
#
#######################################
##########    example two  ############
#######################################
#
# This example comes from WCC English courses
# for the Winter term of 2018.
# The thought was that the higher the course number,
# the lower the percent of the available seats that
# get filled.
#
# Here is the list of course numbers
courses <- c(  10,  23,  24,  25,  50,  51,  90,  91, 100, 107,
              111, 128, 132, 134, 135, 138, 140, 160,
              161, 165, 168, 170, 181, 200, 210, 214,
              218, 220, 222, 226, 240, 242, 245, 260,
              261, 270, 271 )
# and here is the percent of the seats filled for
# each of those courses
fill_rate <- c( 65, 54.5, 86.4, 50, 43.9, 43.9, 78.3, 78.3, 93.8, 66.7,
                91.3, 77.3, 62.1, 90.9, 72.7, 93.2, 90, 63.3,
                84.1, 45.5, 100, 80, 79.2, 51.7, 81.8, 43.3,
                33.3, 45.5, 33.3, 92.9, 80, 60, 26.7, 45,
                45, 89.2, 89.2 )
#
# start with a quick plot of the data
plot( courses, fill_rate )
#
# The cloud of points does not suggest a strong linear
# relation, but let us find the line of best fit anyway.
#
fill_fit <- lm( fill_rate ~ courses )
fill_fit
# and then we can draw that line on the plot
abline( fill_fit )
#
# but we should look at the correlation coefficient
#
fill_cor <- cor( courses, fill_rate )
fill_cor
#
# Indeed, we get a negative correlation, which
# corresponds to our negative slope.  This would
# suggest that higher courses are less filled, but
# the level of the correlation, -0.1074911, is so far
# from -1 that we would say that these variables, course
# number and percent fill, are just not related.  The
# square of the correlation coefficient
fill_cor^2
# at 0.01155434, tells us that our linear model
# explains just over 1% of the variation in the data.
# Not much at all.
#
#######################################
##########  example three    ##########
#######################################
#
# Just a quick look at some other data
gnrnd4( 2373293606, 2477210110, 261000100 )
L1
L2
plot( L1, L2 )
lm_hold <- lm( L2 ~ L1 )
lm_hold
cor( L1, L2 )
# A quick comparison to example two above: the slope
# here and the slope there are about the same.
# However, for this data the correlation coefficient
# is really close to -1.  It is negative, just as the
# slope is negative.
#
# add the example-three regression line, then look at
# its residuals plotted against L1
abline( lm_hold )
plot( L1, residuals(lm_hold))
#
#######################################
##########  example four     ##########
#######################################
#
# Here is some data that is not quite as close
# to linear in its relation.
gnrnd4( 6760873406, 4172401104, 52100234 )
L1
L2
summary( L1 )
summary( L2 )
#
# a framed-up plot: title, fixed limits, horizontal labels
plot( L1, L2, main="Problem Four",
      xlim=c(-20,30), ylim=c(-25,150), las=1 )
# dotted blue grid plus the two axes through the origin
abline( h=seq(50,150,50), v=seq(-20,30,10),
        col="blue", lty="dotted" )
abline( v=0, h=0 )
#
# the correlation coefficient and its square
cor( L1, L2 )
cor( L1, L2 )^2
#
# fit the linear model and draw it on the plot
lm_hold <- lm( L2 ~ L1 )
abline( lm_hold, col="darkred" )
#
# look at the residuals by index
plot( residuals( lm_hold ) )
#
# pull the full-precision coefficients from the model
p4_coeffs <- coefficients( lm_hold )
p4_coeffs
#
# use them to predict L2 at a few nearby L1 values
p4_coeffs[1] + p4_coeffs[2]*(-4)
p4_coeffs[1] + p4_coeffs[2]*(-3)
p4_coeffs[1] + p4_coeffs[2]*(-2)
#
# redraw the framed plot, the grid, the axes, and the
# fitted line, then add a dashed horizontal line at the
# intercept value
plot( L1, L2, main="Problem Four",
      xlim=c(-20,30), ylim=c(-25,150), las=1 )
abline( h=seq(50,150,50), v=seq(-20,30,10),
        col="blue", lty="dotted" )
abline( v=0, h=0 )
abline( lm_hold, col="darkred" )
abline( h=p4_coeffs[1], col="darkgreen", lty="dashed" )