# This is a series of commands that we will use in class.
# Running them from this file should be more efficient than having
# everyone type the commands themselves.
# I will try to document the process as we go along.

################################### Bar Plot ########
# First we will look at bar plots.
# If we know the height that we want for each bar then we can just
# create a variable to hold the heights and then generate the plot.
how_high <- c(4.3, 5.2, 7.6, 2.4, 8.9, 3.1)
barplot(how_high)

# There are a few things we could do to improve that plot.
# We could, for example, give each value a name ...
names(how_high) <- c("Ford", "Chevy", "Dodge", "BMW", "Honda", "Volvo")
# ... and then plot the variable again; the names now label the bars.
barplot(how_high)

# And we could change the command to include some titles, perhaps
# change the vertical scale, and add some color.
barplot(
  how_high,
  main = "Our First Bar Plot",
  xlab = "My various automobiles",
  ylab = "miles driven (x10000)",
  ylim = c(0, 10),
  col = rainbow(6)
)

# And then we could add some horizontal grid lines: a solid baseline
# at 0 and dotted lines every 2 units up to the top of the scale.
abline(h = 0)
abline(h = seq(2, 10, by = 2), col = "darkgreen", lty = "dotted")

# A different use of the bar plot is to show the frequency of a
# relatively small set of values.  For example, we will load the
# gnrnd4 program into our environment and then generate and look at
# some values.
source("../gnrnd4.R")
gnrnd4(700275302, 800031)
L1
# Just looking at the values we can see that there are only a few
# different values.
# We can use the table() function to get the frequency of each value.
table(L1)
# That is nice, but a graph would be better.
barplot(table(L1))
# And, again, we could make that fancier by adding more arguments.
barplot(
  table(L1),
  main = "Frequency distribution of grades",
  xlab = "Points out of 40 on test 1",
  ylab = "Frequency",
  ylim = c(0, 14),
  col = rainbow(9)
)
abline(h = 0)
abline(h = seq(2, 14, by = 2), col = "darkgrey", lty = "dotted")

############################### Histogram ##############
# We would want to make a histogram of data that covers a wider range
# of values.  That is because we are less interested in seeing the
# frequency of each value than we are in seeing the frequency of
# values within "bins".
#
# First we will generate some new values.
gnrnd4(1457267703, 32200456)
L1
# It is hard to even begin to see how those values are distributed,
# but with a simple hist() command we can get a good picture of this.
hist(L1)
# Notice that R has made many arbitrary choices for us in order to
# divide up the span of values.  One choice is the number of "bins",
# another is the width of each "bin".
#
# Having seen the values like this we might want to change some of
# the choices.  For example, by using the summary() command we can
# find the minimum and maximum values.
summary(L1)
# Now that we know we have values from 48.10 to 77.6 we could decide
# to make our "bins" cover the values from 48 to 78 and to do that in
# 6 steps, each 5 long.  The new histogram command would be
hist(
  L1,
  breaks = seq(48, 78, by = 5),
  xlim = c(48, 78), xaxp = c(48, 78, 6),
  ylim = c(0, 25), yaxp = c(0, 25, 5),
  main = "My Version of the Histogram",
  xlab = "My bins"
)
#
# And we can make it easier to read with some grid lines.
abline(h = seq(5, 25, by = 5), col = "darkgray", lty = "dotted")
# You might note that R does not care if you use gray or grey.

# One of the important things to recognize in the histogram is
# whether the "bins" are "closed" on the left or on the right.  In
# our example, by default, the bins are closed on the right.
# With our data and breaks we will see a small change if we close the
# bins on the left instead (right=FALSE).
hist(
  L1,
  breaks = seq(48, 78, by = 5),
  right = FALSE,
  xlim = c(48, 78), xaxp = c(48, 78, 6),
  ylim = c(0, 25), yaxp = c(0, 25, 5),
  main = "My Version of the Histogram",
  xlab = "Bins now closed on left"
)
# Grid lines at the labelled ticks of this plot: yaxp=c(0,25,5) puts
# ticks every 5 units from 0 to 25, so the dotted lines go at
# 5, 10, ..., 25, just as for the previous histogram.
abline(h = seq(5, 25, by = 5), col = "darkgrey", lty = "dotted")

###################### Box and Whisker Plot ########
#
# Box and whisker plots show us the relative position of the
# quartiles.  Again, we need some data to use here.
gnrnd4(376299509, 153035421374)
L1
# And then, leading into looking at a box and whisker plot, let us
# find the quartiles.
summary(L1)
# And now look at the plot.
boxplot(L1)
#
# The default box plot is vertical.
# We can make it horizontal by a simple change.
boxplot(L1, horizontal = TRUE)
# In either case, the rectangle identifies the relative position of
# the first and third quartile points, with the heavy line in the
# middle of the rectangle showing the position of the median, i.e.,
# the second quartile point.
# In both of these plots the whiskers extend to the min and max
# values, Q0 and Q4.
#
# One special feature of the box and whisker plots is that the
# whiskers will not extend beyond 1.5*IQR below Q1 or above Q3.
# We can see that with different data, a new summary, and a new plot.
gnrnd4(217849309, 340805354374)
L1
summary(L1)
boxplot(L1, horizontal = TRUE)
# From the summary we see that the IQR is 540.0-494.5 or 45.5.
# That means that 1.5*IQR is 1.5*45.5 = 68.25.  Q1-68.25 is 426.25
# while Q3+68.25 = 608.25.  However, if we look at the sorted data
# values
head(sort(L1))
tail(sort(L1))
# we see that we have 1 value, namely 392, that is lower than our
# lower limit of 426.25 and one value that is higher than our upper
# limit of 608.25, namely, 610.  These two extreme values are
# "outliers".  The boxplot extends whiskers as far as the lowest
# value not lower than Q1-(1.5*IQR) and to the highest value not
# higher than Q3+(1.5*IQR).
# We could augment our boxplot to show those limits: solid red lines
# at the two limits, plus a dotted reference grid every 50 units.
abline(v = c(426.25, 608.25), col = "red")
abline(v = seq(400, 600, by = 50), col = "darkblue", lty = "dotted")

###################### pie chart ########
#
# Understanding that you really do not want to use pie charts, it is
# still possible to make them in R.  We just need to have the
# relative values in each of the pieces.  For slices to represent the
# values 23, 19, 37, 15, and 32, we could do the following.
chunk <- c(23, 19, 37, 15, 32)
pie(chunk)
# Or, a slightly more colorful version:
pie(chunk, col = rainbow(5))
# We will not look any further into pie charts.

###################### dot plot ########
#
# The dot plot was a great tool for us when we did not have
# computers.  It was a simple method that allowed a person to really
# build up a bar chart by just reading and plotting successive data
# points.
# As noted in the web pages, there is no built-in command to do a dot
# plot in R.  However, we do have a function that would do it for us.
# We will start with some new data.
gnrnd4(257686204, 400035)
L1
# Note that these are values that show a lot of repetition.
# We will load our function (be sure to note the underscore).
source("../dot_plot.R")
dot_plot(L1)
#
# Really, this is no better than using our barplot.
barplot(table(L1))

###################### stem and leaf ########
#
# This was another old technique that was useful before we had
# computers.  It allowed us to build a histogram, and sort the data
# values, in a two step process.  However, in most cases, the "bins"
# of the histogram would have to correspond to groups of ten values.
# Thus, for the data
gnrnd4(780124504, 1100065)
L1
# we need to load our function
source("../stem_leaf.R")
stem_leaf(L1)
# getting a result that translates into having the data be
#   40, 41, 43, 46, 46, 46, 47,
#   50, 51, 53, 54, 55, 57, 58, 58, 58, 59,
# and so on.
##################### scatter plot ########
#
# Here we have pairs of data values that we want to graph on an
# xy-plot.  For example, we could generate the data
gnrnd4(1569811106, 3120070405, 32000150)
# This produces values in both L1 and L2.
L1
L2
# We really have pairs of data values so that we want to be looking
# at the points
#   (44.6, 35.7), (32.4, 25.8), (42.0, 36.7), (44.4, 36.9), and so on.
# We plot these points just as we did back in algebra class.  The
# command to do this in R is the plot() function.
plot(L1, L2)
#
# As usual we can override the defaults to create a possibly more
# appropriate graph.  For example, just the few changes shown here
# may help.
plot(
  L1, L2,
  main = "My scatter plot",
  xlab = "x values", ylab = "y values",
  xlim = c(-5, 50), ylim = c(-5, 50),
  xaxp = c(-5, 50, 11), yaxp = c(-5, 50, 11)
)
# Dotted grid every 5 units in both directions, then solid axes
# through the origin.
abline(
  h = seq(-5, 50, by = 5),
  v = seq(-5, 50, by = 5),
  col = "darkgreen", lty = "dotted"
)
abline(h = 0, v = 0)

##################### make a frequency table #########
#
# Let us create some data where we have a few discrete values that
# are repeated some different numbers of times.
gnrnd4(600878701, 900023)
L1
# We know how to find the frequency of each value by using the
# table command.
table(L1)
# However, we want to have a complete frequency table that extends
# that result by including the relative frequency, the cumulative
# frequency, the relative cumulative frequency, and even the number
# of degrees for each slice of a pie chart.  The web page on this
# walks us through a tedious process to reach that goal.  Then it
# gives us a function to do all of the work at once.  We will load
# and use that function.
source("../make_freq_table.R")
make_freq_table(L1)
# The output from that function, when used that way, is shown in the
# console pane.  An alternative is to do something like
mft <- make_freq_table(L1)
# which displays nothing of the results.
# However, we can then use the command View(), with a capital V, to
# see the results in a special form of the editor pane.
View(mft)
#
# That special display has certain advantages over the console
# display.  We can demonstrate some of these in class.

#################### Grouping Data #########
#
# There are times when the data, especially continuous data, makes
# more sense if we can arrange it into groups ("bins", "buckets",
# "intervals", etc.) as is the case for the following data.
gnrnd4(1573289104, 23101650)
L1
# Looking at the summary we can find the range.
summary(L1)
#
# We have already seen R group the data when we looked at a
# histogram of the data.
hist(L1)
# In my run of that, R created intervals 10 wide running from 110
# to 220.
#
# Can we do the same thing and then, for those intervals, create a
# frequency table similar to the one we saw above for discrete data?
# The web page walks us through a long process to do this, but it
# ends with a function, collate3(), that will do this for us.
#
# Load collate3.
source("../collate3.R")
#
# We will take a two phase process to make our "bins" and produce
# our frequency table.
#
# First, a simple run of collate3().
collate3(L1)
#
# The console output from that gives us the min and max values, and
# it suggests a width.  In this case, in order to mimic the
# histogram, we will set the starting value for the first "bin" to
# be 110 and the width of the "bin" to be 10.
#
collate3(L1, use_low = 110, use_width = 10)
# This produces the frequency table in the console pane.  We can get
# a better looking table in the editor by assigning the result of
# collate3() to a variable and then View that variable.
grp <- collate3(L1, use_low = 110, use_width = 10)
View(grp)
# Just as we saw in the histogram discussion, we do need to be aware
# of which side of the intervals is the "closed" side.  In the
# default case, above, that was the right end.
# A simple change in the command will change this to be closed on
# the left end.  This may change the resulting table.
grp_left <- collate3(L1, use_low = 110, use_width = 10, right = FALSE)
View(grp_left)

############### Getting approximate measures
###############   from grouped data
#
# It is unfortunate, but sometimes when we get data it is already
# grouped in a sort of frequency table.  For example, look at the
# following table
#
#     Interval     Frequency
#     (55, 63]         7
#     (63, 71]         9
#     (71, 79]         3
#     (79, 87]        15
#     (87, 95]        18
#     (95, 103]        6
#
# Our challenge is to find the approximate mean, standard deviation,
# and variance for this data.  We can do this by finding the
# midpoints of each interval.  In this case the first midpoint is
# (55+63)/2 = 59.  Then the midpoints just increase by 8 for each
# subsequent interval.  We can get such a list of values via
# (the sequence stops at 99, the last midpoint not above 103)
mpnts <- seq(59, 103, by = 8)
mpnts
# Then we put the frequencies in a variable.
freqs <- c(7, 9, 3, 15, 18, 6)
freqs
# And now we can create a new variable that holds each midpoint the
# same number of times as the frequency for its interval.
x <- rep(mpnts, times = freqs)
x
# And then our approximations are done on that variable.
mean(x)
sd(x)     # assuming we are looking at a sample
sd(x)^2   # variance of a sample
# Or, if we had a population:
source("../pop_sd.R")
pop_sd(x)     # standard deviation of a population
pop_sd(x)^2   # variance of a population

############### Percentiles ####
#
# There are two kinds of questions to answer.
#   1. For a given set of values,
#      find the 93rd percentile?
#   2. For a given set of values,
#      what is the percentile of a specific value?
#
# Here is a set of values.
gnrnd4(389568304, 5100532)
L1
# To look at percentiles we need to sort the values.
L1_sorted <- sort(L1)
L1_sorted
# We can find the length of the list.
n_values <- length(L1_sorted)
n_values
# Then, the 93rd percentile will be the value that has 93% of the
# values less than it in the sorted list.
0.93 * n_values
# That answer is 78.12, so the 79th value has 93% of the values less
# than it.  Rounding up with ceiling() gives that position without
# our having to read it off the console and type it in.
pos_93 <- ceiling(0.93 * n_values)
pos_93
L1_sorted[pos_93]

# The value 564 is in the list, what is its percentile?  We can find
# the position of 564 in the sorted list.  (If 564 occurred more than
# once, which() would return every matching position.)
pos_564 <- which(L1_sorted == 564)
pos_564
#
# It is position 64.
# Therefore its percentile is 64/length(L1_sorted), expressed as a
# percent:
pos_564 / n_values * 100
# and then round down to get 76%.
floor(pos_564 / n_values * 100)