Graphs and more

Graphs and More

On your USB drive, create a new directory, copy model.R to that directory, rename the file in the new directory, and double-click the file to open RStudio. Then copy all of the text below the line and paste it into your RStudio editor pane.
# This is a series of commands that we will use in class.
# It seems that this could be more efficient than having everyone
#   type the commands themselves.
# I will try to document the process as we go along

################################### Bar Plot ########
# Bar plots come first.  When the height of every bar is
# already known, we can store the heights in a variable
# and hand that variable to barplot().

how_high <- c(4.3, 5.2, 7.6, 2.4, 8.9, 3.1)
barplot(how_high)

# There are a few things we could do to improve that
# plot.  One is to give each value a name ...
names(how_high) <- c(
  "Ford", "Chevy", "Dodge",
  "BMW", "Honda", "Volvo"
)
# ... and then plot the variable again.
barplot(how_high)

# We could also change the command to include some
# titles, perhaps change the vertical scale, and add
# some color.
barplot(
  how_high,
  main = "Our First Bar Plot",
  xlab = "My various automobiles",
  ylab = "miles driven (x10000)",
  ylim = c(0, 10),
  col = rainbow(6)
)

# And then we could add a baseline and some horizontal
# grid lines.
abline(h = 0)
abline(
  h = seq(from = 2, to = 10, by = 2),
  col = "darkgreen", lty = "dotted"
)

# A bar plot can also show the frequency of each of a
# relatively small set of values.  For example, we will
# load the gnrnd4 program into our environment and then
# generate and look at some values.

source("../gnrnd4.R")
gnrnd4(700275302, 800031)
L1

# Just looking at the values we can see that there are
# only a few different values.  The table() function
# gives us the frequency of each.

table(L1)

# That is nice, but a graph would be better.
barplot(table(L1))

# And, again, we could make that fancier by adding more
# arguments to the command.
barplot(
  table(L1),
  main = "Frequency distribution of grades",
  xlab = "Points out of 40 on test 1",
  ylab = "Frequency",
  ylim = c(0, 14),
  col = rainbow(9)
)
abline(h = 0)
abline(
  h = seq(from = 2, to = 14, by = 2),
  col = "darkgrey", lty = "dotted"
)

############################### Histogram ##############

# We would want to make a histogram of data that covers a
# wider range of values.  That is because we are less
# interested in seeing the frequency of each value than we
# are in seeing the frequency of values within "bins".

# First we will generate some new values.

gnrnd4(1457267703, 32200456)
L1

# It is hard to even begin to see how those values are
# distributed, but with a simple hist() command we can
# get a good picture of this.

hist(L1)

# Notice that R has made many arbitrary choices for us in
# order to divide up the span of values.  One choice is
# the number of "bins"; another is the width of each "bin".
#
# Having seen the values like this we might want to change
# some of the choices.  For example, by using the summary()
# command we can find the minimum and maximum values.

summary(L1)

# Now that we know we have values from 48.10 to 77.6 we
# could decide to make our "bins" cover the values from
# 48 to 78 and to do that in 6 steps, each 5 long.  The
# new histogram command would be

hist(
  L1,
  breaks = seq(48, 78, 5),
  xlim = c(48, 78), xaxp = c(48, 78, 6),
  ylim = c(0, 25), yaxp = c(0, 25, 5),
  xlab = "My bins",
  main = "My Version of the Histogram"
)

# And we can make it easier to read with some grid lines
# drawn at the vertical axis ticks (5, 10, ..., 25).
abline(h = seq(5, 25, 5), col = "darkgray", lty = "dotted")
# You might note that R does not care if you use
# gray or grey.

# One of the important things to recognize in the
# histogram is whether the "bins" are "closed" on the left
# or on the right.  In our example, by default, the bins
# are closed on the right.  With our data and breaks we
# will see a small change if we close the bins on the left.

hist(
  L1,
  breaks = seq(48, 78, 5), right = FALSE,
  xlim = c(48, 78), xaxp = c(48, 78, 6),
  ylim = c(0, 25), yaxp = c(0, 25, 5),
  xlab = "Bins now closed on left",
  main = "My Version of the Histogram"
)
# The grid lines follow the same 0-to-25 vertical scale
# (ticks every 5) used in the hist() call just above.
abline(h = seq(5, 25, 5), col = "darkgrey", lty = "dotted")

#
###################### box and Whisker Plot ########
#
# Box and whisker plots show us the relative position of
# the quartiles.  Again, we need some data to use here.

gnrnd4(376299509, 153035421374)
L1
# Leading into the box and whisker plot, let us first
# find the quartiles ...
summary(L1)
# ... and now look at the plot.
boxplot(L1)
#
# The default box plot is vertical.  We can make it
# horizontal by a simple change.
boxplot(L1, horizontal = TRUE)

# In either case, the rectangle identifies the relative
# position of the first and third quartile points, with
# the heavy line in the middle of the rectangle showing
# the position of the median, i.e., the second quartile
# point.  In both of these plots the whiskers extend to
# the min and max values, Q0 and Q4.

#
# One special feature of box and whisker plots is that
# the whiskers will not extend beyond 1.5*IQR below Q1 or
# above Q3.  We can see that with different data, a new
# summary, and a new plot.

gnrnd4(217849309, 340805354374)
L1
summary(L1)

boxplot(L1, horizontal = TRUE)

# From the summary we see that the IQR is 540.0-494.5 or
# 45.5.  That means that 1.5*IQR is 1.5*45.5 = 68.25.
# Q1-68.25 is 426.25 while Q3+68.25 = 608.25.  However,
# if we look at the sorted data values
head(sort(L1))
tail(sort(L1))
# we see that we have one value, namely 392, that is lower
# than our lower limit of 426.25 and one value that is
# higher than our upper limit of 608.25, namely 610.
# These two extreme values are "outliers".  The boxplot
# extends whiskers as far as the lowest value not lower
# than Q1-(1.5*IQR) and to the highest value not higher
# than Q3+(1.5*IQR).

# We could augment our boxplot to show those limits.
abline(v = c(426.25, 608.25), col = "red")
abline(
  v = seq(400, 600, 50),
  col = "darkblue", lty = "dotted"
)
#
###################### pie chart ########
#
# Understanding that you really do not want to use pie
# charts, it is still possible to make them in R.  We
# just need to have the relative values for each of the
# pieces.  For slices to represent the values 23, 19, 37,
# 15, and 32, we could do the following.
chunk <- c(23, 19, 37, 15, 32)
pie(chunk)
# Or, slightly more colorful:
pie(chunk, col = rainbow(5))
# We will not look any further into pie charts.

#
###################### dot plot ########
#
# The dot plot was a great tool for us when we did not
# have computers.  It was a simple method that allowed a
# person to really build up a bar chart by just reading
# and plotting successive data points.

# As noted in the web pages, there is no built-in command
# to do a dot plot in R.  However, we do have a function
# that would do it for us.

# We will start with some new data.
gnrnd4(257686204, 400035)
L1
# Note that these are values that show a lot of
# repetition.
#  We will load our function
#  (be sure to note the underscore).

source("../dot_plot.R")
dot_plot(L1)
#
# Really, this is no better than using our barplot.
barplot(table(L1))

#
#
###################### stem and leaf ########
#
# This was another old technique that was useful before
# we had computers.  It allowed us to build a histogram,
# and sort the data values, in a two-step process.
# However, in most cases, the "bins" of the histogram
# would have to correspond to groups of ten values.
# Thus, for the data
gnrnd4(780124504, 1100065)
L1
# we need to load our function
source("../stem_leaf.R")
stem_leaf(L1)
# getting a result that translates into having the
# data be 40, 41, 43, 46, 46, 46, 47,
#         50, 51, 53, 54, 55, 57, 58, 58, 58, 59,
# and so on.
#
#####################  scatter plot ########
#
# Here we have pairs of data values that we want to graph
# on an xy-plot.  For example, we could generate the data
gnrnd4(1569811106, 3120070405, 32000150)
# This produces values in both L1 and L2.
L1
L2
# We really have pairs of data values, so we want to be
# looking at the points
# (44.6, 35.7), (32.4, 25.8), (42.0, 36.7),
# (44.4, 36.9), and so on.
# We plot these points just as we did back in algebra
# class.  The command to do this in R is the plot()
# function.
plot(L1, L2)
#
# As usual we can override the defaults to create a
# possibly more appropriate graph.  For example, just the
# few changes shown here may help.
plot(
  L1, L2,
  xlim = c(-5, 50), xaxp = c(-5, 50, 11),
  ylim = c(-5, 50), yaxp = c(-5, 50, 11),
  xlab = "x values", ylab = "y values",
  main = "My scatter plot"
)

abline(
  h = seq(-5, 50, 5), v = seq(-5, 50, 5),
  col = "darkgreen", lty = "dotted"
)
abline(h = 0, v = 0)

#
##################### make a frequency table #########
#
# Let us create some data where we have a few discrete
# values that are repeated some different numbers of
# times.
gnrnd4(600878701, 900023)
L1
# We know how to find the frequency of each value by
# using the table command.
table(L1)
# However, we want to have a complete frequency table
# that extends that result by including the relative
# frequency, the cumulative frequency, the relative
# cumulative frequency, and even the number of degrees
# for each slice of a pie chart.  The web page on this
# walks us through a tedious process to reach that goal.
# Then it gives us a function to do all of the work at
# once.  We will load and use that function.

source("../make_freq_table.R")
make_freq_table(L1)
# The output from that function, when used that way, is
# shown in the console pane.  An alternative is to do
# something like
mft <- make_freq_table(L1)
# which displays nothing of the results.  However, we can
# then use the command View(), with a capital V, to see
# the results in a special form of the editor pane.
View(mft)
#
# That special display has certain advantages over the
# console display.  We can demonstrate some of these in
# class.

#
####################  Grouping Data #########
#
# There are times when the data, especially continuous
# data, makes more sense if we can arrange it into groups
# ("bins", "buckets", "intervals", etc.) as is the case
# for the following data.

gnrnd4(1573289104, 23101650)
L1
# Looking at the summary we can find the range.
summary(L1)
#
# We have already seen R group the data when we looked at
# a histogram of the data.
hist(L1)
# In my run of that, R created intervals 10 wide running
# from 110 to 220.
#
# Can we do the same thing and then, for those intervals,
# create a frequency table similar to the one we saw
# above for discrete data?  The web page walks us through
# a long process to do this, but it ends with a function,
# collate3(), that will do this for us.
#
# Load collate3
source("../collate3.R")
#
# We will take a two-phase process to make our "bins" and
# produce our frequency table.
#
# First, a simple run of collate3()
collate3(L1)
#
# The console output from that gives us the min and max
# values, and it suggests a width.  In this case, in
# order to mimic the histogram, we will set the starting
# value for the first "bin" to be 110 and the width of
# the "bin" to be 10.
#
collate3(L1, use_low = 110, use_width = 10)
# This produces the frequency table in the console pane.
# We can get a better looking table in the editor by
# assigning the result of collate3() to a variable and
# then using View() on that variable.

grp <- collate3(
  L1,
  use_low = 110, use_width = 10
)
View(grp)

# Just as we saw in the histogram discussion, we do need
# to be aware of which side of the intervals is the
# "closed" side.  In the default case, above, that was
# the right end.  A simple change in the command will
# change this to be closed on the left end.  This may
# change the resulting table.
grp_left <- collate3(
  L1,
  use_low = 110, use_width = 10,
  right = FALSE
)
View(grp_left)

#
############### Getting approximate measures
############### from grouped data
#
# It is unfortunate, but sometimes when we get data it is
# already grouped in a sort of frequency table.  For
# example, look at the following table
#
#      Interval   Frequency
#      (55, 63]       7
#      (63, 71]       9
#      (71, 79]       3
#      (79, 87]      15
#      (87, 95]      18
#      (95, 103]      6
#
# Our challenge is to find the approximate mean, standard
# deviation, and variance for this data.  We can do this
# by finding the midpoint of each interval.  In this case
# the first midpoint is (55+63)/2 = 59.  Then the
# midpoints just increase by 8 for each subsequent
# interval.  We can get such a list of values via seq();
# the sequence stops at 99, the last midpoint, because
# the next step would pass the upper limit of 103.

mpnts <- seq(from = 59, to = 103, by = 8)
mpnts

# Then we put the frequencies in a variable.

freqs <- c(7, 9, 3, 15, 18, 6)
freqs

# And now we can create a new variable that holds each
# midpoint the same number of times as the frequency for
# its interval.

x <- rep(mpnts, freqs)
x

# Our approximations are then done on that variable.

mean(x)
sd(x)    # assuming we are looking at a sample
sd(x)^2  # variance of a sample
# Or, if we had a population, we load and use pop_sd().
source("../pop_sd.R")
pop_sd(x)    # standard deviation of a population
pop_sd(x)^2  # variance of a population

#
############### Percentiles ####
#
# There are two kinds of questions to answer.
# 1. For a given set of values,
#          what is the 93rd percentile?
# 2. For a given set of values,
#          what is the percentile of a specific
#          value?
#
# Here is a set of values.
gnrnd4(389568304, 5100532)
L1
# To look at percentiles we need to sort the values.
L1_sorted <- sort(L1)
L1_sorted
# We can find the length of the list.
length(L1_sorted)
# Then the 93rd percentile will be the value that has 93%
# of the values less than it in the sorted list.
0.93 * length(L1_sorted)
# That answer is 78.12, so the 79th value has 93% of the
# values less than it.
L1_sorted[79]

# The value 564 is in the list; what is its percentile?
# We can find the position of 564 in the sorted list.

which(L1_sorted == 564)
#
# It is position 64.
# Therefore its percentile is 64/length(L1_sorted)
64 / length(L1_sorted) * 100
# and then round down to get 76%