# Prediction.R
# Calculating the regression line in R is easy. Here we'll
# work through the example in the 'prediction' tutorial
# First we'll load in the survey data:
survey <-read.csv("http://www.courses.washington.edu/psy315/datasets/Psych315W21survey.csv")
# And find the students that chose 'Green' as their favorite color:
students.green <- survey$color=="Green"
# to simply things for later, let's define 'x' to be the mother's height
# and y be the father's height:
x <- survey$mheight[students.green]
y <- survey$pheight[students.green]
# There might be 'NA's in either x or y, so we'll find where they are
# and take them out:
goodvals <- !is.na(x) & !is.na(y)
x <- x[goodvals]
y <- y[goodvals]
# The scatterplot can now be done by plotting x vs y:
plot(x,y,
xlab = "Mother's height",
ylab = "Father's height",
pch = 19,
col = "blue",
cex = 2)
# Now we'll calculate the slope and intercept for the regression line:
# We'll need the correlation:
r <- cor(x,y,use = "complete.obs")
print(r)
n <- length(x)
# The function 'sd' in R uses the sample standard deviation formula
# which has the 'n-1' in the demominator. The way to turn this into'
# the sample standard deviation is to mulitply the population deviation
# by sqrt((n-1)/n). We'll do this for both x and y:
sy <- sd(y)*sqrt((n-1)/n)
sx <- sd(x)*sqrt((n-1)/n)
# The slope, m, of the regression line is:
m <- r*sy/sx
print(m)
# where the 'sd' function is standard deviation.
# The intercept, b, is:
b <- mean(y) - m*mean(x)
print(b)
# where 'mean' is the mean (of course)
# We can draw the regression line on the scatterplot by
# using R's 'abline' function, which takes in the
# intercept and slope as inputs:
abline(b,m)
# From the slope and intercept we can find the residuals
# which are the deviations of each data point from the
# regression line. First we find the values on the
# regression line for each value of x:
yprime <- m*x+b
head(yprime)
# The residuals are the differences between yprime and y:
residuals <- yprime -y
head(residuals)
# The standard error of the estimate is the standard deviation
# of the residuals:
syx <- sqrt(sum(residuals^2)/n)
print(syx)
# The other way to calculate syx is using sy and r:
print(sy*sqrt(1-r^2))
# where 'sqrt' is the square root, and r^2 is r squared.
# Is this the same number? I hope so.
# The regresion line can be used to predict y from x.
# For example, our best guess at what the height of
# the father for a student who's mother's height is 64.5
# inches is:
xplot <-64.5;
yplot <- xplot*m+b;
print(yplot)
# To plot the data point on the graph we'll use the 'points'
# function intead of the 'plot' function. 'plot' will erase
# our current graph and start over.
points(xplot,yplot,
pch = 16, # symbol type. 16 is a filled circle
col = 'red', # color of symbol
cex = 2) # size of symbol