# Prediction.R # Calculating the regression line in R is easy. Here we'll # work through the example in the 'prediction' tutorial # First we'll load in the survey data: survey <-read.csv("http://www.courses.washington.edu/psy315/datasets/Psych315W19survey.csv") # And find the students that chose 'Red' as their favorite color: students.green <- survey\$color=="Green" # to simply things for later, let's define 'x' to be the mother's height # and y be the father's height: x <- survey\$mheight[students.green] y <- survey\$pheight[students.green] # There might be 'NA's in either x or y, so we'll find where they are # and take them out: goodvals <- !is.na(x) & !is.na(y) x <- x[goodvals] y <- y[goodvals] # The scatterplot can now be done by plotting x vs y: plot(x,y, xlab = "Mother's height", ylab = "Father's height", pch = 19, col = "blue", cex = 2) # Now we'll calculate the slope and intercept for the regression line: # We'll need the correlation: r <- cor(x,y,use = "complete.obs") print(r) # The slope, m, of the regression line is: m <- r*sd(y)/sd(x) print(m) # where the 'sd' function is standard deviation. # The intercept, b, is: b <- mean(y) - m*mean(x) print(b) # where 'mean' is the mean (of course) # We can draw the regression line on the scatterplot by # using R's 'abline' function, which takes in the # intercept and slope as inputs: abline(b,m) # From the slope and intercept we can find the residuals # which are the deviations of each data point from the # regression line. First we find the values on the # regression line for each value of x: yprime <- m*x+b head(yprime) # The residuals are the differences between yprime and y: residuals <- yprime -y head(residuals) # The standard error of the estimate is the standard deviation # of the residuals: syx <- sd(residuals) print(syx) # The other way to calculate syx is using sy and r: print(sd(y)*sqrt(1-r^2)) # where 'sqrt' is the square root, and r^2 is r squared. # Is this the same number? I hope so. # The regresion line can be used to predict y from x. # For example, our best guess at what the height of # the father for a student who's mother's height is 64.5 # inches is: xplot <-64.5; yplot <- xplot*m+b; print(yplot) # To plot the data point on the graph we'll use the 'points' # function intead of the 'plot' function. 'plot' will erase # our current graph and start over. points(xplot,yplot, pch = 16, # symbol type. 16 is a filled circle col = 'red', # color of symbol cex = 2) # size of symbol