# ScatterplotsAndCorrelations.R
#
# Start by reading the class survey (a CSV file on the course website)
# into a data frame called 'survey'
survey <- read.csv(
  file = "http://www.courses.washington.edu/psy315/datasets/Psych315W21survey.csv"
)
# Scatterplots can be made with R's 'plot' function
#
# Example: a scatterplot of mothers' vs. fathers' heights, restricted to
# the students who chose 'Green' as their favorite color.
#
# Step 1: build a logical index. The '==' operator compares each entry of
# survey$color against "Green", one element at a time, so the result is
# the same length as survey$color.
students.green <- (survey$color == "Green")
# Peek at the first few entries of the index:
head(x = students.green)
# Step 2: place the index inside '[ ]' to pick out the matching entries of
# another column. For example, these are the mothers' heights for the
# students that prefer "Green":
survey$mheight[students.green]
# Step 3: pass the two subsetted columns to 'plot' (x first, then y):
plot(x = survey$mheight[students.green],
     y = survey$pheight[students.green])
# The default labels and symbols are kind of ugly. We can
# customize our plot by setting parameters like 'xlab' and 'ylab'
# for the x and y labels, 'col' for the color, 'pch' for the plotting
# symbol (19 is a solid filled circle), 'asp' (set to 1 so that the x and
# y axes use the same scale), and 'cex', which scales the symbol size
# ('cex' is short for 'character expansion').
plot(
  x    = survey$mheight[students.green],
  y    = survey$pheight[students.green],
  xlab = "Mother's Height",
  ylab = "Father's Height",
  col  = "blue",
  pch  = 19,  # solid filled circle
  cex  = 2,   # draw the symbols at twice the default size
  asp  = 1    # one unit on x spans the same distance as one unit on y
)
# Search the web for more hints
# For different symbols (or setting for 'pch') see, for example:
# http://www.endmemo.com/program/R/pchsymbols.php
# Computing correlations in R uses the 'cor' function.
#
# To calculate the correlation between x and y, just use 'cor(x,y)'
#
# Correlation between the two variables in the scatterplot
# (mothers' vs. fathers' heights for the students who chose "Green"):
cor(
  x   = survey$mheight[students.green],
  y   = survey$pheight[students.green],
  use = "complete.obs"
)
# Next we'll calculate the correlations from the survey that are
# given at the end of the correlation tutorial
# To calculate the correlation between the mean of the parents' heights
# and the student's height, we first average the father's height
# (survey$pheight) and the mother's height (survey$mheight), element by
# element. We'll call the result parent.height:
parent.height <- (survey$pheight + survey$mheight) / 2
# Then we use 'cor' to calculate the correlation between parent.height
# and the students' heights (survey$height):
cor(parent.height, survey$height, use = "complete.obs")
# Some of our survey data has missing values, which R displays as NA
# ('not available') or NaN ('not a number'). The third argument
# 'use = "complete.obs"' deals with this by telling 'cor' to only include
# the cases for which both observations are present (complete).
# Next we'll calculate the correlation between the Father's height
# and the male student's height. This will require us to include
# a subset of the data (Male students) in our analysis.
#
# This can be done by referring to the subset of the data for which
# survey$gender is equal to "Male":
# Logical index that is TRUE for the students whose survey$gender is "Male"
male.students <- survey$gender == "Male"
head(male.students)
# The heights of the male students are:
male.heights <- survey$height[male.students]
# The heights of the fathers of the male students are:
male.heights.father <- survey$pheight[male.students]
# and the correlation between the male students' heights and their
# fathers' heights is:
cor(male.heights, male.heights.father, use = "complete.obs")
# Verify that the following computes the correlation between the
# female students' heights and their mothers' heights:
female.students <- survey$gender == "Female"
cor(survey$height[female.students], survey$mheight[female.students],
    use = "complete.obs")
# The remaining correlations from the tutorial, each computed on the
# complete observations only:
cor(x = survey$GPA_HS, y = survey$GPA_UW, use = "complete.obs")
cor(x = survey$caffeine, y = survey$sleep, use = "complete.obs")
cor(x = survey$caffeine, y = survey$drink, use = "complete.obs")
cor(x = survey$temperature, y = survey$games_hours, use = "complete.obs")
# You can use 'cor' to calculate a whole matrix of pairwise correlations.
# This is done by creating a 'data frame' containing the variables
# of interest. For example, let's look at the pairwise correlations
# between birth year, drinks/week, sleep, and caffeine:
X <- data.frame(survey$year, survey$drink, survey$sleep, survey$caffeine)
# Passing the whole data frame to 'cor' returns one correlation for every
# pair of columns:
cor(X, use = "complete.obs")
# Notice that the diagonal values are all '1.000000'. That's
# because the correlation of a variable with itself is 1.0
# 'cor' can also compute the Spearman correlation: set the 'method'
# parameter to "spearman" in place of its default, "pearson":
cor(
  x      = survey$mheight,
  y      = survey$pheight,
  method = "spearman",
  use    = "complete.obs"
)