# Descriptive Statistics
# NOTE(review): the next line only evaluates 1+1 and prints the result; it
# looks like a leftover sanity check rather than part of the lesson - confirm.
1+1
# This script will show you how to load in data from your survey
# and explore some of it using basic descriptive statistics like
# measures of central tendency and variability, along with some histograms.
# First we'll clear the workspace and load in the survey data.
# R's function 'read.csv' loads in csv files and if there are
# headers in the first row uses the names in the headers to name
# the 'fields' that contain the data.
rm(list = ls())
survey <- read.csv(
  file = "http://www.courses.washington.edu/psy315/datasets/Psych315W19survey.csv",
  header = TRUE
)
# Everything 'read.csv' found in the file is now stored in a
# variable called 'survey'.
#
# Survey has a bunch of fields associated with it that correspond
# to your answers to each of the questions.
# For example, I asked you about your heights (in inches).
# This can be found in the field 'height' and can be accessed
# with the dollar sign:
survey$height
# To just look at the first few values, use 'head':
head(survey$height)
# How many students filled out the survey? This can be
# determined with the function 'length' for any of the fields:
length(survey$height)
# Mothers' heights are in the field 'mheight':
head(survey$mheight)
# These lists are in the same order across students, so the first
# student in the list (whoever it is) has a height of:
survey$height[1]
# and has a mother of height:
survey$mheight[1]
# Some basic statistics on your mothers' heights, starting with
# the mean:
mean(x = survey$mheight)
# If the answer is 'NA' ('not available'), that's because the list
# contains missing data. Look back at the values and you'll see
# that some of the entries are 'NA'.
# Tell 'mean' to skip the missing entries with 'na.rm':
mean(x = survey$mheight, na.rm = TRUE)
# Alternatively, we can build a new variable that has no 'NA's in it.
# 'is.na' marks every missing entry with 'TRUE':
is.na(survey$mheight)
# '!' flips TRUE to FALSE (and vice versa), so this marks the
# entries that are NOT missing:
!is.na(survey$mheight)
# Indexing with that list keeps only the non-missing heights,
# which we store in a new variable 'mheight':
mheight <- survey$mheight[!is.na(survey$mheight)]
# 'na.omit' does the same thing in a single step:
mheight <- na.omit(object = survey$mheight)
# 'mean' no longer needs 'na.rm':
mean(x = mheight)
# The shortest mother's height:
min(mheight)
# and the tallest:
max(mheight)
# (sample) standard deviation:
sd(mheight)
# Check by calculating it by hand:
# save the mean
m <- mean(mheight)
# save the sample size
n <- length(mheight)
# calculate the sum of squared deviations from the mean:
SS <- sum((mheight - m)^2)
# calculate the sample variance by dividing SS by n-1
v <- SS / (n - 1)
# calculate the standard deviation by taking the square root of the variance
s <- sqrt(v)
# 's' should match the result of 'sd(mheight)' above:
s
# (sample) variance:
var(mheight)
# which should match the variance 'v' we calculated by hand:
v
# median:
median(mheight)
# or equivalently:
quantile(mheight, .5)
# Note the '50%' above the result. That's because 'quantile'
# returns the result as a 'named number'. If you want just
# a regular number without the name, pass the named number through
# 'as.numeric':
as.numeric(quantile(mheight, .5))
# The semi-interquartile range can be calculated using 'quantile'
# (see PercentilePointExample.R). It is half the distance between
# the 25th and 75th percentiles:
Q <- as.numeric(quantile(mheight, .75) - quantile(mheight, .25)) / 2
Q
# We can plot a histogram of mothers' heights like this
# (See the 'HistogramExample.R' script):
# Define 1-inch class intervals based on the min and max. Rounding
# the min down and the max up guarantees that the intervals cover
# every height, even if some heights are not whole numbers
# ('hist' stops with an error if a value falls outside the breaks).
class.interval <- seq(floor(min(mheight)),
                      ceiling(max(mheight)),
                      1)
# xaxt = "n" and yaxt = "n" turn off the default axes so that we
# can draw our own below.
hist(mheight,
     main = "Histogram of Mother's Heights",
     xlab = "Height (in)",
     col = "blue",
     xaxt = "n",
     yaxt = "n",
     breaks = class.interval
)
# Now add your own axes with the 'axis' function.
# Axis 1 is 'x' and 2 is 'y':
axis(1, at = class.interval)
axis(2, at = seq(0, 100, 5), las = 1)
# What is the ratio of males to females in this class?
# Remember, we can ask which students are female with:
survey$gender == "Female"
# The 'sum' function will give you the number of TRUE's.
# 'na.rm = TRUE' makes 'sum' skip any missing ('NA') answers
# instead of returning 'NA':
number.female <- sum(survey$gender == "Female", na.rm = TRUE)
number.female
# Similarly for the males:
number.male <- sum(survey$gender == "Male", na.rm = TRUE)
number.male
# The total number of students is:
total.number <- length(survey$gender)
total.number
# The percent of females is:
100 * number.female / total.number
# And the percent of males is:
100 * number.male / total.number
# Note: these may not add up to 100%. This will happen if some
# students choose not to answer that question in the survey.
# Try this on your own: Calculate the percent of left vs. right
# handers in the class.
# How much do you all play video games? That's in the field
# 'games_hours'
# Histogram with 1-hour class intervals. 'na.rm = TRUE' makes 'max'
# ignore missing answers, and 'ceiling' rounds the largest value up
# so that the intervals cover every response.
games.breaks <- seq(0, max(1, ceiling(max(survey$games_hours, na.rm = TRUE))), 1)
hist(survey$games_hours,
     main = "Histogram of Video Game Playing",
     xlab = "Hours/day",
     col = "blue",
     breaks = games.breaks
)
# Does this distribution look normal? If not
# is it positively or negatively skewed?
# Does video game playing differ by gender?
# ('na.rm = TRUE' skips students with a missing answer.)
mean(survey$games_hours[survey$gender == "Male"], na.rm = TRUE)
mean(survey$games_hours[survey$gender == "Female"], na.rm = TRUE)
median(survey$games_hours[survey$gender == "Male"], na.rm = TRUE)
median(survey$games_hours[survey$gender == "Female"], na.rm = TRUE)
# Later on this quarter we'll see if this difference
# is 'statistically significant' using something called
# a 't-test'.
# What is the distribution of your favorite colors?
# You'd think we could type 'hist(survey$color)' but
# that doesn't work because 'hist' needs a list of numbers,
# not a list of nominal category names.
# Fortunately, R has a convenient function 'table' that
# tabulates nominal scale data into frequencies:
color.freqs <- table(survey$color)
# Let's look at the result:
color.freqs
# This color.freqs is something called a 'table'. It's
# like a list of numbers, except that the columns have
# names associated with them. In our case, the names
# of the columns are the color names.
# Tables are convenient because they
# let you keep track of what the numbers mean.
# You can send this table into the function 'barplot'
# to make a bar plot of the frequencies:
barplot(color.freqs)
# Got to love your love of 'purple'.
# Want to get fancy? Let's use the option 'col' to color the
# bars by their color names.
# NOTE(review): this list assumes the table holds exactly these seven
# colors, in this (alphabetical) order - check against the printed table.
barplot(color.freqs,
        col = c("blue", "green", "orange", "pink", "purple", "red", "yellow"))
# Let's compare the frequency distribution of favorite colors
# by gender. First we need to create a table of frequencies
# for each gender separately, using the 'table' function like
# before. But this time we'll index into the values for the
# associated gender.
# If one gender never picked a particular color, its table would be
# missing that column and the two tables would no longer line up.
# To avoid that we list all of the colors once and use 'factor' to
# make both tables count the same set of colors (a color nobody in
# that group picked gets a count of 0):
color.levels <- sort(unique(as.character(survey$color)))
color.freqs.male <- table(factor(survey$color[survey$gender == "Male"],
                                 levels = color.levels))
color.freqs.female <- table(factor(survey$color[survey$gender == "Female"],
                                   levels = color.levels))
# Next we'll combine these two tables using the 'rbind' function
# which concatenates (or 'binds') rows together into a new table:
color.freqs.both <- rbind(color.freqs.male, color.freqs.female)
# Let's look at this new table:
color.freqs.both
# You can see that it contains a matrix of numbers along with
# names for the rows and columns.
# This table is ready to be plotted using 'barplot'. We'll
# use the option 'beside = TRUE' so the male and female bars
# are plotted next to each other instead of on top of each other.
barplot(color.freqs.both,
        beside = TRUE,
        legend.text = c("Male", "Female"),
        col = c("Blue", "Red"))
# Do you think there is a difference in the distribution of color
# preference across gender? Later on we'll determine if these two
# distributions are significantly different from each other
# using something called a 'Chi-squared' test.
# As an exercise, see if you can create a histogram of the
# number of birthdays for each month. The months you all were
# born in are in the variable:
survey$month
# As before, we can make a table and plot it as a
# bar plot:
month.table <- table(survey$month)
barplot(month.table)
# By default, table organizes the categories in alphabetical
# order. That's not what we want. Look at the table:
month.table
# You can see it's in alphabetical order (April, August,
# December, February, January, and so on).
# To put the months in calendar order we can tell 'table' which
# order to use. R has a built-in list of the month names,
# 'month.name', that is already in calendar order. Turning the
# months into a 'factor' with those levels makes 'table' count them
# in that order (a month with no birthdays gets a count of 0):
month.table <- table(factor(survey$month, levels = month.name))
# Now look:
month.table
# It's in the proper order.
# Now the bar plot will look right:
barplot(month.table)
# There are more students born in some months than others.
# Do you think this happened just by chance? Or is this
# distribution particularly unusual? Later on we'll run
# a 'Chi-Squared' test to determine how likely we'd get
# a data set like this by chance.
# Last example: what is the average amount of sleep for the
# female students who like to sit near the front of the class?
# These are the students for which BOTH of these are true:
# their gender is 'Female' AND they sit 'Near the front'.
# Combining two lists of TRUE and FALSE this way is done with
# '&' (and), which returns 'TRUE' only where both sides are TRUE:
TRUE & TRUE
TRUE & FALSE
FALSE & TRUE
FALSE & FALSE
# Applied to the survey, '&' marks the female students that like
# to sit near the front of the class:
students.gender.front <- (survey$gender == "Female") &
  (survey$sit == "Near the front")
# Their average amount of sleep each night, skipping missing answers:
mean(x = survey$sleep[students.gender.front], na.rm = TRUE)
# We use '|' for 'or'. This returns TRUE if either side is TRUE:
TRUE | TRUE
TRUE | FALSE
FALSE | TRUE
FALSE | FALSE
# Let's use '|' to find the favorite color for students that were either born
# in January or are left-handed:
students.january.left <- survey$month == "January" | survey$hand == "Left"
survey$color[students.january.left]