# Descriptive Statistics 1+1
#
# This script will show you how to load in data from your survey
# and explore some of it using basic descriptive statistics like
# measures of central tendency and variability, along with some histograms.

# First we'll clear the workspace and load in the survey data.
# R's function 'read.csv' loads in csv files and, if there are
# headers in the first row, uses the names in the headers to name
# the 'fields' that contain the data.
#
# Caution: 'rm(list = ls())' deletes every variable in your workspace,
# so only run it if you don't need anything that is currently there.
rm(list = ls())
survey <- read.csv("http://www.courses.washington.edu/psy315/datasets/Psych315W19survey.csv")

# We've told 'read.csv' to put all the data into a variable called
# 'survey'.
#
# Survey has a bunch of fields associated with it that correspond
# to your answers to each of the questions.

# For example, I asked you about your heights (in inches).
# This can be found in the field 'height' and can be accessed
# with the dollar sign:
survey$height

# To just look at the first few values, use 'head':
head(survey$height)

# How many students filled out the survey? This can be
# determined with the function 'length' for any of the fields:
length(survey$height)

# Mother's heights are in the field 'mheight':
head(survey$mheight)

# These lists are in the same order across students, so the first
# student in the list (whoever it is) has a height of:
survey$height[1]

# and has a mother of height
survey$mheight[1]

# Here are some basic statistics about your mother's heights:

# mean:
mean(survey$mheight)

# The answer might be 'NA', which means 'not available'. That's
# because there is missing data in the list. Look back and
# you'll see that some of the entries are 'NA'.

# To calculate the mean, while ignoring missing entries, use:
mean(survey$mheight, na.rm = TRUE)

# Another way to deal with missing data is to create a new
# variable without the 'NA's.

# To find the missing data, use 'is.na'
is.na(survey$mheight)
# gives you 'TRUE' for the locations of the missing data.
# To find the data that's NOT missing, use '!' which switches
# TRUE to FALSE and vice versa:
!is.na(survey$mheight)

# Finally, we can use this list to pull out the non-missing
# data, and put it into a new variable 'mheight':
mheight <- survey$mheight[!is.na(survey$mheight)]

# or (even easier):
mheight <- na.omit(survey$mheight)

# 'mean' should now work:
mean(mheight)

# The minimum mother's height is:
min(mheight)

# and the max is
max(mheight)

# (sample) standard deviation:
sd(mheight)

# check by calculating by hand:

# save the mean
m <- mean(mheight)

# save the sample size
n <- length(mheight)

# calculate the sum of squared deviations from the mean:
SS <- sum((mheight - m)^2)

# calculate the sample variance by dividing SS by n-1
v <- SS / (n - 1)

# calculate the standard deviation by taking the square root of the variance
s <- sqrt(v)

# this should match the result of 'sd(mheight)' above:
s

# (sample) variance:
var(mheight)

# median:
median(mheight)

# or equivalently:
quantile(mheight, .5)

# Note the '50%' above the result. That's because 'quantile'
# returns the result as a 'named number'. If you want just
# a regular number without the name, pass the named number through
# 'as.numeric':
as.numeric(quantile(mheight, .5))

# The semi-interquartile range can be calculated using 'quantile'
# (see PercentilePointExample.R):
Q <- as.numeric(quantile(mheight, .75) - quantile(mheight, .25)) / 2
Q

# We can plot a histogram of mother's heights like this
# (See the 'HistogramExample.R' script):

# Define class intervals based on the min and max. We round the min
# down and the max up to whole inches so that the intervals always
# cover every height, even if some heights are not whole numbers
# ('hist' gives an error if the breaks don't span the data).
class.interval <- seq(floor(min(mheight)), ceiling(max(mheight)), 1)

# We turn off the default axes with xaxt = 'n' and yaxt = 'n' ...
hist(mheight,
     main = "Histogram of Mother's Heights",
     xlab = "Height (in)",
     col = "blue",
     xaxt = "n",
     yaxt = "n",
     breaks = class.interval)

# ... and then add our own axes with the 'axis' function.
# Axis 1 is 'x' and 2 is 'y':
axis(1, at = class.interval)
axis(2, at = seq(0, 100, 5), las = 1)

# What is the ratio of males to females in this class?
# Remember, we can ask which students are female with:
survey$gender == "Female"

# The 'sum' function will give you the number of TRUE's
# ('na.rm = TRUE' skips any students with a missing answer):
number.female <- sum(survey$gender == "Female", na.rm = TRUE)
number.female

# Similarly for the males:
number.male <- sum(survey$gender == "Male", na.rm = TRUE)
number.male

# The total number of students is:
total.number <- length(survey$gender)
total.number

# The percent of females is:
100 * number.female / total.number

# The percent of males is:
100 * number.male / total.number

# Note: these may not add up to 100%. This will happen if some
# students choose not to answer that question in the survey.

# Try this on your own: Calculate the percent of left vs. right
# handers in the class.

# How much do you all play video games? That's in the field
# 'games_hours'

# Histogram
# The class intervals run from 0 up to the largest answer, rounded up
# to a whole hour so the intervals cover every value. 'na.rm = TRUE'
# keeps a missing answer from turning the max into 'NA'.
games.breaks <- seq(0, max(1, ceiling(max(survey$games_hours, na.rm = TRUE))), 1)
hist(survey$games_hours,
     main = "Histogram of Video Game Playing",
     xlab = "Hours/day",
     col = "blue",
     breaks = games.breaks)

# Does this distribution look normal? If not,
# is it positively or negatively skewed?

# Does video game playing differ by gender?
# ('na.rm = TRUE' ignores students with missing answers.)
mean(survey$games_hours[survey$gender == "Male"], na.rm = TRUE)
mean(survey$games_hours[survey$gender == "Female"], na.rm = TRUE)

median(survey$games_hours[survey$gender == "Male"], na.rm = TRUE)
median(survey$games_hours[survey$gender == "Female"], na.rm = TRUE)

# Later on this quarter we'll see if this difference
# is 'statistically significant' using something called
# a 't-test'.

# What is the distribution of your favorite colors?
# You'd think we could type 'hist(survey$color)' but
# that doesn't work because 'hist' needs a list of numbers,
# not a list of nominal category names.

# Fortunately, R has a convenient function 'table' that
# tabulates nominal scale data into frequencies:
color.freqs <- table(survey$color)

# Let's look at the result:
color.freqs

# This color.freqs is something called a 'table'. It's
# like a list of numbers, except that the columns have
# names associated with them. In our case, the names
# of the columns are the color names.
# Tables are convenient because they let you keep track of
# what the numbers mean.

# You can send this table into the function 'barplot'
# to make a bar plot of the frequencies:
barplot(color.freqs)

# Got to love your love of 'purple'.

# Want to get fancy? Let's use the option 'col' to color the
# bars by their color names:
barplot(color.freqs,
        col = c("blue", "green", "orange", "pink", "purple", "red", "yellow"))

# Let's compare the frequency distribution of favorite colors
# by gender. First we need to create a table of frequency distributions
# for each gender separately, using the 'table' function like
# before. But this time we'll index into the values for the
# associated gender.
#
# We wrap the colors in 'factor' with the full list of color names
# so that both tables have a column for every color, even one that
# nobody of that gender chose. Otherwise the two tables could have
# different lengths and their columns wouldn't line up.
color.levels <- names(color.freqs)
color.freqs.male <- table(factor(survey$color[survey$gender == "Male"],
                                 levels = color.levels))
color.freqs.female <- table(factor(survey$color[survey$gender == "Female"],
                                   levels = color.levels))

# Next we'll combine these two tables using the 'rbind' function
# which concatenates (or 'binds') rows together into a new table:
color.freqs.both <- rbind(color.freqs.male, color.freqs.female)

# Let's look at this new table:
color.freqs.both

# You can see that it contains a matrix of numbers along with
# names for the rows and columns.

# This table is ready to be plotted using 'barplot'. We'll
# use the option 'beside = TRUE' so the male and female bars
# are plotted next to each other instead of on top of each other.
barplot(color.freqs.both,
        beside = TRUE,
        legend = c("Male", "Female"),
        col = c("Blue", "Red"))

# Do you think there is a difference in the distribution of color
# preference across gender? Later on we'll determine if these two
# distributions are significantly different from each other
# using something called a 'Chi-squared' test.

# As an exercise, see if you can create a histogram of the
# number of birthdays for each month.
# The months you all were born in are in the variable:
survey$month

# As before, we can make a table and plot it as a
# bar plot:
month.table <- table(survey$month)
barplot(month.table)

# By default, table organizes the categories in alphabetical
# order. That's not what we want. Look at the table:
month.table

# You can see it's in alphabetical order. To put the months in
# calendar order, we can tell R what order the categories should
# be in by turning the months into a 'factor'. R has a built-in
# list of the month names, in order, called 'month.name':
month.name

# Using 'month.name' as the 'levels' of the factor makes 'table'
# count the months in calendar order. (A month that nobody was
# born in will show up with a count of zero.)
month.table <- table(factor(survey$month, levels = month.name))

# Now look:
month.table

# It's in the proper order.

# Now the bar plot will look right:
barplot(month.table)

# There are more students born in some months than others.
# Do you think this happened just by chance? Or is this
# distribution particularly unusual? Later on we'll run
# a 'Chi-Squared' test to determine how likely we'd get
# a data set like this by chance.

# Finally, let's find the average amount of sleep for the
# female students that like to sit near the front of the
# class:

# To find these students we need to find those for which
# BOTH their gender is 'Female' AND they like to sit 'Near the front'.
# This requires us to use '&' (and) to compare lists of TRUE
# and FALSE.

# '&' compares two TRUE/FALSE variables and returns 'TRUE' if
# both are TRUE:
TRUE & TRUE
TRUE & FALSE
FALSE & TRUE
FALSE & FALSE

# Here's how to use '&' to find our female students that like
# to sit near the front of the class:
students.gender.front <- survey$gender == "Female" & survey$sit == "Near the front"

# And here's the average amount of sleep they get each night.
# 'which' picks out only the students for which the test is TRUE
# (skipping any with a missing answer), and 'na.rm = TRUE' ignores
# missing sleep values:
mean(survey$sleep[which(students.gender.front)], na.rm = TRUE)

# We use '|' for 'or'.
# '|' returns TRUE if either value is TRUE:
TRUE | TRUE
TRUE | FALSE
FALSE | TRUE
FALSE | FALSE

# Let's use '|' to find the favorite color for students that were either born
# in January or are left-handed:
students.january.left <- survey$month == "January" | survey$hand == "Left"

# 'which' keeps only the students for which the test is TRUE, so a
# missing answer doesn't show up as an 'NA' in the result:
survey$color[which(students.january.left)]