{\rtf1\ansi\ansicpg1252\cocoartf1038\cocoasubrtf360
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
\margl1440\margr1440\vieww10820\viewh14480\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\ql\qnatural\pardirnatural
\f0\fs24 \cf0 \
#This exercise uses two datafiles: \
# "Labov-1968-table.txt", containing a short table of percentages of post-vocalic (r)\
#production for three department stores, and "Labov-1968-raw-data.txt"\
#containing all tokens for all speakers observed in the study\
\
#STRUCTURE OF "Labov-1968-raw-data.txt"\
#The dependent variable "r-present" has two levels (1 for yes, 0 for no).\
#Therefore, this is a dataset of categorical data, all of type char. \
#Columns in the textfile include:\
#Column 1: token (1-727)\
#Column 2: store (Saks, Macys, Klein)\
#Column 3: style (casual, emphatic)\
#Column 4: word (4th, floor)\
#Column 5: r_present (1,0)\
\
#STRUCTURE OF "Labov-1968-table.txt"; two categorical independent variables, 4 continuous\
#variables with data about the dependent variable\
#Column 1: store (3 levels: Saks, Macys, Klein)\
#Column 2: age (3 levels: 15-30, 35-50,55-70)\
#Column 3: all.r-1 (continuous)\
#Column 4: percent.all.r-1 (continuous)\
#Column 5: some.r-1 (continuous)\
#Column 6: percent.some.r-1 (continuous)\
\
\
#STEP1: create a simple barplot for Labov-1968-table.txt, investigating occurrence of \
#postvocalic-(r). \
#load Labov-1968-table.txt. \
#read tabular data into a variable called "dataset1", using a function allowing user to select \
#a file from browser window:\
#file.choose() opens a file browser; read.delim() reads the chosen tab-delimited file\
dataset1 <- read.delim(file = file.choose(), sep = "\\t")\
summary(dataset1)\
\
#first, we will create a barplot using data from just one column.\
#The column is selected by position (column 4 = percent.all.r-1) because R parses \
#dataset1$percent.all.r-1 as (dataset1$percent.all.r) - 1, i.e. it subtracts 1 from \
#the values instead of treating "-1" as part of the column name.\
#col= needs colour numbers, not store names, so the stores are converted to factor codes. \
#Factor levels sort alphabetically, so (assuming the spellings Klein, Macys, Saks) \
#Klein=1, Macys=2, Saks=3.\
store.colours <- as.integer(factor(dataset1$store))\
barplot(dataset1[[4]], col = store.colours, names.arg = dataset1$age)\
#add legend; the fill numbers are the factor codes of the stores named in legend=\
legend("topright", fill = c(3, 2, 1), legend = c("Saks", "Macy's", "Klein"))\
\
#STEP2: create a stacked barplot for Labov-1968-table.txt\
#it will have the percent.all.r-1 data from column 4, with the percent.some.r-1 data \
#from column 6 stacked on top:\
#create a matrix of the data we'll stack (one matrix column per variable)\
dataset1.percent <- cbind(dataset1[[4]], dataset1[[6]])\
#the columns are selected by position: writing dataset1$percent.all.r-1 would be parsed \
#by R as (dataset1$percent.all.r) - 1, a subtraction, not a column name.\
#To select by name instead, first check names(dataset1) for the names R actually assigned.\
dataset1.percent\
#for the barplot to display properly, we must use barplot(t()) to transpose the rows and columns:\
barplot(t(dataset1.percent), names.arg = dataset1$age, ylab = "% occurrence")\
grid()\
\
#STEP3: Generate a Boxplot of Labov-1968-table.txt\
#columns 4 and 6 hold percent.all.r-1 and percent.some.r-1; they are selected by position \
#because R parses dataset1$percent.all.r-1 as (dataset1$percent.all.r) - 1, a subtraction\
boxplot(dataset1[[4]], dataset1[[6]], ylab = "%", notch = TRUE,\
        names = c("all r-1", "some r-1"))\
\
#STEP4:Generate a Histogram of Labov-1968-raw-data.txt\
#read in the raw datafile with one word per row ("fourth floor" is a complete utterance,\
#but is divided across two rows, one for each token of (r)) \
dataset2 <- read.delim(file = file.choose(), sep = "\\t")\
#explore the dataset, to ensure it was read properly\
dataset2\
#create histogram, using r_present as the dependent variable to count\
hist(dataset2$r_present)\
#note: R treats the 1s and 0s as if the values were numeric. This variable is nominal (the values are \
#simply category labels). So, R acts as if there can be intermediate observations between 0 and 1.\
#We set the "breaks" argument explicitly so that there are exactly two bins, one centred on 0 and \
#one centred on 1. (A single number such as breaks=2 is only a suggestion to hist().)\
r.breaks <- c(-0.5, 0.5, 1.5)\
hist(dataset2$r_present, breaks = r.breaks)\
#Looking at the y-axis, we see that R is counting each observation independently, not telling us its\
#contribution to the total distribution. To change from counts to proportions, we reset the \
#"freq" argument to FALSE, so that R plots densities instead of counts. Because each bin is \
#exactly 1 unit wide, the density of a bin equals the proportion of observations in it.\
hist(dataset2$r_present, breaks = r.breaks, freq = FALSE)\
\
\
\
\
}