# Script to merge demographic info from Switchboard tables
# into switchboard transcript files.

# This script is meant to be run on patas.
# Call it with one argument which gives the path to the Switchboard
# file you are working with, e.g.,:

# python merge-demo-data-starter.py /corpora/LDC/LDC93T4/trans/phase2/disc01/sw3294.txt

# NB: In this initial state, this script will print an error when
# you run it.  This is not a python error, but a tiny bit of error
# checking built into the script.

import sys
import re

# Hard code the path to the tables:
# Caller table:
CALLER_TABLE_PATH = '/corpora/LDC/LDC93T4/tables/tables/caller.tab'


# Sub-routine for creating string of demographic data, based
# on line from caller table.  Input is a set of fields from
# one line of the caller.tab file.
def create_demo_data(fields):
    """Build the "sex,age,dialect," string for one caller.tab entry.

    `fields` is the list of fields from one line of the caller table.
    In this starter state the fields are ignored and the placeholder
    string "foo," is returned.
    """

    # Store the sex of the speaker in one variable
    # ~*~

    # Store the age of the speaker in another variable
    # (NB: these conversations were recorded in 1990-1991,
    # for simplicity, assume everyone was born on Jan 1, and
    # that all recordings were done in 1990.)
    # ~*~

    # Store the dialect of the speaker in a third variable
    # ~*~

    # The return value should be the sex, age, and dialect
    # information, separated by commas, with one comma at the end
    # ~*~

    ## return "foo," is just a place holder to make sure this version
    ## of the script runs.
    return "foo,"


def main():
    """Merge caller demographics into the transcript named in sys.argv[1].

    Exits with a usage message when the transcript path is missing, and
    with an error message when demographic data could not be found for
    both speakers (which is always the case in this starter state).
    """
    # Fail with a readable message instead of an IndexError traceback
    # when the script is called without its single argument.
    if len(sys.argv) != 2:
        sys.exit("Usage: python merge-demo-data-starter.py <switchboard-transcript-file>")

    # The `with` blocks guarantee that every file is closed, including
    # when sys.exit() is called below.
    # Take file to merge into as input:
    with open(CALLER_TABLE_PATH, 'r') as calltab, open(sys.argv[1], 'r') as infile:

        # First line of file stores the "filename", which gives
        # conversation and speaker numbers.  For example:
        # FILENAME: 3294_1353_1410
        # where 3294 is the conversation number, 1353 is speaker A
        # and 1410 is speaker B.

        # Read in the first line of the file, split it on white space
        # and keep the second element of the resulting list.
        # ~*~

        # Extract the conversation number and the two speaker numbers
        # from that string, and store them in the variables conv, spkrA and spkrB.
        # ~*~

        ## Default value of conv for now so that the script runs:
        conv = '0'

        # Create output file (in local directory) to write to:
        with open('sw' + conv + '.merged.txt', 'w') as outfile:

            # Read through calltab to get demographic data for spkrA and spkrB.
            # Store this data in a comma-separated string: sex,age,dialect,
            spkrA_data = ""
            spkrB_data = ""

            for line in calltab:
                ## The following is just to make the script work for now:
                pass

                # Split the line on ", ", and save the resulting list of fields.
                # ~*~

                # If the first element of the line is the ID for spkrA,
                # call create_demo_data with the fields from the line and
                # store the return value in spkrA_data.
                # ~*~

                # If the first element of the line is the ID for spkrB,
                # call create_demo_data with the fields from the line and
                # store the return value in spkrB_data.
                # ~*~

            # If we get through the whole file and don't find a line for
            # speaker A or speaker B, report an error and quit:
            if not spkrA_data or not spkrB_data:
                sys.exit("Error: Could not find entries for one or both speakers in the caller table.")

            # Go through transcript file (infile) line by line and
            # write modified version to outfile.

            # Variable to track whether we've consumed the header lines
            # yet or not:
            header = True

            for line in infile:
                ## To make the for loop work until it has content:
                pass

                # Check to see if we've found the line with ====,
                # and if so, set header to False.
                # ~*~

                # If header is False (i.e., not True),
                # ~*~

                    # Replace trailing new lines with a single space
                    # (so we can group utterances onto one line)
                    # ~*~

                    # Then check whether the line starts with A or B.
                    # ~*~

                        # If so, split it into two fields (separated by the first ":").
                        # Store the results in two variables, label and transcript.
                        # ~*~

                        # If the label starts with A, create a new line:
                        # spkrA_data + label + ',"' + transcript
                        # (The " is there to group the actual transcript into one
                        # field surrounded by quotation marks; we'll put the other
                        # one in below, once we're sure we've got the whole thing.)
                        # ~*~

                        # If the label starts with B, create a new line:
                        # spkrB_data + label + ',"' + transcript
                        # ~*~

                    # Then check whether the line is only white space.  If so, we've reached
                    # the end of that utterance.  Set the value of the variable
                    # storing the line to a close quote followed by a newline.
                    # ~*~

                    # Write the line to the outfile.
                    # ~*~


if __name__ == "__main__":
    main()