# Script to merge demographic info from Switchboard tables
# into switchboard transcript files.  

# This script is meant to be run on patas.
# Call it with one argument which gives the path to the Switchboard
# file you are working with, e.g.,:

# python merge-demo-data-starter.py /corpora/LDC/LDC93T4/trans/phase2/disc01/sw3294.txt

# NB: In this initial state, this script will print an error when
# you run it.  This is not a python error, but a tiny bit of error
# checking built into the script.

import sys
import re

# Sub-routine for creating string of demographic data, based
# on line from caller table. Input is a set of fields from
# one line of the caller.tab file.

def create_demo_data(fields):
    """Build the demographic-data string for one speaker.

    `fields` holds the fields taken from one line of the caller table.

    The finished function is meant to return the speaker's sex, age and
    dialect separated by commas, with one comma at the end.  In this
    starter version `fields` is not examined and a fixed placeholder
    string is returned; each ~*~ marker below shows where code still
    has to be written.
    """

    # Step 1: store the sex of the speaker in one variable.
    # ~*~

    # Step 2: store the age of the speaker in another variable.
    # (NB: these conversations were recorded in 1990-1991;
    # for simplicity, assume everyone was born on Jan 1, and
    # that all recordings were done in 1990.)
    # ~*~

    # Step 3: store the dialect of the speaker in a third variable.
    # ~*~

    # Step 4: the return value should be the sex, age, and dialect
    # information, separated by commas, with one comma at the end.
    # ~*~

    ## The value below is only a stand-in so that this version of the
    ## script runs; replace it once the steps above are filled in.
    placeholder = "foo,"

    return placeholder

# The script needs exactly one piece of information from the command
# line: the path of the transcript to merge into.  Check for it up
# front so that a missing argument produces a readable message rather
# than an IndexError traceback.

if len(sys.argv) < 2:

    sys.exit("Error: Expected one argument: the path to a Switchboard transcript file.")

# Hard code the path to the tables:

# Caller table:

calltab = open('/corpora/LDC/LDC93T4/tables/tables/caller.tab','r')

# Take file to merge into as input:

infile = open(sys.argv[1],'r')

# First line of file stores the "filename", which gives 
# conversation and speaker numbers.  For example:

# FILENAME:    3294_1353_1410

# where 3294 is the conversation number, 1353 is speaker A
# and 1410 is speaker B.

# Read in the first line of the file, split it on white space
# and keep the second element of the resulting list.
# ~*~

# Extract the conversation number and the two speaker numbers
# from that string, and store them in the variables conv, spkrA and spkrB.
# ~*~

## Stand-in conversation number.  It is only here so that the starter
## script runs before the real value is extracted from the FILENAME line.

conv = '0'


# Open the merged-output file for writing.  The name has no directory
# part, so the file is created in the directory the script is run from.

outfile = open(''.join(['sw', conv, '.merged.txt']), mode='w')

# Read through calltab to get demographic data for spkrA and spkrB.
# Store this data in a comma-separated string: sex,age,dialect,

spkrA_data = ""
spkrB_data = ""

# Iterate over the file object itself: it hands back one line at a
# time, so the table never has to be read into a list first.

for line in calltab:

    ## The following is just to make the script work for now:
    pass

    # Split the line on ", ", and save the resulting list of fields.
    # ~*~

    # If the first element of the line is the ID for spkrA,
    # call create_demo_data with the fields from the line and
    # store the return value in spkrA_data.
    # ~*~


    # If the first element of the line is the ID for spkrB,
    # call create_demo_data with the fields from the line and
    # store the return value in spkrB_data.
    # ~*~


# The caller table is not needed past this point, so release its
# file handle.

calltab.close()

# If we get through the whole file and don't find a line for
# speaker A or speaker B, report an error and quit:

if not spkrA_data or not spkrB_data:

    sys.exit("Error: Could not find entries for one or both speakers in the caller table.")

# Go through transcript file (infile) line by line and
# write modified version to outfile.

# Variable to track whether we've consumed the header lines
# yet or not:

header = True

# Iterate over the file object itself: it hands back one line at a
# time, so the transcript is streamed instead of being read into a
# list first.

for line in infile:

    ## To make the for loop work until it has content:
    pass

    # Check to see if we've found the line with ====,
    # and if so, set header to False.
    # ~*~

    # If header is False (i.e., not True), 
    # ~*~

        # Replace trailing newlines with a single space 
        # (so we can group utterances onto one line)
        # ~*~

        # Then check whether the line starts with A or B. 
        # ~*~

            # If so, split it into two fields (separated by the first ":").
            # Store the results in two variables, label and transcript.
            # ~*~

            # If the label starts with A, create a new line:
            # spkrA_data + label + ',"' + transcript
            # (The " is there to group the actual transcript into one
            # field surrounded by quotation marks; we'll put the other
            # one in below, once we're sure we've got the whole thing.)
            # ~*~

            # If the label starts with B, create a new line:
            # spkrB_data + label + ',"' + transcript
            # ~*~

    # Then check whether the line is only white space.  If so, we've reached
    # the end of that utterance.  Set the value of the variable
    # storing the line to a close quote followed by a newline.
    # ~*~
    
    # Write the line to the outfile.
    # ~*~

# Every line has been handled: close both files so that anything
# written to outfile is flushed to disk and the handles are released.

infile.close()
outfile.close()