#!/usr/bin/python
# Tokenizer -- Linguistics 472, Autumn 2003, Assignment 1

# This python script (when completed) will take an ordinary text
# file as input and create an output file with the text, segmented
# into one sentence per line.  Run it by typing
#     python tokenizer.py INPUTFILE
# from command prompt or a unix shell.  The output is written to
# INPUTFILE.out.

# The program checks whether the output file already exists, and
# if so, asks for confirmation from the user.  It then reads in
# the input file one line at a time, modifies those lines by
# removing existing newlines and putting in just those we need,
# and prints them to the output file.

# The basic script provided as part of the assignment takes care
# of input and output, gives one example regular expression, and
# provides a series of comments (preceded by #) indicating what
# you should add.

# ----------------------------------------------------------------

import os
import re
import sys

# raw_input only exists on Python 2; on Python 3 the same behaviour is
# provided by input.  Bind whichever is available so the script runs
# under either interpreter.
try:
    _read_answer = raw_input
except NameError:
    _read_answer = input


def segment_line(line):
    """Return LINE rewritten for the sentence-per-line output file.

    As provided, this only replaces newlines with spaces; the comments
    below mark the substitutions still to be added for the assignment.
    """
    # Replace all existing newlines with a space
    line = re.sub(r"\n", " ", line)

    # Replace all .!?: that aren't sentence-final with distinctive
    # strings, e.g., KEEPTHISPERIOD.

    # Change .!?: followed by a " to a special symbol, e.g., PERIODQUOTE.
    # NB " is a special character in Python, and needs to be escaped.

    # Change any space following a remaining .!?: to a newline.

    # Replace all KEEPTHISPERIOD etc. to the punctuation they represent.

    # Replace all PERIODQUOTE with the punctuation and a newline:

    # The following is for beautification of the output, and to make
    # it easier to compare with the goldstandard (e.g., using diff at
    # command line (Mac, Dante), or ediff in emacs).

    # Remove any line-initial spaces:

    # Remove any spaces before a new line:

    # Compress a sequence of spaces into just one:

    return line


def _confirm_overwrite(out_path):
    """Ask the user whether OUT_PATH may be overwritten; return True for yes.

    Only an answer that *starts* with y/Y counts as yes, so replies such
    as "no way" are not mistaken for consent because they contain a "y".
    """
    answer = _read_answer(out_path + " already exists. Continue anyway? [yn]")
    return re.match(r"\s*[Yy]", answer) is not None


def main(argv=None):
    """Segment the file named by the first argument into <name>.out.

    ARGV is the list of command-line arguments without the program
    name; it defaults to sys.argv[1:].  Exits via sys.exit if no input
    file is given or if the user declines to overwrite existing output.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit("usage: python tokenizer.py INPUTFILE")

    in_path = argv[0]
    out_path = in_path + ".out"

    # Open the input file (the with-blocks close both files even if an
    # error occurs part-way through):
    with open(in_path, 'r') as infile:

        # Check if the output file already exists, and ask for
        # confirmation before overwriting it.  Each print call takes one
        # parenthesised string, which prints identically on Python 2 and 3.
        if os.path.exists(out_path):
            if _confirm_overwrite(out_path):
                print("Okay, continuing and overwriting output file...\n")
            else:
                print("Okay, quitting...\n")
                sys.exit()

        # Open output file:
        with open(out_path, 'w') as outfile:

            # Main loop:
            for line in infile:
                # Print the refurbished line to the output.
                outfile.write(segment_line(line))


if __name__ == "__main__":
    main()