"""Linguistics 473
Computational Linguistics Fundamentals
Summer 2017

Project 2


Project Description

Count unigrams in a corpus
"""

import os
import re

# Patterns used to normalise raw text; compiled once and shared by tokenize().
tags = re.compile(r"<.*?>")  # anything enclosed in angle brackets on one line
invalid_chars = re.compile(r"[^a-z']")  # all but lowercase letters/apostrophes
invalid_apostrophe_start = re.compile(r" '+|^'+")  # apostrophes opening a token
invalid_apostrophe_end = re.compile(r"'+ |'+$")  # apostrophes closing a token

# Directory whose files are counted when this file is run as a script.
CORPUS_DIR = '/corpora/LDC/LDC02T31/nyt/2000'

# Word -> occurrence count; populated only when the file is run as a script.
dictionary = dict()


def tokenize(text):
    """Return the list of unigrams found in *text*.

    The text is lower-cased, angle-bracket tags are removed, every character
    other than a-z and the apostrophe becomes a space, and apostrophes at the
    start or end of a token are dropped (word-internal ones, as in "cat's",
    are kept). The result is split on whitespace.
    """
    text = text.lower()
    # Order matters: tags must go before '<' and '>' are blanked out, and the
    # apostrophe patterns rely on spaces being the only separators left.
    for pattern in (tags, invalid_chars,
                    invalid_apostrophe_start, invalid_apostrophe_end):
        text = pattern.sub(' ', text)
    return text.split()


def count_unigrams(directory, counts=None):
    """Count the unigrams of every file directly inside *directory*.

    directory -- path of the directory to read; each entry is opened as a
                 text file.
    counts    -- optional dict to accumulate into (updated in place); a new
                 dict is created when omitted.

    Returns the dict mapping each word to its number of occurrences.
    """
    if counts is None:
        counts = dict()
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # first-seen order of words (and so the order of ties) reproducible.
    for name in sorted(os.listdir(directory)):
        # The context manager closes each file as soon as it has been read,
        # instead of leaking one handle per corpus file.
        with open(os.path.join(directory, name), 'r') as handle:
            text = handle.read()
        for word in tokenize(text):
            counts[word] = counts.get(word, 0) + 1
    return counts


def sort_by_frequency(counts):
    """Return the (word, count) pairs of *counts*, most frequent first."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def print_counts(counts):
    """Print one "word<TAB>count" line per word, most frequent first."""
    for word, count in sort_by_frequency(counts):
        print(word + "\t" + str(count))


if __name__ == "__main__":
    count_unigrams(CORPUS_DIR, dictionary)
    print_counts(dictionary)