Linguistics 473
Computational Linguistics Fundamentals
Summer 2017

Project 5

Project Description

Naïve Bayesian Classifier: Language ID

My implementation treats unknown words as having a count one less than that of the least commonly seen word, or a count of 1 if the least commonly seen word itself has a count of 1.
import os
import re
import sys
import math

# Reserved model key under which each model stores the value used for
# words it has never seen.
unk = '''__UNK__'''
# Punctuation characters that split_text replaces with a space before
# splitting the text on whitespace.
reg_punc = re.compile('''[.,?"'!$<>():;-]''')

# expected args:
# arg1 = language model directory
# arg2 = test/train file
def main():
  """Classify every line of the test file against all language models.

  Command-line arguments:
    sys.argv[1] -- directory holding one language-model file per language;
                   the language name is the file name up to its first '.'
    sys.argv[2] -- test/train file; the text to judge is whatever follows
                   the first tab on each line (the whole line if there is
                   no tab)

  For each non-blank line, the per-language log probabilities and the
  best-scoring language are printed via print_prediction.
  """
  model_dir = sys.argv[1]
  models = dict()
  # read in models from the directory that was actually listed, rather
  # than from a fixed location that may not match sys.argv[1]
  for fi in os.listdir(model_dir):
    # split() keeps the whole name when the file has no extension
    lang_name = fi.split('.')[0]
    models[lang_name] = create_model(os.path.join(model_dir, fi))
  with open(sys.argv[2]) as test_file:
    for line in test_file:
      if not line.strip():
        # nothing to classify on a blank line
        continue
      _, tab, rest = line.partition('\t')
      judge = rest if tab else line
      probabilities = dict()
      best_val = float('-inf')
      best = None
      for lang, model in models.items():
        val = read_model(model, judge)
        probabilities[lang] = val
        # strict comparison: on a tie the first language seen wins
        if val > best_val:
          best_val = val
          best = lang
      print_prediction(line, probabilities, best)

def print_prediction(line, probabilities, best):
  """Print one classification report.

  Emits the input line, then one 'language<TAB>score' row per entry of
  probabilities, then a 'result:<TAB>best' row followed by a blank line.
  """
  print (line)
  for lang, score in probabilities.items():
    print ('\t'.join((lang, str(score))))
  print ('result:\t' + best + '\n')

def create_model(filename):
  """Build a unigram probability model from a word-count file.

  Each non-blank line of the file must start with a word followed by its
  integer count, separated by whitespace. Blank lines are skipped.

  An extra entry keyed by the module-level `unk` stands in for unseen
  words. Its count is one less than the smallest count in the file, but
  never less than 1, so its probability is always positive and its
  logarithm is defined. An empty file yields a model containing only the
  `unk` entry.

  Returns a dict mapping each word (and `unk`) to its count divided by the
  total of all counts, including the `unk` count.

  Raises IndexError or ValueError if a non-blank line lacks an integer
  count in its second field.
  """
  sum_count = 0
  lowest_count = None
  model = dict()
  with open(filename, 'r') as model_file:
    for line in model_file:
      ls = line.split()
      if not ls:
        continue
      val = int(ls[1])
      sum_count += val
      model[ls[0]] = val
      if lowest_count is None or val < lowest_count:
        lowest_count = val
  # insert an unk value: one below the rarest word, floored at 1
  if lowest_count is None:
    unk_count = 1
  else:
    unk_count = max(lowest_count - 1, 1)
  model[unk] = unk_count
  sum_count += unk_count
  # turn raw counts into relative frequencies
  for key in list(model):
    model[key] = float(model[key]) / sum_count
  return model

def read_model(model, text):
  """Return the base-10 log probability of text under a unigram model.

  text is tokenized with split_text. Each token found in model contributes
  the log of its own probability; every other token contributes the log of
  the probability stored under the module-level `unk` key.

  Raises KeyError if an unseen token is met and model has no `unk` entry.
  """
  logprob = 0
  for word in split_text(text):
    if word in model:
      # pluck the word out
      logprob += math.log(model[word], 10)
    else:
      # probability of UNK
      logprob += math.log(model[unk], 10)
  return logprob

def split_text(text):
  """Tokenize text: blank out punctuation, then split on whitespace."""
  without_punctuation = reg_punc.sub(' ', text)
  tokens = without_punctuation.split()
  return tokens

# Run the classifier only when executed as a script, not when imported.
if __name__ == "__main__":
  main()
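For the extra credit, I assumed that if the best logprob was less than -110, it was too low for the sentence to belong to any of the known languages. This approach has problems, however: the length of the sentence, for instance, directly affects the logprob, so a fixed threshold penalizes long sentences: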
    # Report 'unknown' when even the best model scores below the cutoff;
    # otherwise report the best-scoring language.
    if best_val < (-110):
      print_prediction(line, probabilities, 'unknown')
    else:
      print_prediction(line, probabilities, best)