Linguistics 473
Computational Linguistics Fundamentals
Summer 2017

## Project 5

### Naïve Bayesian Classifier: Language ID

My implementation treats unknown words as having a count of one less than that of the least commonly seen word, or a count of 1 if the least commonly seen word itself has a count of 1.
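For example, under this scheme (the counts here are hypothetical):
```
# hypothetical word counts illustrating the UNK count scheme
counts = {'the': 10, 'cat': 3}
lowest = min(counts.values())                 # 3
unk_count = 1 if lowest == 1 else lowest - 1  # 2
total = sum(counts.values()) + unk_count      # 13 + 2 = 15
p_unk = unk_count / total                     # 2/15, about 0.133
```
The full implementation: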
```
import os
import re
import sys
import math

unk = '__UNK__'
reg_punc = re.compile(r'''[.,?"'!$<>():;-]''')

# expected args:
# arg1 = language model directory
# arg2 = test/train file
def main():
    models = dict()
    for fi in os.listdir(sys.argv[1]):
        lang_name = fi[0:fi.index('.')]
        models[lang_name] = create_model(os.path.join(sys.argv[1], fi))
    for line in open(sys.argv[2]):
        # the text to judge follows the tab-separated label
        judge = line[line.index('\t')+1:]
        probabilities = dict()
        best_val = float('-inf')
        best = None
        for model in models.keys():
            val = calc_logprob(models[model], judge)
            probabilities[model] = val
            if val > best_val:
                best_val = val
                best = model
        print_prediction(line, probabilities, best)

def print_prediction(line, probabilities, best):
    print(line, end='')  # line already ends with a newline
    for model in probabilities.keys():
        print(model + '\t' + str(probabilities[model]))
    print('result:\t' + best + '\n')

def create_model(filename):
    sum_count = 0
    lowest_count = sys.maxsize
    model = dict()
    for line in open(filename, 'r'):
        if line.strip() == '':
            continue
        ls = line.split()
        val = int(ls[1])
        sum_count += val
        model[ls[0]] = val
        if lowest_count > val:
            lowest_count = val
    # insert an unk value: one less than the lowest observed count,
    # or 1 if the lowest count is itself 1
    if lowest_count == 1:
        model[unk] = lowest_count
        sum_count += lowest_count
    else:
        model[unk] = lowest_count - 1
        sum_count += lowest_count - 1
    # normalize counts into probabilities
    for key in model.keys():
        model[key] = float(model[key]) / sum_count
    return model

def calc_logprob(model, text):
    logprob = 0
    for word in split_text(text):
        if word in model:
            # pluck the word out
            logprob += math.log(model[word], 10)
        else:
            # probability of UNK
            logprob += math.log(model[unk], 10)
    return logprob

def split_text(text):
    return reg_punc.sub(' ', text).split()

if __name__ == "__main__":
    main()
```
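Assuming the script is saved as langid.py (the script and test-file names here are placeholders), it is invoked with the model directory and the file of sentences to classify:
```
python3 langid.py /dropbox/17-18/473/project5/language-models/ test.txt
```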
For the extra credit, I assumed that if the best logprob was below -110, the sentence matched no known language. This heuristic has problems, however, since the logprob depends on sentence length, among other things (a length-normalized variant is sketched after the code):
```
        if best_val < -110:
            print_prediction(line, probabilities, 'unknown')
        else:
            print_prediction(line, probabilities, best)
```
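A sketch of a length-normalized variant (not part of my submission; the -4.0 per-token cutoff is illustrative, not tuned) would divide the best logprob by the token count before thresholding:
```
        # average logprob per token is less sensitive to sentence length
        tokens = split_text(judge)
        avg_logprob = best_val / max(len(tokens), 1)
        if avg_logprob < -4.0:  # illustrative threshold, not tuned
            print_prediction(line, probabilities, 'unknown')
        else:
            print_prediction(line, probabilities, best)
```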