# Ling/CSE 472, Spring 2009
# Assignment 3
# 36tags.py
"""Train and evaluate a bigram tagger on a simplified Penn Treebank tag set.

The NLTK treebank sample is split 90/10 into training and test sentences,
every tag is rewritten through ``tag_map``, and a bigram tagger (backing off
to a unigram tagger, then to a default 'NN' tagger) is trained on the
training portion.  The tagger's accuracy on the test portion is printed.
"""

# Mapping from the original tags to a simpler set of tags.  All noun tags
# collapse to 'NN', all verb tags to 'VB', and the possessive pronoun tags
# ('PRP$', 'WP$') to 'DT'; every other tag maps to itself.  The 46 keys
# map onto 36 distinct values.
tag_map = {
    'CC': 'CC', 'CD': 'CD', 'DT': 'DT', 'EX': 'EX', 'FW': 'FW',
    'IN': 'IN', 'JJ': 'JJ', 'JJR': 'JJR', 'JJS': 'JJS', 'LS': 'LS',
    'MD': 'MD',
    'NN': 'NN', 'NNP': 'NN', 'NNPS': 'NN', 'NNS': 'NN',
    'PDT': 'PDT', 'POS': 'POS', 'PRP': 'PRP', 'PRP$': 'DT',
    'RB': 'RB', 'RBR': 'RBR', 'RBS': 'RBS', 'RP': 'RP', 'SYM': 'SYM',
    'TO': 'TO', 'UH': 'UH',
    'VB': 'VB', 'VBD': 'VB', 'VBG': 'VB', 'VBN': 'VB', 'VBP': 'VB',
    'VBZ': 'VB',
    'WDT': 'WDT', 'WP': 'WP', 'WP$': 'DT', 'WRB': 'WRB',
    '``': '``', "''": "''", '#': '#', '$': '$', ',': ',',
    '-LRB-': '-LRB-', '-NONE-': '-NONE-', '-RRB-': '-RRB-',
    '.': '.', ':': ':',
}


def split_corpus(sents, train_fraction=0.9):
    """Split *sents* into a leading training part and a trailing test part.

    The first ``int(len(sents) * train_fraction)`` items form the training
    set and the remainder the test set.  Only ``len()`` and slicing are
    used on *sents*.

    Returns:
        A ``(train_sents, test_sents)`` tuple.
    """
    size = int(len(sents) * train_fraction)
    return sents[:size], sents[size:]


def simplify_tags(sents, mapping=None):
    """Return a copy of *sents* with every tag rewritten through *mapping*.

    Args:
        sents: iterable of sentences, each an iterable of pairs whose first
            item is the word and whose second item is the tag.
        mapping: dict from original tag to simplified tag; defaults to the
            module-level ``tag_map``.

    Returns:
        A list of lists of ``(word, simplified_tag)`` tuples.

    Raises:
        KeyError: if a tag in *sents* has no entry in *mapping*.
    """
    if mapping is None:
        mapping = tag_map
    return [[(pair[0], mapping[pair[1]]) for pair in sent] for sent in sents]


def score_tagger(tagger, gold_sents):
    """Return the accuracy of *tagger* measured against *gold_sents*.

    The tagger's own ``accuracy`` method is preferred, then its ``evaluate``
    method; which of the two exists presumably depends on the installed
    NLTK release.  The module-level ``nltk.tag.accuracy`` function, which
    this script originally called, is kept as a last resort.
    """
    for method_name in ('accuracy', 'evaluate'):
        method = getattr(tagger, method_name, None)
        if method is not None:
            return method(gold_sents)
    # Imported here so this helper stays usable without NLTK whenever the
    # tagger object can score itself.
    import nltk
    return nltk.tag.accuracy(tagger, gold_sents)


def main():
    """Train the bigram tagger and print its accuracy on the test set."""
    # import the Natural Language Toolkit (deferred so that the helpers
    # above can be imported without NLTK being installed)
    import nltk

    # get a tagged corpus
    tagged_sents = nltk.corpus.treebank.tagged_sents()

    # split it into a training set and a test set
    train_sents, test_sents = split_corpus(tagged_sents)

    # transform the test and training data
    test_sents2 = simplify_tags(test_sents)
    train_sents2 = simplify_tags(train_sents)

    # train a new bigram tagger
    default_tagger = nltk.DefaultTagger('NN')
    unigram_tagger = nltk.UnigramTagger(train_sents2, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(train_sents2, backoff=unigram_tagger)

    # evaluate the accuracy and print the results; a single-argument
    # print(...) behaves the same on Python 2 and Python 3
    print(score_tagger(bigram_tagger, test_sents2))


if __name__ == '__main__':
    main()