# Ling/CSE 472, Spring 2009
# Assignment 3
# 23tags.py
"""Train and evaluate a bigram tagger on a reduced, 23-tag version of the
Penn Treebank tagset.

The 46 treebank tags listed in ``tag_map`` are collapsed pairwise into 23
artificial tags (``TAG1`` .. ``TAG23``).  A bigram tagger, backing off to a
unigram tagger and then to a constant default tag, is trained on the first
90% of the NLTK treebank sample and scored on the remaining 10%.
"""

# fraction of the corpus used for training; the rest is the test set
TRAIN_FRACTION = 0.9

# tag assigned when neither the bigram nor the unigram tagger has an answer
# ('TAG6' is the simplified tag that 'MD' and 'NN' map to)
DEFAULT_TAG = 'TAG6'

# define a mapping from tags to a simpler set of tags
tag_map = {
    'CC': 'TAG1', 'CD': 'TAG1',
    'DT': 'TAG2', 'EX': 'TAG2',
    'FW': 'TAG3', 'IN': 'TAG3',
    'JJ': 'TAG4', 'JJR': 'TAG4',
    'JJS': 'TAG5', 'LS': 'TAG5',
    'MD': 'TAG6', 'NN': 'TAG6',
    'NNP': 'TAG7', 'NNPS': 'TAG7',
    'NNS': 'TAG8', 'PDT': 'TAG8',
    'POS': 'TAG9', 'PRP': 'TAG9',
    'PRP$': 'TAG10', 'RB': 'TAG10',
    'RBR': 'TAG11', 'RBS': 'TAG11',
    'RP': 'TAG12', 'SYM': 'TAG12',
    'TO': 'TAG13', 'UH': 'TAG13',
    'VB': 'TAG14', 'VBD': 'TAG14',
    'VBG': 'TAG15', 'VBN': 'TAG15',
    'VBP': 'TAG16', 'VBZ': 'TAG16',
    'WDT': 'TAG17', 'WP': 'TAG17',
    'WP$': 'TAG18', 'WRB': 'TAG18',
    '``': 'TAG19', "''": 'TAG19',
    '#': 'TAG20', '$': 'TAG20',
    ',': 'TAG21', '-LRB-': 'TAG21',
    '-NONE-': 'TAG22', '-RRB-': 'TAG22',
    '.': 'TAG23', ':': 'TAG23',
}


def simplify_tags(sents, mapping=None):
    """Return a copy of *sents* with every tag replaced via *mapping*.

    sents   -- iterable of sentences, each an iterable of (word, tag) pairs
    mapping -- dict from original tag to simplified tag; defaults to the
               module-level ``tag_map``

    Returns a list of lists of (word, simplified_tag) tuples.
    Raises KeyError if a tag is missing from *mapping*, so that an
    incomplete mapping is noticed instead of silently skewing the score.
    """
    if mapping is None:
        mapping = tag_map
    return [[(word, mapping[tag]) for word, tag in sent] for sent in sents]


def tagger_accuracy(tagger, gold_sents):
    """Return the accuracy of *tagger* on the tagged sentences *gold_sents*.

    The scoring entry point has moved between NLTK releases, so the
    available one is picked at run time: ``tagger.accuracy(gold)`` first,
    then ``tagger.evaluate(gold)``, and finally the legacy module-level
    ``nltk.tag.accuracy(tagger, gold)`` this script was written against.
    """
    for method_name in ('accuracy', 'evaluate'):
        method = getattr(tagger, method_name, None)
        if method is not None:
            return method(gold_sents)
    import nltk
    return nltk.tag.accuracy(tagger, gold_sents)


def main():
    """Train the 23-tag bigram tagger and print its test-set accuracy."""
    # import the Natural Language Toolkit (done here so that the mapping
    # helpers above remain usable without NLTK and its corpora installed)
    import nltk

    # get a tagged corpus
    tagged_sents = nltk.corpus.treebank.tagged_sents()

    # split it into a training set and a test set
    size = int(len(tagged_sents) * TRAIN_FRACTION)
    train_sents = tagged_sents[:size]
    test_sents = tagged_sents[size:]

    # transform the test and training data
    test_sents2 = simplify_tags(test_sents)
    train_sents2 = simplify_tags(train_sents)

    # train a new bigram tagger
    default_tagger = nltk.DefaultTagger(DEFAULT_TAG)
    unigram_tagger = nltk.UnigramTagger(train_sents2, backoff=default_tagger)
    bigram_tagger = nltk.BigramTagger(train_sents2, backoff=unigram_tagger)

    # evaluate the accuracy and print the results
    print(tagger_accuracy(bigram_tagger, test_sents2))


if __name__ == '__main__':
    main()