Linguistics 473
Computational Linguistics Fundamentals
Summer 2017

Project 1


Project Description

Count constituents in a Penn Treebank corpus

import os
import re
import sys

# Regex patterns for an opening S / NP / VP bracket followed by a space.
# Raw strings keep the backslash literal without relying on Python passing
# the unrecognised escape '\(' through unchanged (which newer interpreters
# warn about).  The string values are identical to the non-raw spelling.
s_pattern = r'\(S '
np_pattern = r'\(NP '
vp_pattern = r'\(VP '
regex_patterns = [s_pattern, np_pattern, vp_pattern]
# One counter slot per entry of regex_patterns, in the same order.
regex_counts = [0, 0, 0]

class Counts:
  """Class-level tallies that parse_level updates while walking the trees.

  The attributes live on the class itself, so they are read and written as
  ``Counts.s_count`` etc. without creating an instance.
  """
  # Nodes labelled exactly 'S', 'NP' and 'VP' respectively.
  s_count = np_count = vp_count = 0
  # VPs whose bracketed daughters are exactly NP NP (ditransitive) and VPs
  # with no bracketed daughters at all (intransitive).
  dtv_count = itv_count = 0

#assumption: index is at '('
def parse_level(index, search_str):
  """Walk the bracketed node that opens at ``index`` and tally it.

  The node label is the text between the opening parenthesis and the first
  space or parenthesis after it (or the end of the text).  A label that is
  exactly 'S', 'NP' or 'VP' increments the matching counter on ``Counts``;
  any other label (including tagged ones such as 'NP-SBJ') is not counted.
  Every nested '(...)' group is parsed recursively and its label recorded as
  a daughter; bare words are not daughters.  A VP with no bracketed
  daughters is counted as intransitive, and a VP whose bracketed daughters
  are exactly NP, NP is counted as ditransitive.

  Args:
    index: offset of the '(' that opens the node (the caller must ensure
      this).
    search_str: the full bracketed text being scanned.

  Returns:
    A ``(label, next_index)`` tuple.  ``next_index`` is the offset just past
    the node's closing ')', or -1 when no closing parenthesis remains.
  """
  # Scan the label explicitly instead of slicing up to find(' '): a missing
  # space would turn the -1 result into a slice bound, and a label followed
  # directly by a parenthesis would swallow the following text.
  label_end = index + 1
  text_len = len(search_str)
  while label_end < text_len and search_str[label_end] not in ' ()':
    label_end += 1
  this_node = search_str[index+1:label_end]
  if this_node == 'S':
    Counts.s_count += 1
  elif this_node == 'NP':
    Counts.np_count += 1
  elif this_node == 'VP':
    Counts.vp_count += 1
  next_lpar = search_str.find('(', index+1)
  next_rpar = search_str.find(')', index+1)
  daughters = []
  # A '(' that appears before the next ')' opens a daughter of this node.
  while next_lpar != -1 and next_lpar < next_rpar:
    daughter_label, resume_at = parse_level(next_lpar, search_str)
    daughters.append(daughter_label)
    if resume_at == -1:
      # The daughter never closed, so this node cannot close either.  Stop
      # here: feeding -1 to str.find would restart the search at the last
      # character and could reuse a ')' that belongs to a descendant.
      next_rpar = -1
      break
    next_lpar = search_str.find('(', resume_at)
    next_rpar = search_str.find(')', resume_at)
  if this_node == 'VP':
    if not daughters:
      Counts.itv_count += 1
    elif daughters == ['NP', 'NP']:
      Counts.dtv_count += 1
  if next_rpar == -1:
    return (this_node, -1)
  return (this_node, next_rpar+1)


#read every file in the corpus directory and walk each bracketed tree in it
corpus_dir = '/corpora/LDC/LDC99T42/RAW/parsed/prd/wsj/14/'
for fi in os.listdir(corpus_dir):
  # Read the whole file up front and close the handle right away; newlines
  # become spaces so the text can be scanned as one flat string.
  with open(os.path.join(corpus_dir, fi), 'r') as corpus_file:
    f = corpus_file.read().replace('\n', ' ').strip()
  index = f.find('(')
  while index != -1:
    ret_val = parse_level(index, f)
    # ret_val[1] is the offset just past the tree's ')' (or -1 when the tree
    # never closes).  parse_level must be entered on a '(', so jump to the
    # next one instead of re-entering on whatever character follows the ')'.
    if ret_val[1] == -1:
      break
    index = f.find('(', ret_val[1])

#print to stdout
# Each call passes one parenthesised string, so the output is the same under
# Python 2's print statement and Python 3's print function.
print('sentences ' + str(Counts.s_count))
print('nps ' + str(Counts.np_count))
print('vps ' + str(Counts.vp_count))
print('ditransitives ' + str(Counts.dtv_count))
print('intransitives ' + str(Counts.itv_count))

Results for LDC99T42:
(S 4670
(NP 13221
(VP 7920
(VP (NP) (NP)) 34
(VP x) 123