#!/usr/bin/env python3
#
# Script to take test suite file formatted per Ling 567 Lab 2
# directions and create an items file for [incr tsdb()] use.
#
# Assume for now that each source cited in the headers is on a single line.
#
# Emily M. Bender 4/6/06
# Modified Shauna Eggers 4/17/06:
# - unicode handling, normalizing phenomena to short forms, page numbers
# - some subroutines and reformatting
# Modified Emily M. Bender 1/30/09:
# - Allow user-specified i-ids.
# Modified Jeff Scott 1/27/10:
# - added linecount for display when processing fails.
# Modified Emily M. Bender 1/9/12:
# - update to new Relations version of latest [incr tsdb()]
# Rewritten in python3 J. Crowgey [ides of march 2013]:
# - new testsuite format requirement: a header line which
#   tells the script how many lines are in an example
#   (script requires uniformity on this across all the examples)
#   and also what their labels are
#   -- line labels have to be drawn from the list of pre-approved names
#      * 'orth' : orthographemic form
#      * 'orth-seg' : orthographemic form with morphemes segmented in 
#                     leipzig style
#      * 'translit' : transliteration
#      * 'translit-seg' : transliteration form with morphemes segmented
#                         in leipzig style
#      * 'gloss' : gloss line in leipzig style
#      * 'translat' : free translation
#   -- note the script now checks for correct alignment between any 'seg' lines
#      and the gloss line
# - accepts the -m/--map option, this option modifies the 
#   default mapping into the tsdb-item fields.  
#
#
#



import sys
import os
import re
import argparse

##############################
# Define some global variables
##############################

## blessed line names: 567 testsuites are expected to have a header which
##     provides labels for the lines in the igt.  These line names are used
##     to create a default mapping into the various tsdb fields.
##     A user of the script can override the default mapping using the
##     -m/--map option.

blessed_line_names = [ 'orth','translit','orth-seg','translit-seg',
                       'gloss','translat' ]

## default mapping from tsdb item field to testsuite line name(s); a list
## value means the named lines are concatenated into that one field
default_line_map = { 'i-input':'orth', 'i-comment': ['gloss','translat']  }

## lines whose tokens are checked for alignment against each other;
## read_headers() appends any '-seg' line named in the testsuite header
check_lines = [ 'gloss' ]

## keys for tsdb fields, in the order they are written to the item file
i_keys = [ 'i-id', 'i-origin', 'i-register', 'i-format', 'i-diff',
           'i-category','i-input', 'i-tokens', 'i-gloss', 'i-translation',
           'i-wf', 'i-length', 'i-comment', 'i-author', 'i-date' ]

# a list of encodings to try, in order, when reading input files
# NOTE(review): latin1 can decode any byte sequence, so the entries listed
# after it are presumably never reached -- confirm the intended order.
encodings = ('utf-8-sig', 'latin1', 'windows-1251', 'macgreek')

# regexes for testsuite headers and fields
# file-level header lines, e.g. "Language: ..." or "Source a: ..."
meta_re = re.compile(r'\s*(Lines|Tiers|Language|Language code|Author|Date|'
                     r'Source [^:]+)\s*:', flags=re.IGNORECASE|re.UNICODE)
# the subset of header lines that cite a source; leading whitespace is
# tolerated exactly as in meta_re so that both patterns classify an indented
# "Source x:" line the same way
sources_re = re.compile(r'\s*Source [^:]+\s*:',
                        flags=re.IGNORECASE|re.UNICODE)
# per-item metadata lines that precede the lines of each igt example
igt_meta_re = re.compile(r'\s*(Source|Judge?ment|Vetted|Phenomen(a|on))\s*:',
                         flags=re.IGNORECASE|re.UNICODE)
# bracketing characters and sub-word boundary markers
bracket_re = re.compile(r'(\(|\)|\[|\])')
wordsep_re = re.compile(r'[=-]')

# Long phenomenon names and their short codes.  Keys are lower case and
# contain NO whitespace, because norm_phenomena() removes all whitespace
# from a name before looking it up here.
phenomena_codes = { 'wordorder':'wo',
                    'pronouns':'pn',
                    'determiners':'det',
                    'case-markingadpositions':'adp',
                    'casemarkingadpositions':'adp',
                    'argumentoptionality':'pro-d',
                    'prod':'pro-d',
                    'agreement':'agr',
                    'case':'c',
                    'coordination':'crd',
                    'matrixyes-noquestions':'q',
                    'matrixyes/noquestions':'q',
                    'matrixyesnoquestions':'q',
                    'yes-noquestions':'q',
                    'yesnoquestions':'q',
                    'yes-no':'q',
                    'yesno':'q',
                    'modals':'m',
                    'negation':'neg',
                    'imperatives':'imp',
                    'embeddeddeclaratives':'emb-d',
                    'embeddedquestions':'emb-q',
                    'adjectives':'adj',
                    'relativeclauses':'rel',
                    'serialverbconstructions':'svc',
                    'cognitivestatus':'cogst',
                    'non-verbalpredicates':'cop',
                    'numeralclassifiers':'numcl',
                    'informationstructure':'info' }

def norm_phenomena(ph_str):
  '''Phenomena:  map possible forms for phenomena (lc, no whitespace)
     to short forms.

     ph_str is a comma-separated list of phenomenon names, optionally
     wrapped in curly braces.  Each name is lower-cased and has its
     whitespace removed; a name found in phenomena_codes is replaced by its
     short code, any other name is kept in its normalized form.  Returns
     the results joined with commas, in the original order.'''
  short_forms = []
  for name in ph_str.strip("{} ").split(','):
    # the dictionary keys carry no whitespace, so neither may the lookup key
    key = ''.join(name.lower().split())
    short_forms.append(phenomena_codes.get(key, key))
  return ",".join(short_forms)

def norm(s):
  '''Return s lower-cased, with leading and trailing whitespace removed.'''
  trimmed = s.strip()
  return trimmed.lower()

def despace(s):
  '''Return s with every run of whitespace collapsed to a single space
     and with no leading or trailing whitespace.'''
  words = s.split()
  return ' '.join(words)

def print_meta(headers):
  '''Write the testsuite header info to stdout.

     Every entry of headers except 'sources' is printed as "key: value";
     the entries of the nested headers['sources'] dictionary follow,
     indented, under a "[sources dictionary]" heading.'''
  for name, value in headers.items():
    if name == 'sources':
      continue
    print(name+": "+value)

  print("[sources dictionary]")
  for label, citation in headers['sources'].items():
    print("  "+label+": "+citation)

def remove_blank_lines(lines):
  '''Drop, in place, the leading entries of lines that are blank
     (whitespace only) or that have '#' as their first character.
     Stops at the first entry that is neither; later entries are kept.'''
  skip = 0
  for text in lines:
    # a non-blank entry has at least one character, so text[0] is safe here
    if text.strip() != '' and text[0] != '#':
      break
    skip += 1
  del lines[:skip]
def read_headers(tslines):
  '''builds up a dictionary of the header info about the testsuite.
     the special key 'sources' is itself a dictionary with the sources
     by the key given (usually a,b,c,etc).

     tslines is the list of lines of the testsuite file; the header lines
     (and any blank or '#' lines around them) are removed from it in place,
     so that on return it starts at the first line after the headers.

     Header names and values are stripped and lower-cased.  If a 'lines'
     header is given, its value is also stored under 'tiers'.  Every '-seg'
     line name found in the tiers list is added to the global check_lines.

     Prints a message to stderr and exits with status 1 if a required
     header (language, language code, author, date, lines/tiers) is missing
     or if a line name is not one of blessed_line_names.'''
  headers = {}
  headers['sources'] = {}

  remove_blank_lines(tslines)

  # get language name etc; stop at the first non-header line or when the
  # input runs out (a file may consist of headers only)
  while tslines and meta_re.match(tslines[0]):
    k,v = tslines[0].split(':',1)
    if sources_re.match(tslines[0]):
      # the source key is whatever follows the word "Source", so that
      # multi-character keys and a space before the colon are handled
      label = norm(k).split(None, 1)
      source_key = label[1] if len(label) > 1 else ''
      headers['sources'][source_key] = norm(v)
    else:
      headers[norm(k)] = norm(v)

    tslines.pop(0)
    remove_blank_lines(tslines)

  err = "Error: testsuite headers should give %s."
  required = [ ('language', 'language name'),
               ('language code', 'iso code'),
               ('author', 'author name'),
               ('date', 'a date') ]
  for key, description in required:
    if not headers.get(key):
      print(err % description,file=sys.stderr)
      sys.exit(1)

  if not (headers.get('lines') or headers.get('tiers')):
    print(err % 'a list of line names',file=sys.stderr)
    sys.exit(1)

  # 'lines' takes precedence; the rest of the script only reads 'tiers'
  if headers.get('lines'):
    headers['tiers'] = headers['lines']
  for tier in headers['tiers'].split():
    if tier not in blessed_line_names:
      print('The list of line names should be drawn from these '\
             +'predetermined ones: '+str(blessed_line_names),file=sys.stderr)
      print('Found: '+tier,file=sys.stderr)
      print(str(headers['tiers']),file=sys.stderr)
      sys.exit(1)
    elif tier.endswith('-seg') and tier not in check_lines:
      # segmented lines must align with the gloss line
      check_lines.append(tier)

  return headers

def parse_items(tslines, headers):
  '''Read the igt items that follow the testsuite headers and turn them
     into tsdb item dictionaries.

     tslines is the list of remaining lines of the testsuite file; it is
     consumed in place.  headers is the dictionary built by read_headers();
     its 'tiers', 'author' and 'date' entries are used.

     Each item consists of metadata lines (source, judgment, vetted,
     phenomena) followed by one line per tier named in headers['tiers'].
     Items are numbered from 1 in file order; an item that has an error
     still uses up its number.

     Returns a list of dictionaries keyed by i_keys, or None if any item
     had incomplete metadata, was cut off by the end of the file, or
     failed the alignment check.  Problems are reported on stderr.'''
  tiers = headers['tiers'].split()
  tsdb_items = []

  i_id = 1
  ok = True

  remove_blank_lines(tslines)
  while tslines:
    err = False
    meta = {}
    lines = {}

    while tslines and igt_meta_re.match(tslines[0]):
      k,v = tslines[0].split(':',1)
      meta[norm(k)] = norm(v)
      tslines.pop(0)

    # each item should have some meta (at least source, judgment
    # are required); igt_meta_re accepts both "Phenomena" and "Phenomenon"
    has_phenomena = 'phenomena' in meta or 'phenomenon' in meta
    if len(meta) < 4 or 'source' not in meta or not has_phenomena:
      print("Error: found less than full header info for igt item.", file=sys.stderr)
      print(str(meta), file=sys.stderr)
      if tslines:
        print(tslines[0], file=sys.stderr)
      err = True
      ok = False

    # the file may end in the middle of an item
    if len(tslines) < len(tiers):
      print("Error: unexpected end of file in igt item "+str(i_id)+".",
            file=sys.stderr)
      ok = False
      break

    # if we get here, we've found the meta, now read the lines;
    # '@' is the field separator of the item file, so it is escaped
    for l in tiers:
      lines[l] = tslines.pop(0).replace(r'@',r'\s')

    ## make sure we got a non-empty line for each of the lines we expected
    # NOTE(review): unlike the other errors this one only skips the item and
    # does not suppress the output file -- confirm that this is intended.
    for l in lines:
      if lines[l].strip() == '':
        err = True
        print(" Error: missing line "+l, file=sys.stderr)
        for k in lines:
          kstr = ' '*(8-len(k))+k
          print(kstr+": "+str(lines[k].strip()), file=sys.stderr)
        print('', file=sys.stderr)

    ## do alignment checking, when necessary; only lines that this
    ## testsuite actually has can be compared
    to_check = [ lines[l] for l in check_lines if l in lines ]
    if len(to_check) > 1 and not check_alignment(to_check):
      err = True
      ok = False
      print("    (alignment error in igt item "+str(i_id)+")",file=sys.stderr)
      for k in lines:
        kstr = ' '*(8-len(k))+k
        print(kstr+": "+str(lines[k].strip()), file=sys.stderr)
      print('', file=sys.stderr)

    # if we get here without an error, we've got a valid example,
    # format the tsdb and add it to the items array
    if not err:
      i_dict = dict.fromkeys(i_keys, '')
      i_dict['i-id'] = str(i_id)
      i_dict['i-origin'] = meta['source']

      phenomena = meta.get('phenomena', meta.get('phenomenon', ''))
      i_dict['i-category'] = norm_phenomena(phenomena)
      i_dict['i-diff'] = '1'
      i_dict['i-length'] = str(len(lines[default_line_map['i-input']].split()))

      # grammatical if either spelling of the judgment header says so
      wf = '0'
      for j in ['judgment', 'judgement']:
        if meta.get(j) in [ 'g', 'y' ]:
          wf = '1'
      i_dict['i-wf'] = wf

      for k in default_line_map:
        if isinstance(default_line_map[k], list):
          # several lines feed one field: concatenated in list order
          for i in default_line_map[k]:
            i_dict[k] += lines[i].strip()
        else:
          i_dict[k] = lines[default_line_map[k]].strip()

      i_dict['i-author'] = headers['author']
      i_dict['i-date'] = headers['date']
      tsdb_items.append(i_dict)

    i_id+=1

    # skip blank lines and comments between items and at the end of file
    remove_blank_lines(tslines)

  if ok:
    return tsdb_items
  else:
    return None

def check_alignment(lines):
  '''Compare every line after the first against the first one.

     Returns True when all lines have the same number of
     whitespace-separated tokens and each pair of corresponding tokens has
     the same number of '=' and of '-' characters.  On the first mismatch
     an explanation is written to stderr and False is returned.'''

  # the first line is the reference that the others are measured against
  reference = lines[0].strip().split()
  expected = len(reference)

  # sub-word markers, in the order they are checked, with the line of the
  # error message that names them
  markers = ( ('=', "      clitic markers '='."),
              ('-', "      morpheme boundary markers '-'.") )

  for other in lines[1:]:
    candidate = other.strip().split()
    if len(candidate) != expected:
      print("Error: all morpheme segmented lines are required",file=sys.stderr)
      print("       to have the same number of tokens.",file=sys.stderr)
      print(str(expected)+" tokens: "+str(reference),file=sys.stderr)
      print(str(len(candidate))+" tokens: "+str(candidate),file=sys.stderr)
      return False

    for cand_tok, ref_tok in zip(candidate, reference):
      for marker, description in markers:
        if cand_tok.count(marker) != ref_tok.count(marker):
          print("Error: aligned tokens should have the same number of ",file=sys.stderr)
          print(description,file=sys.stderr)
          print("items: ["+cand_tok+"] ["+ref_tok+"]", file=sys.stderr)
          return False
  return True


def arg_parser(tsf,itf):
  '''Build and return the argparse parser for the command line.

     The parser takes a required testsuite filename, an optional item
     filename (default ''), a repeatable -m/--map option with two values,
     and a -v/--verbose flag stored as 'verb'.  The tsf and itf parameters
     are not used by this function.'''

  description = 'Make a tsdb item file from a 567 testsuite file.'
  itf_help = ("filename for output, if not given, appends .item "
              "to testsuite filename")
  map_metavar = ('TS-LINE-TAG', 'ITEMFILE-FIELD-NAME')

  cli = argparse.ArgumentParser(description=description)
  cli.add_argument('tsf', type=str, metavar='TESTSUITE',
                   help='a testsuite file')
  cli.add_argument('itf', type=str, metavar='ITEMFILE', nargs='?',
                   default='', help=itf_help)
  cli.add_argument('-m','--map', action='append', nargs=2,
                   metavar=map_metavar)
  cli.add_argument('-v','--verbose', action='store_true', dest='verb',
                   default=False)
  return cli


if __name__ == '__main__':
  # Command line driver: read the testsuite file, parse its headers and
  # items, and write one '@'-separated line per item to the item file.
  # Exits with status 1 on any failure.

  # check command line args
  parser = arg_parser('', '')
  args = parser.parse_args()
  tsf, itf, verb, lmap = args.tsf, args.itf, args.verb, args.map
  if itf == '':
    itf = tsf+'.item'

  # update default_line_map according to any -m/--map options
  if lmap is not None:
    for line_name, field_name in lmap:
      if line_name not in blessed_line_names:
        print("Error: the first item given to the map option should be a "\
              +"blessed line name: "+str(blessed_line_names), file=sys.stderr)
        sys.exit(1)
      elif field_name not in i_keys:
        print("Error: the second item given to the map option should be a "\
              +"tsdb item field name: "+str(i_keys), file=sys.stderr)
        sys.exit(1)
      else:
        default_line_map[field_name] = line_name

  # read the testsuite file, trying each known encoding in turn.
  # Plain 'r' is used because the 'U' mode flag was removed in Python 3.11;
  # text mode already translates all newline conventions.
  tslines = None
  for enc in encodings:
    try:
      with open(tsf, 'r', encoding=enc) as infile:
        tslines = infile.readlines()
      break
    except UnicodeDecodeError:
      continue
    except OSError as e:
      # missing or unreadable file: no other encoding will help
      print("Unable to read testsuite file: "+str(e), file=sys.stderr)
      sys.exit(1)
  if tslines is None:
    print("Unable to find a suitable encoding for this file " + tsf,
          file=sys.stderr)
    sys.exit(1)

  # check for collisions on item file and warn user
  if os.path.exists(itf):
    print("Warning: %s already exists.  Overwrite? [y/n]" % itf)
    reply = input()
    if len(reply) < 1 or reply[0] != 'y':
      sys.exit(1)

  # files look fine, read header info
  # can access headers['sources'] to retrieve the
  # names of sources
  headers = read_headers(tslines)
  # if -v, print what we found
  if verb:
    print_meta(headers)

  items = parse_items(tslines, headers)
  if items is None:
    print("Errors found in testsuite, no output file created.",
          file=sys.stderr)
    sys.exit(1)

  n_grammatical = 0
  try:
    # write utf-8 regardless of locale so that any character that could be
    # read from the testsuite can also be written
    with open(itf, 'w', encoding='utf-8') as out:
      for item in items:
        if item['i-wf'] == '1':
          n_grammatical += 1
        out.write(r'@'.join([ item[k] for k in i_keys ]))
        out.write('\n')
  except IOError as e:
    print(str(e), file=sys.stderr)
    sys.exit(1)

  # only reached when the item file was actually written
  if verb:
    print("created item file with "+str(len(items))+" examples,")
    print("                       "+str(n_grammatical)+" grammatical.")
