Advertisement
Guest User

Untitled

a guest
Jun 23rd, 2016
212
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.90 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. # Copyright 2012 Tetsuo Kiso. All rights reserved.
  5. # Use of this source code is governed by a BSD-style
  6. # license that can be found in the LICENSE file.
  7. #https://github.com/tetsuok/cabocha-to-tikz-deptree/blob/master/LICENSE
  8.  
  9. # generate dependency and predicate-argument structure diagrams
  10. # example cmd: c:\python27\python.exe c:\diagramPAS.py c:\output.txt > c:\newoutput.tex
  11.  
  12. import exceptions
  13. import optparse
  14. import os
  15. import sys
  16. import re
  17.  
  18. class Error(exceptions.StandardError):
  19.   pass
  20.  
  21. class FormatError(Error):
  22.   pass
  23.  
  24. class Segment(object):
  25.  
  26.   def __init__(self):
  27.     self.morphs = []
  28.     self.id = None
  29.     self.head = None
  30.  
  31.   def is_root(self):
  32.     return self.head == '-1'
  33.    
  34.   def add(self, m):
  35.     self.morphs.append(m)
  36.  
  37.   def to_pred(self, case, morphr, predTargs):
  38.     caseID = re.findall(u'({0}\"[0-9]\")'.format(case), morphr)
  39.     if caseID not in predTargs:
  40.         return str(caseID).encode('utf-8')[3:-2]
  41.  
  42.   def to_str(self):
  43.     return ''.join([m.split('\t')[0].encode('utf-8') for m in self.morphs])
  44.    
  45.   def to_pas(self):
  46.     bunsetsu, predTargs, morphMerge = [], [], []
  47.     predMerge, targID = '', ''
  48.     count = 0
  49.     for m in self.morphs:
  50.             bunsetsu.append( m.split('\t', 1))
  51.     for morph in reversed(bunsetsu):
  52.         if 'ID' in morph[-1]:
  53.             targID = re.findall(u'ID\=\"([0-9])\"', morph[-1])
  54.             targID = str(targID).encode('utf-8')[3:-2]
  55.         if 'type="pred"' in morph[-1]:
  56.             if 'pred' not in predTargs:
  57.                 predTargs.append('pred')
  58.                 for i in ['ga=', 'o=', 'ni=']:
  59.                     if i in morph[-1]:
  60.                         predTargs.append(self.to_pred(i, morph[-1], predTargs))
  61.             predMerge = u' '.join(predTargs)
  62.     while count < len(bunsetsu):
  63.         for m in bunsetsu:
  64.               morphMerge.append(m[0])      
  65.               count += 1
  66.     if morphMerge and (targID or predMerge):
  67.         return u''.join(morphMerge), targID, predMerge
  68.     else:
  69.         return u''.join(morphMerge), None, None
  70.  
  71. def sentence_to_deptext(sent):
  72.   return ' \& '.join([seg.to_str() for seg in sent]) + ' \\\\'
  73.  
  74. def getID(case, cval):    
  75.   id = re.findall(u'{0}\=\"([0-9])\"'.format(case), cval)
  76.   return str(id)[3:-2]
  77.    
  78. def convID(pLst, id):
  79.       return str(pLst.index(id) + 1)
  80.    
  81. def sentence_to_pas(sent):
  82.   pLst = []
  83.   pD, aD = {}, {}
  84.   for seg in sent:
  85.      if seg.to_pas() != None:
  86.          temp, argID, targs = seg.to_pas()
  87.          if temp:
  88.              pLst.append(temp)
  89.              if argID:
  90.                  aD[argID] = temp
  91.              if targs:
  92.                  pD[temp] = targs
  93.   joined = ' \& '.join(pLst) + ' \\\\' + '\n'
  94.   return (joined), (pas_edges(pD, aD, pLst))
  95.  
  96. def pas_edges(pD, aD, pLst):
  97.   cD = {'ga': ('blue!50!cyan', 's'), 'o': ('magenta', 'o'), 'ni': ('violet', 'io')}
  98.   edges = []  
  99.   for (i, t) in pD.iteritems():
  100.         for (k, (c, l)) in cD.iteritems():
  101.            for j in t.split(' '):
  102.                if k in j:
  103.                    sid = aD.get(getID(k, j))
  104.                    if sid in pLst and i in pLst:
  105.                        cid = convID(pLst, sid)                          
  106.                        pid = convID(pLst, i)
  107.                        if cid != pid:
  108.                            edges += '\n\depedge[style={%s}, edge below, label style={draw=darkgray, top color=pink, bottom color=lightgray, text=white, circle, shade, below}]{%s}{%s}{%s}' % (c, pid, cid, l)
  109.   ej = ''.join(edges)
  110.   if ej != None:
  111.     return ej
  112.  
  113. def wrap_depedge(h, m):
  114.     return '\depedge[edge style={teal!75!cyan}]{%d}{%d}{}' % (int(h)+1, int(m)+1)
  115.  
  116. def wrap_depedges(sent):
  117.     return '\n'.join([wrap_depedge(seg.id, seg.head) for seg in sent if not seg.is_root()])
  118.    
  119. def read_deptree(f):
  120.   sentences = []
  121.   sent = []
  122.   segment = Segment()
  123.   for l in f:
  124.     if l.startswith('EOS'):
  125.       sent.append(segment)
  126.       sentences.append(sent)
  127.       segment = Segment()
  128.       sent = []
  129.     elif l.startswith('*'):
  130.       if segment.id is not None:
  131.         sent.append(segment)
  132.       segment = Segment()
  133.       lis = l.rstrip().split(' ')
  134.       if len(lis) != 5:
  135.         raise FormatError('Illegal format:' + l)
  136.       segment.id = lis[1]
  137.       if lis[2][-1].isalpha():
  138.         segment.head = lis[2][:-1]
  139.       else:
  140.         raise FormatError('Illegal format:' + l)
  141.     else:
  142.       segment.add(l.rstrip().decode('utf-8'))
  143.   return sentences
  144.  
  145. class LaTeXFormatter(object):
  146.  
  147.     def __init__(self, doc_opt, font, tikz_dep_opt, tikz_deptxt_opt):
  148.         self.doc_opt = doc_opt
  149.         self.font = font
  150.         self.tikz_dep_opt = tikz_dep_opt
  151.         self.tikz_deptxt_opt = tikz_deptxt_opt
  152.  
  153.     def latex_header(self):
  154.         return '''\documentclass[convert=true]{%s}
  155. \usepackage{tikz-dependency}
  156. \usepackage{zxjatype}
  157. \setjamainfont[Scale=.7]{%s}
  158. \\begin{document}''' % (self.doc_opt, self.font)
  159.  
  160.     def latex_footer(self):
  161.         return '''\end{document}'''
  162.  
  163.     def print_tikz_dep(self, sent):
  164.         joined, edges = sentence_to_pas(sent)
  165.         print '''\\begin{dependency}[%s]
  166. \\begin{deptext}[%s, nodes={text=cyan}]
  167. %s
  168. \end{deptext}
  169. %s%s
  170. \end{dependency}''' % (self.tikz_dep_opt, self.tikz_deptxt_opt, sentence_to_deptext(sent).replace('%', '\%'), wrap_depedges(sent), edges)
  171.  
  172. def set_default_font():
  173.   '''Set up default font according to major platforms
  174.  (Windows, Mac OS X, Linux).
  175.  '''
  176.   if os.name == 'nt':
  177.     return 'Meiryo' # ăƒĄă‚€ăƒȘă‚Ș
  178.   elif os.name == 'posix' and os.uname()[0] == 'Darwin':
  179.     return 'Hiragino Kaku Gothic Pro W3'
  180.   elif os.name == 'posix' and os.uname()[0] == 'Linux':
  181.     return 'IPAPGothic'
  182.   else:
  183.     return 'IPAPGothic'
  184.  
  185. def parse_options():
  186.   default_font = set_default_font()
  187.   parser = optparse.OptionParser(usage='%prog [options] data')
  188.  
  189.   parser.add_option('--doc-option', dest='doc_opt', default='standalone',
  190.                     help='the options of documentclass')
  191.   parser.add_option('--font', dest='font', default=default_font,
  192.                     help='Japanese font')
  193.   parser.add_option('--dep-option', dest='dep_opt', default='theme=simple',
  194.                     help='the option of the dependency environment')
  195.   parser.add_option('--deptxt-option', dest='deptxt_opt', default='column sep=.7em',
  196.                     help='the option of the deptext environment')
  197.   (options, unused_args) = parser.parse_args()
  198.   return (options, unused_args)
  199.  
  200. def main():
  201.   opts, unused_args = parse_options()
  202.   tex_formatter = LaTeXFormatter(opts.doc_opt, opts.font, opts.dep_opt, opts.deptxt_opt)
  203.  
  204.   if len(unused_args) == 0:
  205.     sents = read_deptree(sys.stdin)
  206.   else:
  207.     with open(unused_args[0]) as f:
  208.       sents = read_deptree(f)
  209.  
  210.   for sent in sents:
  211.     print tex_formatter.latex_header()
  212.     tex_formatter.print_tikz_dep(sent)
  213.     print tex_formatter.latex_footer()
  214.     print
  215.  
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
  main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement