SHARE
TWEET

"ing" isolater

a guest Nov 3rd, 2011 163 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # -*- coding: utf-8 -*-
  2. import codecs
  3. from re import compile
  4.  
  5. def iter_sentences(LISO=None):
  6.     SISO = set(LISO) if LISO else None
  7.    
  8.     DSentences = {}
  9.     for line in codecs.open('sentences.csv', 'rb', 'utf-8'):
  10.         id, iso, sentence = line.split('\t')
  11.         id = int(id)
  12.         assert not id in DSentences
  13.        
  14.         if SISO and iso not in SISO:
  15.             continue
  16.        
  17.         DSentences[id] = (iso, sentence.strip())
  18.    
  19.     for line in codecs.open('links.csv', 'rb', 'utf-8'):
  20.         from_id, to_id = line.split('\t')
  21.         from_id = int(from_id)
  22.         to_id = int(to_id)
  23.        
  24.         if not from_id in DSentences:
  25.             continue
  26.         elif not to_id in DSentences:
  27.             continue
  28.        
  29.         yield sorted((DSentences[from_id],
  30.                       DSentences[to_id]))
  31.  
  32. ing_re = compile(r" (is|is not|isn't|are|are not|aren't|we're) ([A-z]*ing\b)")
  33. def get_ing(s):
  34.     match = ing_re.search(s)
  35.     if match:
  36.         r = match.group(2)
  37.         #print r
  38.        
  39.         if 'going to' in s and r=='going':
  40.             return None
  41.         elif 'is getting' in s and r=='getting':
  42.             return None
  43.         elif r not in ('anything',
  44.                        'nothing',
  45.                        'something',
  46.                        'interesting',
  47.                        'refreshing',
  48.                        'lacking',
  49.                        'being',
  50.                        'boring',
  51.                        'willing',
  52.                        'unwilling',
  53.                        'tiring',
  54.                        'surprising',
  55.                        'disappointing',
  56.                        'amazing',
  57.                        'amusing',
  58.                        'annoying'):
  59.             return r
  60.    
  61.     return None
  62.  
  63. LTeiru = u'''
  64. てる
  65. ている
  66. γ¦γ„γŸ
  67. ていて
  68. ていγͺい
  69. でいる
  70. γ§γ„γŸ
  71. でいて
  72. でいγͺい
  73. ています
  74. γ¦γ„γΎγ—γŸ
  75. ています
  76. でいます
  77. γ§γ„γΎγ—γŸ
  78. でいます
  79. てきて
  80. てきた
  81. γ¦γŠγ‚ŠγΎγ™
  82. ていません
  83. てます
  84. '''.strip().replace('\r', '').split('\n')
  85.  
  86. def is_teiru(s):
  87.     allow = any(i for i in LTeiru if i in s)
  88.     block = (u'できている' in s or
  89.              u'γ‚‰γ‚Œγ¦γ„γ‚‹' in s)
  90.     return allow and not block
  91.  
  92. def print_relevant():
  93.     SIng = set()
  94.     DOut = {}
  95.    
  96.     for (_, eng), (_, jpn) in iter_sentences(['jpn', 'eng']):
  97.         def append(key):
  98.             DOut.setdefault(key.lower(), []).append((eng, jpn))
  99.             s = '%s: %s %s' % (key, eng, jpn)
  100.             print s.encode('shift-jis', 'replace')
  101.        
  102.         ing = get_ing(eng)
  103.         has_teiru = is_teiru(jpn)
  104.        
  105.         if ing and not ing in SIng and not has_teiru:
  106.             append(ing)
  107.         elif (u'かけている' in jpn or
  108.               u'ζŽ›γ‘γ¦γ„γ‚‹' in jpn) and ing:
  109.             append(ing)
  110.        
  111.         if ing and 0:
  112.             SIng.add(ing)
  113.    
  114.     SUsed = set()
  115.     with codecs.open('out.txt', 'wb', 'utf-8') as f_out:
  116.         for key, L in sorted(DOut.items()):
  117.             for eng, jpn in L:
  118.                 s = '%s\t%s\t%s\n' % (key, eng, jpn)
  119.                 if s in SUsed:
  120.                     continue
  121.                 SUsed.add(s)
  122.                
  123.                 f_out.write(s)
  124.  
  125. if __name__ == '__main__':
  126.     print_relevant()
  127.  
  128.  
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top