# -*- coding: utf-8 -*-
import codecs
from re import compile
def iter_sentences(LISO=None):
    """Yield pairs of linked sentences read from the CSV files in the cwd.

    'sentences.csv' is read as tab-separated ``id, iso, sentence`` rows and
    'links.csv' as tab-separated ``from_id, to_id`` rows, both decoded as
    UTF-8.

    LISO -- optional iterable of language codes.  When given (and
            non-empty), only sentences whose ``iso`` field is in it are
            kept; otherwise every sentence is kept.

    Each yielded item is a two-element list of ``(iso, sentence)`` tuples,
    sorted, so the entry with the smaller language code comes first.
    Links that refer to a sentence that was not kept are skipped.

    Raises AssertionError if a sentence id occurs twice among the kept
    sentences, and ValueError if a row does not have the expected number
    of tab-separated fields or an id is not an integer.

    This is a generator: nothing is read until iteration starts, and the
    links file is closed when the generator is exhausted or closed.
    """
    SISO = set(LISO) if LISO else None

    DSentences = {}
    # `with` guarantees the handle is released even if a row is malformed.
    with codecs.open('sentences.csv', 'rb', 'utf-8') as f_sentences:
        for line in f_sentences:
            sentence_id, iso, sentence = line.split('\t')
            sentence_id = int(sentence_id)
            # Only kept sentences are stored, so this detects duplicates
            # among the kept sentences only.
            assert sentence_id not in DSentences

            if SISO and iso not in SISO:
                continue
            DSentences[sentence_id] = (iso, sentence.strip())

    with codecs.open('links.csv', 'rb', 'utf-8') as f_links:
        for line in f_links:
            from_id, to_id = line.split('\t')
            # int() tolerates the trailing newline left on to_id.
            from_id = int(from_id)
            to_id = int(to_id)

            if from_id not in DSentences or to_id not in DSentences:
                continue
            yield sorted((DSentences[from_id],
                          DSentences[to_id]))
# Matches " <form of to be> <word ending in ing>"; group 2 is that word.
# The letter class is spelled [A-Za-z] on purpose: the range A-z would also
# cover the ASCII punctuation between "Z" and "a" ([ \ ] ^ _ `), so tokens
# such as "run_ing" would be accepted as words.
ing_re = compile(r" (is|is not|isn't|are|are not|aren't|we're) ([A-Za-z]*ing\b)")

# Words ending in "ing" that get_ing never reports.
_ING_EXCLUDED = frozenset((
    'anything',
    'nothing',
    'something',
    'interesting',
    'refreshing',
    'lacking',
    'being',
    'boring',
    'willing',
    'unwilling',
    'tiring',
    'surprising',
    'disappointing',
    'amazing',
    'amusing',
    'annoying',
))


def get_ing(s):
    """Return the first "-ing" word that follows a form of "to be" in s.

    Only the first match of ``ing_re`` in ``s`` is considered.  None is
    returned when there is no match, when the word is in the exclusion
    list, when the word is "going" and ``s`` contains "going to", or when
    the word is "getting" and ``s`` contains "is getting".
    """
    match = ing_re.search(s)
    if not match:
        return None

    r = match.group(2)
    # These two checks look at the whole sentence, not just the match.
    if r == 'going' and 'going to' in s:
        return None
    if r == 'getting' and 'is getting' in s:
        return None
    if r in _ING_EXCLUDED:
        return None
    return r
# Japanese substrings that is_teiru() searches for.  The list is kept
# entry-for-entry (including the repeated items) so its value is unchanged.
LTeiru = [
    u'てる',
    u'ている',
    u'ていた',
    u'ていて',
    u'ていない',
    u'でいる',
    u'でいた',
    u'でいて',
    u'でいない',
    u'ています',
    u'ていました',
    u'ています',
    u'でいます',
    u'でいました',
    u'でいます',
    u'てきて',
    u'てきた',
    u'ております',
    u'ていません',
    u'てます',
]


def is_teiru(s):
    """Return True if s contains any LTeiru entry and no blocked phrase.

    The blocked phrases are u'できている' and u'られている'; a string that
    contains either of them is always rejected.
    """
    if u'できている' in s or u'られている' in s:
        return False
    return any(ending in s for ending in LTeiru)
def print_relevant():
    """Print and save sentence pairs whose English side has an "-ing" word.

    Pairs come from iter_sentences(['jpn', 'eng']).  A pair is selected
    when get_ing() finds a word in the first sentence of the pair and
    either is_teiru() is false for the second sentence, or the second
    sentence contains u'かけている' or u'掛けている'.

    Every selected pair is printed as "word: eng jpn" encoded as
    shift-jis (unencodable characters replaced), then all selected pairs
    are written to 'out.txt' as UTF-8 "word<TAB>eng<TAB>jpn" rows, sorted
    by lower-cased word, with identical rows written only once.
    """
    # Switch for reporting each "-ing" word only once.  It is off, so
    # seen_words stays empty and the membership test below always passes.
    track_seen_words = False
    seen_words = set()

    pairs_by_word = {}
    for (_, eng), (_, jpn) in iter_sentences(['jpn', 'eng']):
        ing = get_ing(eng)
        has_teiru = is_teiru(jpn)
        if not ing:
            continue

        unseen_without_teiru = ing not in seen_words and not has_teiru
        has_kakeru = u'かけている' in jpn or u'掛けている' in jpn
        if unseen_without_teiru or has_kakeru:
            pairs_by_word.setdefault(ing.lower(), []).append((eng, jpn))
            shown = '%s: %s %s' % (ing, eng, jpn)
            # Parenthesized single-argument form: prints exactly what the
            # Python 2 print statement would.
            print(shown.encode('shift-jis', 'replace'))

        if track_seen_words:
            seen_words.add(ing)

    written_rows = set()
    with codecs.open('out.txt', 'wb', 'utf-8') as f_out:
        # Keys are unique, so ordering by key matches ordering the items.
        for key in sorted(pairs_by_word):
            for eng, jpn in pairs_by_word[key]:
                row = '%s\t%s\t%s\n' % (key, eng, jpn)
                if row not in written_rows:
                    written_rows.add(row)
                    f_out.write(row)


if __name__ == '__main__':
    print_relevant()