Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import codecs
- from re import compile
def iter_sentences(LISO=None):
    """Yield pairs of linked sentences read from two tab-separated files.

    ``sentences.csv`` must hold one ``id<TAB>iso<TAB>sentence`` record per
    line and ``links.csv`` one ``from_id<TAB>to_id`` record per line; both
    are opened relative to the current working directory and decoded as
    UTF-8.

    LISO -- optional iterable of language codes. When given and non-empty,
            only sentences whose ``iso`` field is in it are kept; otherwise
            every sentence is kept.

    Each yielded item is a two-element list of ``(iso, sentence)`` tuples,
    sorted, so the entry with the smaller language code comes first.  Links
    for which either end was not kept are skipped.  A link listed in both
    directions is yielded once per direction.

    Raises ValueError if a line does not have the expected number of fields
    or an id is not an integer.
    """
    SISO = set(LISO) if LISO else None
    DSentences = {}

    # Context managers guarantee the handles are closed, including when the
    # consumer abandons the generator before it is exhausted.
    with codecs.open('sentences.csv', 'rb', 'utf-8') as f_sentences:
        for line in f_sentences:
            sentence_id, iso, sentence = line.split('\t')
            sentence_id = int(sentence_id)
            # The duplicate check only sees ids that passed the language
            # filter, because only those are stored.
            assert sentence_id not in DSentences, \
                'duplicate sentence id %d' % sentence_id
            if SISO and iso not in SISO:
                continue
            DSentences[sentence_id] = (iso, sentence.strip())

    with codecs.open('links.csv', 'rb', 'utf-8') as f_links:
        for line in f_links:
            from_id, to_id = line.split('\t')
            # int() tolerates the trailing newline left on to_id.
            from_id = int(from_id)
            to_id = int(to_id)
            if from_id not in DSentences or to_id not in DSentences:
                continue
            yield sorted((DSentences[from_id],
                          DSentences[to_id]))
# Matches " <form of 'to be'> <word ending in 'ing'>"; group 2 is the -ing
# word.  The class must be [A-Za-z], not [A-z]: the A-z range also covers the
# ASCII punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
ing_re = compile(r" (is|is not|isn't|are|are not|aren't|we're) ([A-Za-z]*ing\b)")

# Words that end in "ing" but are never reported by get_ing().
_IGNORED_ING_WORDS = frozenset((
    'anything',
    'nothing',
    'something',
    'interesting',
    'refreshing',
    'lacking',
    'being',
    'boring',
    'willing',
    'unwilling',
    'tiring',
    'surprising',
    'disappointing',
    'amazing',
    'amusing',
    'annoying',
))


def get_ing(s):
    """Return the first "<be> <...ing>" word found in ``s``, or None.

    Only the first match of ``ing_re`` is considered.  None is returned when
    there is no match, when the word is in ``_IGNORED_ING_WORDS``, when the
    word is "going" and ``s`` contains "going to" anywhere, or when the word
    is "getting" and ``s`` contains "is getting" anywhere.

    Matching is case-sensitive and requires a space before the verb form, so
    a sentence that starts with the verb form is not matched.
    """
    match = ing_re.search(s)
    if not match:
        return None

    word = match.group(2)
    if word == 'going' and 'going to' in s:
        return None
    if word == 'getting' and 'is getting' in s:
        return None
    if word in _IGNORED_ING_WORDS:
        return None
    return word
# Substrings that is_teiru() looks for, one per line of the literal below.
# Carriage returns are dropped so the list is the same whether this file is
# saved with LF or CRLF line endings.
LTeiru = u'''
てる
ている
ていた
ていて
ていない
でいる
でいた
でいて
でいない
ています
ていました
ています
でいます
でいました
でいます
てきて
てきた
ております
ていません
てます
'''.strip().replace('\r', '').split('\n')


def is_teiru(s):
    """Tell whether ``s`` contains one of the ``LTeiru`` substrings.

    The answer is always False when ``s`` contains either of the two
    blocked forms checked first, even if an ``LTeiru`` entry is also
    present.
    """
    # Blocked forms veto the result outright.
    if u'できている' in s or u'られている' in s:
        return False

    for ending in LTeiru:
        if ending in s:
            return True
    return False
def print_relevant():
    """Report English "-ing" sentences whose Japanese side lacks a te-iru form.

    For every linked (eng, jpn) sentence pair from ``iter_sentences``, the
    pair is recorded when ``get_ing`` finds an -ing word in the English
    sentence and either ``is_teiru`` is false for the Japanese sentence or
    the Japanese sentence contains one of the two "kakete iru" spellings
    checked below.

    Each recorded pair is printed to stdout (Shift-JIS encoded, unencodable
    characters replaced) as it is found.  Afterwards all pairs are written
    to ``out.txt`` as UTF-8 ``key<TAB>eng<TAB>jpn`` lines, sorted by the
    lower-cased -ing word, with exact duplicate lines written only once.
    """
    DOut = {}

    for (iso_a, eng), (iso_b, jpn) in iter_sentences(['jpn', 'eng']):
        # Pairs arrive sorted by language code, so a mixed pair is always
        # (eng, jpn).  Same-language links would put the wrong language in
        # the jpn slot, so skip them.
        if iso_a != 'eng' or iso_b != 'jpn':
            continue

        ing = get_ing(eng)
        if not ing:
            continue

        has_kakeru = (u'かけている' in jpn or
                      u'掛けている' in jpn)
        if is_teiru(jpn) and not has_kakeru:
            continue

        DOut.setdefault(ing.lower(), []).append((eng, jpn))
        s = '%s: %s %s' % (ing, eng, jpn)
        # Single-argument parenthesised form: same output as the Python 2
        # print statement.
        print(s.encode('shift-jis', 'replace'))

    SUsed = set()
    with codecs.open('out.txt', 'wb', 'utf-8') as f_out:
        for key, L in sorted(DOut.items()):
            for eng, jpn in L:
                s = '%s\t%s\t%s\n' % (key, eng, jpn)
                # The same link can be seen more than once; emit each
                # distinct line a single time.
                if s in SUsed:
                    continue
                SUsed.add(s)
                f_out.write(s)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    print_relevant()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement