Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs
- from itertools import izip
# Locations of the europarl-v7 bitext files, one pair per language.
# For each pair the `*_en` name is the English-side file and the bare
# three-letter name is the other-language side of the same bitext.
deu_en, deu = (
    "./bitexts/europarl-v7.de-en.en",
    "./bitexts/europarl-v7.de-en.de",
)
spa_en, spa = (
    "./bitexts/europarl-v7.es-en.en",
    "./bitexts/europarl-v7.es-en.es",
)
fre_en, fre = (
    "./bitexts/europarl-v7.fr-en.en",
    "./bitexts/europarl-v7.fr-en.fr",
)
ita_en, ita = (
    "./bitexts/europarl-v7.it-en.en",
    "./bitexts/europarl-v7.it-en.it",
)
nld_en, nld = (
    "./bitexts/europarl-v7.nl-en.en",
    "./bitexts/europarl-v7.nl-en.nl",
)
dan_en, dan = (
    "./bitexts/europarl-v7.da-en.en",
    "./bitexts/europarl-v7.da-en.da",
)
fin_en, fin = (
    "./bitexts/europarl-v7.fi-en.en",
    "./bitexts/europarl-v7.fi-en.fi",
)
def findinall(langs_en, output=None):
    """Return the set of lines that occur in every file of `langs_en`.

    Each file is read as UTF-8 and every line is stripped of surrounding
    whitespace before comparison, so duplicates inside one file collapse
    and lines differing only in leading/trailing whitespace are equal.

    Args:
        langs_en: Sequence of file paths. Must contain at least one path.
        output: Optional path. When not None, the common lines are written
            there as UTF-8, one per line, in the set's iteration order
            (i.e. no particular order).

    Returns:
        A set with the stripped lines shared by all files.

    Raises:
        IndexError: if `langs_en` is empty.
    """
    def read_sentences(path):
        # `with` guarantees the handle is released even on a decode error.
        with codecs.open(path, 'r', 'utf8') as infile:
            return set(raw.strip() for raw in infile.readlines())

    inall = read_sentences(langs_en[0])
    for path in langs_en[1:]:
        # Progress output; a single pre-formatted argument prints the same
        # text under both Python 2 and Python 3.
        print("inall %d" % len(inall))
        sentences = read_sentences(path)
        print("%s %d" % (path, len(sentences)))
        inall &= sentences

    if output is not None:
        # Closing the file explicitly ensures everything is flushed to disk.
        with codecs.open(output, 'w', 'utf8') as outfile:
            for sent in inall:
                outfile.write(sent + "\n")
    return inall
- #findinall([deu_en,fre_en,ita_en,nld_en,spa_en],'inall5.out')
def outAligned(langs_doc, inall, output="aligned5.full.out"):
    """Write each seed sentence followed by its translation in every language.

    For every non-empty (stripped) line of the `inall` file, one output line
    is produced: the sentence itself, then for each document in `langs_doc`
    that contains the sentence on its source side, a tab followed by
    ``<lang>:<translation>``. When a sentence occurs several times in a
    document, the translation of its first occurrence is used. Documents
    that do not contain the sentence contribute nothing to the line.
    Every output line is also printed to standard output.

    Args:
        langs_doc: Iterable of ``(source_sentences, target_sentences, lang)``
            triples; source and target are paired by position and pairing
            stops at the shorter of the two.
        inall: Path of a UTF-8 file with one seed sentence per line.
        output: Path of the UTF-8 file to write; defaults to the file name
            that was previously hard-coded.
    """
    with codecs.open(inall, 'r', 'utf8') as infile:
        seed = [raw.strip() for raw in infile.readlines()]
    wanted = set(seed)

    # Index each document once instead of rescanning it for every seed
    # sentence. Only sentences that are actually requested are indexed, and
    # setdefault keeps the first occurrence to match a front-to-back scan.
    indexed = []
    for doc in langs_doc:
        first_translation = {}
        for src_sent, trg_sent in zip(doc[0], doc[1]):
            if src_sent in wanted:
                first_translation.setdefault(src_sent, trg_sent)
        indexed.append((doc[2], first_translation))

    with codecs.open(output, 'w', 'utf8') as outfile:
        for sent in seed:
            if sent == "":
                continue
            fields = [sent]
            for lang, first_translation in indexed:
                if sent in first_translation:
                    fields.append(lang + ":" + first_translation[sent])
            line = "\t".join(fields)
            print(line)
            outfile.write(line + "\n")
def txt2doc(pairs, iso_map=None):
    """Load source/target text files into in-memory aligned documents.

    Args:
        pairs: Dict mapping a source-side file path to the matching
            target-side file path. The last two characters of the target
            path are used as the key into the language-code mapping.
        iso_map: Optional dict from that two-character suffix to the
            language tag stored with the document. Defaults to the
            module-level `langiso` mapping.

    Returns:
        A list of ``(source_lines, target_lines, lang)`` tuples, one per
        entry of `pairs`, where both line lists hold the UTF-8 decoded,
        whitespace-stripped lines of the respective file.

    Raises:
        KeyError: if a target path's two-character suffix is not in the
            mapping.
    """
    if iso_map is None:
        iso_map = langiso
    docs = []
    for src_path, trg_path in pairs.items():
        # Resolve the language first so an unknown suffix fails before the
        # (potentially large) files are read.
        lang = iso_map[trg_path[-2:]]
        with codecs.open(src_path, 'r', 'utf8') as src_file:
            src = [raw.strip() for raw in src_file.readlines()]
        with codecs.open(trg_path, 'r', 'utf8') as trg_file:
            trg = [raw.strip() for raw in trg_file.readlines()]
        docs.append((src, trg, lang))
    return docs
# Two-letter suffix of a target-side file path -> three-letter language tag
# that txt2doc attaches to each loaded document.
langiso = {
    "de": "deu",
    "fr": "fre",
    "it": "ita",
    "nl": "nld",
    "es": "spa",
}

# English-side file -> other-language file for the five languages processed.
txtpairs = dict([
    (deu_en, deu),
    (fre_en, fre),
    (ita_en, ita),
    (nld_en, nld),
    (spa_en, spa),
])

# Runs at import time: load every bitext, then write the aligned output for
# the sentences listed in "inall5.sorted".
docpairs = txt2doc(txtpairs)
outAligned(docpairs, "inall5.sorted")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement