Advertisement
alvations

europarl_bigbigfile

Jan 8th, 2013
196
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.19 KB | None | 0 0
  1. import codecs
  2. from itertools import izip
  3.  
  4. deu_en = "./bitexts/europarl-v7.de-en.en"
  5. deu = "./bitexts/europarl-v7.de-en.de"
  6.  
  7. spa_en = "./bitexts/europarl-v7.es-en.en"
  8. spa = "./bitexts/europarl-v7.es-en.es"
  9.  
  10.  
  11. fre_en = "./bitexts/europarl-v7.fr-en.en"
  12. fre = "./bitexts/europarl-v7.fr-en.fr"
  13.  
  14. ita_en = "./bitexts/europarl-v7.it-en.en"
  15. ita = "./bitexts/europarl-v7.it-en.it"
  16.  
  17.  
  18. nld_en = "./bitexts/europarl-v7.nl-en.en"
  19. nld = "./bitexts/europarl-v7.nl-en.nl"
  20.  
  21. dan_en = "./bitexts/europarl-v7.da-en.en"
  22. dan = "./bitexts/europarl-v7.da-en.da"
  23.  
  24. fin_en = "./bitexts/europarl-v7.fi-en.en"
  25. fin = "./bitexts/europarl-v7.fi-en.fi"
  26.  
  27.  
  28. def findinall(langs_en, output=None):
  29.   seed = [i.strip() for i in codecs.open(langs_en[0],'r','utf8').readlines()]
  30.   inall = set(seed)
  31.   for l in langs_en[1:]:
  32.     print "inall",len(inall)
  33.     sentence = set([i.strip() for i in codecs.open(l,'r','utf8').readlines()])
  34.     print l, len(sentence)
  35.     inall = inall & sentence
  36.  
  37.   if output != None:
  38.     outfile = codecs.open(output,'w','utf8')
  39.     for s in inall:
  40.           #print s
  41.       print>>outfile, s
  42.  
  43.   return inall
  44.  
  45. #findinall([deu_en,fre_en,ita_en,nld_en,spa_en],'inall5.out')  
  46.  
  47.  
  48. def outAligned(langs_doc, inall):
  49.   seed = [i.strip() for i in codecs.open(inall,'r','utf8').readlines()]
  50.   outfile = codecs.open("aligned5.full.out",'w','utf8')
  51.   line = ""
  52.   for sent in seed:
  53.     line = sent
  54.     #print line
  55.     if line.strip() == "":
  56.       continue
  57.     for i in langs_doc:
  58.       #print i[2]
  59.       for s, t in izip(i[0], i[1]):
  60.     #print langiso[trg[-2:]], s, t
  61.         if s == sent:
  62.           line+="\t"; line+=i[2]+":"; line+=t
  63.           #print line
  64.           break
  65.     print line
  66.     print>>outfile, line
  67.  
  68.  
  69. def txt2doc(pairs):
  70.   docs = []
  71.   for x,y in pairs.items():
  72.     src = [i.strip() for i in codecs.open(x,'r','utf8').readlines()]
  73.     trg = [i.strip() for i in codecs.open(y,'r','utf8').readlines()]
  74.     lang = langiso[y[-2:]]
  75.     docs.append((src,trg,lang))
  76.   return docs
  77.  
  78. langiso = {'de':"deu","fr":"fre","it":"ita","nl":"nld","es":"spa"}
  79. txtpairs = {deu_en:deu,fre_en:fre,ita_en:ita,nld_en:nld,spa_en:spa}
  80. docpairs = txt2doc(txtpairs)
  81.  
  82. outAligned(docpairs, "inall5.sorted")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement