SHARE
TWEET

Create MGIZA alignments

a guest Jan 16th, 2011 1,478 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2.  
  3. # In this script we assume that the target language is always english, and the source languages those in the "for" cycle
  4.  
  5. ./tokenizer.perl -l en < raw_corp.en > corp.tok.en
  6.  
  7. tr '[:upper:]' '[:lower:]' < corp.tok.en > corp.tok.low.en
  8.  
  9. mkcls -n10 -pcorp.tok.low.en -Vcorp.tok.low.en.vcb.classes
  10.  
  11. for l in "it" "es" "de" "fr" "nl"
  12. do
  13.         echo "Pre-processing: tokenizing and lowering..."
  14.  
  15.         ./tokenizer.perl -l ${l} < raw_corp.${l} > corp.tok.${l}
  16.  
  17.         tr '[:upper:]' '[:lower:]' < corp.tok.${l} > corp.tok.low.${l}
  18.  
  19.         echo "Finished pre-processing, starting creation of vocabulary, cooccurrence and classes..."
  20.  
  21.         mkcls -n10 -pcorp.tok.low.${l} -Vcorp.tok.low.${l}.vcb.classes
  22.  
  23.         plain2snt corp.tok.low.${l} corp.tok.low.en
  24.  
  25.         snt2cooc corp.tok.low.${l}_corp.tok.low.en.cooc corp.tok.low.${l}.vcb corp.tok.low.en.vcb corp.tok.low.${l}_corp.tok.low.en.snt
  26.  
  27.         echo "Finished creation! Now we start, really :)"
  28.  
  29.         echo "Starting alignment: ${l} -> en" > ${l}.timelog
  30.         date >> ${l}.timelog
  31.  
  32.         mgiza ${l}_en.dict.gizacfg
  33.  
  34.         echo "Finished alignment, starting merge of parts" >> ${l}.timelog
  35.  
  36.         date >> ${l}.timelog
  37.  
  38.         for i in 0 1 2 3 4 5 6 7
  39.         do
  40.                 cat ${l}_en.dict.A3.final.part${i} >> corpus_word_aligned_${l}_en
  41.         done
  42.  
  43.         rm ${l}_en.dict.A3.final.part*
  44.  
  45.         date >> ${l}.timelog
  46.         echo "End of process." >> ${l}.timelog
  47. done
RAW Paste Data
Top