Advertisement
Guest User

Create MGIZA alignments

a guest
Jan 16th, 2011
2,699
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.32 KB | None | 0 0
  1. #!/bin/bash
  2.  
  3. # In this script we assume that the target language is always english, and the source languages those in the "for" cycle
  4.  
  5. ./tokenizer.perl -l en < raw_corp.en > corp.tok.en
  6.  
  7. tr '[:upper:]' '[:lower:]' < corp.tok.en > corp.tok.low.en
  8.  
  9. mkcls -n10 -pcorp.tok.low.en -Vcorp.tok.low.en.vcb.classes
  10.  
  11. for l in "it" "es" "de" "fr" "nl"
  12. do
  13.     echo "Pre-processing: tokenizing and lowering..."
  14.  
  15.     ./tokenizer.perl -l ${l} < raw_corp.${l} > corp.tok.${l}
  16.  
  17.     tr '[:upper:]' '[:lower:]' < corp.tok.${l} > corp.tok.low.${l}
  18.  
  19.     echo "Finished pre-processing, starting creation of vocabulary, cooccurrence and classes..."
  20.  
  21.     mkcls -n10 -pcorp.tok.low.${l} -Vcorp.tok.low.${l}.vcb.classes
  22.  
  23.     plain2snt corp.tok.low.${l} corp.tok.low.en
  24.  
  25.     snt2cooc corp.tok.low.${l}_corp.tok.low.en.cooc corp.tok.low.${l}.vcb corp.tok.low.en.vcb corp.tok.low.${l}_corp.tok.low.en.snt
  26.  
  27.     echo "Finished creation! Now we start, really :)"
  28.  
  29.     echo "Starting alignment: ${l} -> en" > ${l}.timelog
  30.     date >> ${l}.timelog
  31.  
  32.     mgiza ${l}_en.dict.gizacfg
  33.  
  34.     echo "Finished alignment, starting merge of parts" >> ${l}.timelog
  35.  
  36.     date >> ${l}.timelog
  37.  
  38.     for i in 0 1 2 3 4 5 6 7
  39.     do
  40.         cat ${l}_en.dict.A3.final.part${i} >> corpus_word_aligned_${l}_en
  41.     done
  42.  
  43.     rm ${l}_en.dict.A3.final.part*
  44.  
  45.     date >> ${l}.timelog
  46.     echo "End of process." >> ${l}.timelog
  47. done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement