daily pastebin goal
30%
SHARE
TWEET

Create MGIZA alignments

a guest Jan 16th, 2011 1,723 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2.  
  3. # In this script we assume that the target language is always english, and the source languages those in the "for" cycle
  4.  
  5. ./tokenizer.perl -l en < raw_corp.en > corp.tok.en
  6.  
  7. tr '[:upper:]' '[:lower:]' < corp.tok.en > corp.tok.low.en
  8.  
  9. mkcls -n10 -pcorp.tok.low.en -Vcorp.tok.low.en.vcb.classes
  10.  
  11. for l in "it" "es" "de" "fr" "nl"
  12. do
  13.         echo "Pre-processing: tokenizing and lowering..."
  14.  
  15.         ./tokenizer.perl -l ${l} < raw_corp.${l} > corp.tok.${l}
  16.  
  17.         tr '[:upper:]' '[:lower:]' < corp.tok.${l} > corp.tok.low.${l}
  18.  
  19.         echo "Finished pre-processing, starting creation of vocabulary, cooccurrence and classes..."
  20.  
  21.         mkcls -n10 -pcorp.tok.low.${l} -Vcorp.tok.low.${l}.vcb.classes
  22.  
  23.         plain2snt corp.tok.low.${l} corp.tok.low.en
  24.  
  25.         snt2cooc corp.tok.low.${l}_corp.tok.low.en.cooc corp.tok.low.${l}.vcb corp.tok.low.en.vcb corp.tok.low.${l}_corp.tok.low.en.snt
  26.  
  27.         echo "Finished creation! Now we start, really :)"
  28.  
  29.         echo "Starting alignment: ${l} -> en" > ${l}.timelog
  30.         date >> ${l}.timelog
  31.  
  32.         mgiza ${l}_en.dict.gizacfg
  33.  
  34.         echo "Finished alignment, starting merge of parts" >> ${l}.timelog
  35.  
  36.         date >> ${l}.timelog
  37.  
  38.         for i in 0 1 2 3 4 5 6 7
  39.         do
  40.                 cat ${l}_en.dict.A3.final.part${i} >> corpus_word_aligned_${l}_en
  41.         done
  42.  
  43.         rm ${l}_en.dict.A3.final.part*
  44.  
  45.         date >> ${l}.timelog
  46.         echo "End of process." >> ${l}.timelog
  47. done
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top