Advertisement
Guest User

Untitled

a guest
Sep 10th, 2012
52
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 6.80 KB | None | 0 0
  # --- Installation-specific settings -------------------------------------------
  # Bilingual dictionary binary; lt-proc opens it as $DIR/$BILTRANS.
  # NOTE(review): this names the en-es pair although the language pair itself is
  # taken from $3 -- confirm whether it should follow $L1-$L2.
  BILTRANS=en-es.autobil.bin
  # Language model file handed to train-model.perl through -lm.
  LM=/home/fran/corpora/europarl/europarl.lm
  # Directory that intermediate files are moved into as the pipeline advances.
  LOCALTEMP=temp/
  # Directory holding the apertium-lex-tools helper scripts.
  SCRIPTS=/home/fran/source/apertium-lex-tools/scripts/
  # Root directory of the Moses support scripts.
  MOSESSCRIPTS=/home/fran/local/bin/scripts-20120109-1229/

  # --- Positional arguments -----------------------------------------------------
  CORPUS=$1   # corpus file prefix; the per-language files are $CORPUS.<lang>
  DIR=$2      # directory passed to `apertium -d` and searched for $BILTRANS
  # $3 is the language pair written as "xx-yy". Split it on '-' with parameter
  # expansion: L1 is the first field, L2 the second. A value without '-' yields
  # the whole string for both, exactly like `cut -d- -fN` does.
  L1=${3%%-*}
  L2=${3#*-}
  L2=${L2%%-*}
  11.  
  # Example invocation:
  #   sh setup-corpus.sh oab /home/fran/source/apertium-br-fr/ br-fr

  mkdir -p -- "$LOCALTEMP"

  # First cleaning pass with the Moses corpus cleaner; writes $CORPUS.clean.$L1
  # and $CORPUS.clean.$L2. (The trailing "1 40" is presumably the accepted
  # sentence-length range -- check against the Moses clean-corpus-n.perl usage.)
  perl "$MOSESSCRIPTS/training/clean-corpus-n.perl" \
    "$CORPUS" "$L1" "$L2" "$CORPUS.clean" 1 40

  # Run each side of the cleaned corpus through Apertium up to the pretransfer
  # mode of the matching translation direction.
  apertium-destxt < "$CORPUS.clean.$L1" \
    | apertium -f none -d "$DIR" "$L1-$L2-pretransfer" > "$CORPUS.tagged.$L1"
  apertium-destxt < "$CORPUS.clean.$L2" \
    | apertium -f none -d "$DIR" "$L2-$L1-pretransfer" > "$CORPUS.tagged.$L2"

  echo "0000        Clean and tagged corpus length:"
  wc -l "$CORPUS".*

  # The cleaned surface-form files are needed again later; park them in the
  # temporary directory.
  mv -- "$CORPUS.clean.$L1" "$CORPUS.clean.$L2" "$LOCALTEMP"
  26.  
  # Number the lines of the tagged corpus and drop every sentence pair whose
  # pasted line contains no '<' (i.e. neither side carries any tag), so that
  # later stages can map surviving lines back to their line numbers in the
  # original [surface form] corpus.
  #
  # Globals read: CORPUS, L1, L2
  # Reads:        $CORPUS.tagged.$L1, $CORPUS.tagged.$L2
  # Writes:       $CORPUS.lines (original line number of every kept pair) and
  #               rewrites both tagged files with only the kept lines.
  # Returns:      non-zero if the scratch file cannot be created, otherwise the
  #               status of the final mv.
  filter_untagged_pairs() {
    local total kept

    # $((...)) normalises the count (padding from some wc implementations,
    # empty output when the file is missing) to a plain integer.
    total=$(wc -l < "$CORPUS.tagged.$L1")
    seq 1 "$((total))" > "$CORPUS.lines"

    # Build the filtered three-column table once and slice it per column,
    # instead of re-running paste | grep over the whole corpus for each column.
    kept=$(mktemp) || return 1
    paste "$CORPUS.lines" "$CORPUS.tagged.$L1" "$CORPUS.tagged.$L2" \
      | grep '<' > "$kept"
    cut -f1 "$kept" > "$CORPUS.lines.new"
    cut -f2 "$kept" > "$CORPUS.tagged.$L1.new"
    cut -f3 "$kept" > "$CORPUS.tagged.$L2.new"
    rm -f -- "$kept"

    # From here on the files hold only the lines that survived tagging.
    mv -- "$CORPUS.lines.new" "$CORPUS.lines"
    mv -- "$CORPUS.tagged.$L1.new" "$CORPUS.tagged.$L1"
    mv -- "$CORPUS.tagged.$L2.new" "$CORPUS.tagged.$L2"
  }

  filter_untagged_pairs

  echo "0001        Tagged and stripped corpus length:"
  wc -l "$CORPUS".*
  44.  
  # Look the tagged source side up in the bilingual dictionary.
  lt-proc -b "$DIR/$BILTRANS" < "$CORPUS.tagged.$L1" > "$CORPUS.biltrans.$L1-$L2"

  echo "0002        Biltrans length:"
  wc -l "$CORPUS".biltrans.*

  # Files inside $LOCALTEMP are addressed by the corpus basename so that this
  # still works when $CORPUS carries a directory component.
  poly_report="$LOCALTEMP/${CORPUS##*/}.biltrans.$L1-$L2.poly"

  # Only stdout of calc-corpus-poly.py is kept; its stderr is discarded.
  python3 "$SCRIPTS/calc-corpus-poly.py" \
    < "$CORPUS.biltrans.$L1-$L2" 2>/dev/null > "$poly_report"

  echo "0003        Corpus polysemy count:"
  # The figure of interest is the last line of the report.
  tail -n 1 "$poly_report"
  54.  
  # Convert tagger and biltrans output into the token files used for alignment.
  # NOTE(review): these three helpers are run with `python`, while other helpers
  # in this script use `python3` -- confirm the intended interpreter.
  python "$SCRIPTS/process-tagger-output.py" "$L1" \
    < "$CORPUS.tagged.$L1" > "$CORPUS.token.$L1"
  python "$SCRIPTS/process-tagger-output.py" "$L2" \
    < "$CORPUS.tagged.$L2" > "$CORPUS.token.$L2"
  python "$SCRIPTS/process-biltrans-output.py" \
    < "$CORPUS.biltrans.$L1-$L2" > "$CORPUS.token.$L1-$L2"

  echo "0004        Tokenised corpus length:"
  wc -l "$CORPUS".token.*

  # Line numbers, tagged text and raw biltrans output are no longer needed in
  # the working directory; keep them in the temporary directory.
  mv -- "$CORPUS.lines" \
    "$CORPUS.tagged.$L1" \
    "$CORPUS.tagged.$L2" \
    "$CORPUS.biltrans.$L1-$L2" \
    "$LOCALTEMP"
  66.  
  # Clean the corpus a second time to remove lines which may have become too
  # long or otherwise malformed through processing with the Apertium tools.
  # The extra last argument names the file used further down as the list of
  # retained lines.
  perl "$MOSESSCRIPTS/training/clean-corpus-n.perl" \
    "$CORPUS.token" "$L1" "$L2" "$CORPUS.ctoken" 1 40 "$CORPUS.ctoken.retained"

  echo "0005        Cleaned and tokenised corpus length:"
  wc -l "$CORPUS".ctoken.*

  # Cut the biltrans token file down to the lines listed in the retained file.
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$CORPUS.token.$L1-$L2" "$CORPUS.ctoken.retained" > "$CORPUS.ctoken.$L1-$L2"

  # The uncleaned token files move to the temporary directory.
  mv -- "$CORPUS.token.$L1" "$CORPUS.token.$L2" "$CORPUS.token.$L1-$L2" "$LOCALTEMP"
  79.  
  # Train the Moses model on the cleaned token files; all output of the
  # training run goes to ./log.
  perl "$MOSESSCRIPTS/training/train-model.perl" \
    -scripts-root-dir "$MOSESSCRIPTS" \
    -root-dir . \
    -corpus "$CORPUS.ctoken" -f "$L1" -e "$L2" \
    -alignment grow-diag-final-and \
    -reordering msd-bidirectional-fe \
    -lm "0:5:$LM:0" > log 2>&1

  # Save the status immediately: every later command (including the test in
  # the `if`) overwrites $?, which would make the message always report 0 and
  # a bare `exit` return success.
  train_status=$?
  if [ "$train_status" -ne 0 ]; then
      echo "$train_status is bad."
      exit "$train_status"
  fi
  86.  
  # Convert the final GIZA++ alignment file into the phrase table used below.
  zcat "giza.$L1-$L2/$L1-$L2.A3.final.gz" \
    | "$SCRIPTS/giza-to-moses.awk" > "$CORPUS.phrasetable.$L1-$L2"

  wc -l "$CORPUS.phrasetable.$L1-$L2"

  # Training is done; the per-language cleaned token files and the list of
  # retained lines move to the temporary directory.
  mv -- "$CORPUS.ctoken.$L1" "$CORPUS.ctoken.$L2" "$CORPUS.ctoken.retained" "$LOCALTEMP"
  94.  
  ## Extract the sentences that we are going to process to make the rules
  ## (diagnostics of the extraction go to $LOCALTEMP/extract.log).

  python3 "$SCRIPTS/extract-sentences.py" \
    "$CORPUS.phrasetable.$L1-$L2" "$CORPUS.ctoken.$L1-$L2" \
    > "$CORPUS.candidates.$L1-$L2" 2> "$LOCALTEMP/extract.log"

  ## Split the sentences into train/test/dev
  # The helper is taken from $SCRIPTS like every other helper script, instead
  # of a hard-coded absolute path.
  # NOTE(review): the meaning of the trailing 1000 is defined by
  # split-sentences.py (presumably the size of the held-out sets) -- confirm.
  nohup time python3 "$SCRIPTS/split-sentences.py" "$CORPUS.candidates.$L1-$L2" 1000

  wc -l "$CORPUS.train.$L1-$L2"
  wc -l "$CORPUS.tst.$L1-$L2".*
  wc -l "$CORPUS.dev.$L1-$L2".*
  105.  
  ## Extract the test and dev sets from the original files.

  # Files inside $LOCALTEMP were moved there with mv and therefore carry only
  # the basename of $CORPUS; address them that way so a corpus prefix with a
  # directory component still resolves.
  corpus_name=${CORPUS##*/}

  # For each language, apply the same two line selections to the cleaned
  # surface-form text that the tagged/tokenised corpus went through: first the
  # kept line numbers ($corpus_name.lines), then the lines retained by the
  # second cleaning pass ($corpus_name.ctoken.retained).
  for lang in "$L1" "$L2"; do
    python3 "$SCRIPTS/biltrans-only-retained.py" \
      "$LOCALTEMP/$corpus_name.clean.$lang" "$LOCALTEMP/$corpus_name.lines" \
      > "$LOCALTEMP/$corpus_name.$lang.token"
    python3 "$SCRIPTS/biltrans-only-retained.py" \
      "$LOCALTEMP/$corpus_name.$lang.token" "$LOCALTEMP/$corpus_name.ctoken.retained" \
      > "$LOCALTEMP/$corpus_name.$lang.ctoken"
  done

  ## Check to see if the token and ctoken lengths match up

  # Each wc call prints the surface-form file next to its processed counterpart.
  for stage in token ctoken; do
    for lang in "$L1" "$L2"; do
      wc -l "$LOCALTEMP/$corpus_name.$lang.$stage" "$LOCALTEMP/$corpus_name.$stage.$lang"
    done
  done
  120.  
  # Files inside $LOCALTEMP are addressed by the basename of $CORPUS, so a
  # corpus prefix with a directory component still resolves.
  corpus_name=${CORPUS##*/}

  # The first space-separated field of every .src line is taken as the line
  # number of that sentence; collect them for the test and dev sets, and both
  # together as the list of lines reserved from training.
  cut -f1 -d' ' "$CORPUS.tst.$L1-$L2.src" > "$LOCALTEMP/$corpus_name.tst.lines"
  cut -f1 -d' ' "$CORPUS.dev.$L1-$L2.src" > "$LOCALTEMP/$corpus_name.dev.lines"
  cat "$LOCALTEMP/$corpus_name.tst.lines" "$LOCALTEMP/$corpus_name.dev.lines" \
    > "$LOCALTEMP/$corpus_name.reserved.lines"

  # Surface-form test and dev sets for both languages.
  for split in tst dev; do
    for lang in "$L1" "$L2"; do
      python3 "$SCRIPTS/biltrans-only-retained.py" \
        "$LOCALTEMP/$corpus_name.$lang.ctoken" "$LOCALTEMP/$corpus_name.$split.lines" \
        > "$CORPUS.$split.$lang"
    done
  done

  wc -l "$CORPUS".tst.*
  wc -l "$CORPUS".dev.*

  # The training text is produced by strip-test-lines.py from the same input
  # and the reserved line list.
  for lang in "$L1" "$L2"; do
    python3 "$SCRIPTS/strip-test-lines.py" \
      "$LOCALTEMP/$corpus_name.$lang.ctoken" "$LOCALTEMP/$corpus_name.reserved.lines" \
      > "$CORPUS.train.$lang"
  done
  135.  
  # Build the frequency lexicon from the training candidates.
  # NOTE(review): both helpers below are run with `python`, not `python3` --
  # confirm the intended interpreter.
  python "$SCRIPTS/extract-freq-lexicon.py" "$CORPUS.train.$L1-$L2" > "$CORPUS.lexicon.$L1-$L2"

  # Count n-gram patterns over the same training file using that lexicon;
  # diagnostics go to $LOCALTEMP/ngram-count.log.
  python "$SCRIPTS/ngram-count-patterns.py" \
    "$CORPUS.lexicon.$L1-$L2" "$CORPUS.train.$L1-$L2" \
    2> "$LOCALTEMP/ngram-count.log" > "$CORPUS.ngrams.$L1-$L2"
  139.  
  # Build the basic language models of surface forms with IRSTLM: one 5-gram
  # and one 1-gram model over the target-language training text. For each
  # order the model is built, compiled to $CORPUS.$L2.<order>.blm, and the
  # intermediate .lm.gz is then moved to the temporary directory.
  for order in 5 1; do
    build-lm.sh -i "$CORPUS.train.$L2" -o "$CORPUS.$L2.$order.lm.gz" -n "$order" -b
    compile-lm "$CORPUS.$L2.$order.lm.gz" "$CORPUS.$L2.$order.blm"
    mv -- "$CORPUS.$L2.$order.lm.gz" "$LOCALTEMP"
  done
  148.  
  # Extract the alignment defaults as rules.
  python3 "$SCRIPTS/extract-alig-lrx.py" "$CORPUS.lexicon.$L1-$L2" \
    > "$CORPUS.$L1-$L2.alig.lrx"

  # Extract the biltrans of the candidate lines (for generating TLM best selections)
  python3 "$SCRIPTS/extract-biltrans-cand.py" "$CORPUS.train.$L1-$L2" \
    > "$CORPUS.train.$L1-$L2.biltrans"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement