Guest User

Untitled

a guest
Sep 10th, 2012
31
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # Fixed settings. Each keeps its original default but may be overridden by
  # exporting a variable of the same name before running the script.
  # NOTE(review): BILTRANS defaults to the en-es dictionary no matter which
  # pair is passed as $3 -- confirm, or export BILTRANS for other pairs.
  BILTRANS=${BILTRANS:-en-es.autobil.bin}
  LM=${LM:-/home/fran/corpora/europarl/europarl.lm}
  LOCALTEMP=${LOCALTEMP:-temp/}
  SCRIPTS=${SCRIPTS:-/home/fran/source/apertium-lex-tools/scripts/}
  MOSESSCRIPTS=${MOSESSCRIPTS:-/home/fran/local/bin/scripts-20120109-1229/}

  # Positional arguments:
  #   $1  corpus file prefix (the script reads and writes "$CORPUS.<suffix>")
  #   $2  directory handed to `apertium -d` and searched for $BILTRANS
  #   $3  language pair written as "src-trg"
  CORPUS=$1
  DIR=$2

  # L1 / L2 are the first and second '-'-separated fields of $3. Parameter
  # expansion gives the same fields as `cut -f1` / `cut -f2 -d'-'` without
  # word-splitting or globbing the argument through an unquoted `echo`.
  L1=${3%%-*}
  pair_rest=${3#*-}
  L2=${pair_rest%%-*}
  11.  
  # Example invocation:
  #   sh setup-corpus.sh oab /home/fran/source/apertium-br-fr/ br-fr

  # Directory that receives intermediate files once a stage is done with them.
  mkdir -p -- "$LOCALTEMP" || exit 1

  # First cleaning pass over the raw parallel corpus; writes $CORPUS.clean.$L1
  # and $CORPUS.clean.$L2 (the files consumed just below).
  # NOTE(review): "1 40" are presumably the minimum and maximum sentence
  # lengths accepted by the Moses cleaner -- confirm against its usage text.
  perl "$MOSESSCRIPTS/training/clean-corpus-n.perl" \
    "$CORPUS" "$L1" "$L2" "$CORPUS.clean" 1 40

  # Run each side through the Apertium pipeline in its own direction, up to
  # the "pretransfer" mode, to obtain the tagged corpus.
  apertium-destxt < "$CORPUS.clean.$L1" \
    | apertium -f none -d "$DIR" "$L1-$L2-pretransfer" > "$CORPUS.tagged.$L1"
  apertium-destxt < "$CORPUS.clean.$L2" \
    | apertium -f none -d "$DIR" "$L2-$L1-pretransfer" > "$CORPUS.tagged.$L2"

  echo "0000        Clean and tagged corpus length:"
  # The glob is intentional: report every file derived from the corpus prefix.
  wc -l "$CORPUS".*

  # The cleaned surface text is needed again only when the test/dev sets are
  # rebuilt from it, so park it in the temp directory.
  mv -- "$CORPUS.clean.$L1" "$LOCALTEMP"
  mv -- "$CORPUS.clean.$L2" "$LOCALTEMP"
  26.  
  # We need to number the lines in the corpus so that later, when we discard
  # lines, we can find out what the line numbers were in the original
  # [surface form] corpus.

  # $(( ... )) normalises the count: it strips any padding `wc` may print and
  # turns an empty result into 0.
  NUMLINES=$(($(wc -l < "$CORPUS.tagged.$L1")))
  seq 1 "$NUMLINES" > "$CORPUS.lines"

  # Build "line-number <TAB> tagged-L1 <TAB> tagged-L2" rows and keep only
  # the rows that contain a '<' (the line-number column never does, so the
  # match has to come from one of the tagged sides). The filter runs once
  # and the three columns are then cut from the same filtered file.
  NUMBERED="$LOCALTEMP/$CORPUS.numbered"
  paste "$CORPUS.lines" "$CORPUS.tagged.$L1" "$CORPUS.tagged.$L2" \
    | grep '<' > "$NUMBERED"
  cut -f1 "$NUMBERED" > "$CORPUS.lines.new"
  cut -f2 "$NUMBERED" > "$CORPUS.tagged.$L1.new"
  cut -f3 "$NUMBERED" > "$CORPUS.tagged.$L2.new"
  rm -f -- "$NUMBERED"

  # This is the number of lines after removing blank lines from tagging.
  mv -- "$CORPUS.lines.new" "$CORPUS.lines"
  mv -- "$CORPUS.tagged.$L1.new" "$CORPUS.tagged.$L1"
  mv -- "$CORPUS.tagged.$L2.new" "$CORPUS.tagged.$L2"

  echo "0001        Tagged and stripped corpus length:"
  wc -l "$CORPUS".*
  44.  
  # Look up the tagged source side in the bilingual dictionary.
  lt-proc -b "$DIR/$BILTRANS" < "$CORPUS.tagged.$L1" > "$CORPUS.biltrans.$L1-$L2"

  echo "0002        Biltrans length:"
  wc -l "$CORPUS.biltrans".*

  # Polysemy statistics go straight to the temp directory. stderr of the
  # helper is discarded on purpose, exactly as before.
  python3 "$SCRIPTS/calc-corpus-poly.py" \
    < "$CORPUS.biltrans.$L1-$L2" 2>/dev/null \
    > "$LOCALTEMP/$CORPUS.biltrans.$L1-$L2.poly"

  echo "0003        Corpus polysemy count:"
  # Only the last line of the report is shown.
  tail -n 1 "$LOCALTEMP/$CORPUS.biltrans.$L1-$L2.poly"
  54.  
  # Convert tagger and biltrans output into the "token" files that the second
  # cleaning pass and the retained-line filter read.
  # NOTE(review): these three helpers run under `python` while other steps in
  # this script use `python3` -- confirm which interpreter they require.
  python "$SCRIPTS/process-tagger-output.py" "$L1" \
    < "$CORPUS.tagged.$L1" > "$CORPUS.token.$L1"
  python "$SCRIPTS/process-tagger-output.py" "$L2" \
    < "$CORPUS.tagged.$L2" > "$CORPUS.token.$L2"
  python "$SCRIPTS/process-biltrans-output.py" \
    < "$CORPUS.biltrans.$L1-$L2" > "$CORPUS.token.$L1-$L2"

  echo "0004        Tokenised corpus length:"
  wc -l "$CORPUS.token".*

  # These inputs are no longer needed in the working directory.
  mv -- "$CORPUS.lines" "$LOCALTEMP"
  mv -- "$CORPUS.tagged.$L1" "$LOCALTEMP"
  mv -- "$CORPUS.tagged.$L2" "$LOCALTEMP"
  mv -- "$CORPUS.biltrans.$L1-$L2" "$LOCALTEMP"
  66.  
  # We clean the corpus again to remove lines which may have become too long
  # or manky because of the processing with Apertium tools. The extra last
  # argument names the file that the retained-line filter below reads.
  perl "$MOSESSCRIPTS/training/clean-corpus-n.perl" \
    "$CORPUS.token" "$L1" "$L2" "$CORPUS.ctoken" 1 40 "$CORPUS.ctoken.retained"

  echo "0005        Cleaned and tokenised corpus length:"
  wc -l "$CORPUS.ctoken".*

  # Bring the biltrans token file in line with the cleaned corpus.
  # NOTE(review): presumably keeps only the lines listed in the .retained
  # file -- confirm against biltrans-only-retained.py.
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$CORPUS.token.$L1-$L2" "$CORPUS.ctoken.retained" > "$CORPUS.ctoken.$L1-$L2"

  mv -- "$CORPUS.token.$L1" "$LOCALTEMP"
  mv -- "$CORPUS.token.$L2" "$LOCALTEMP"
  mv -- "$CORPUS.token.$L1-$L2" "$LOCALTEMP"
  79.  
  # Train the Moses model on the cleaned token corpus. Everything the trainer
  # prints (stdout and stderr) is collected in ./log.
  perl "$MOSESSCRIPTS/training/train-model.perl" \
    -scripts-root-dir "$MOSESSCRIPTS" -root-dir . \
    -corpus "$CORPUS.ctoken" -f "$L1" -e "$L2" \
    -alignment grow-diag-final-and -reordering msd-bidirectional-fe \
    -lm "0:5:$LM:0" > log 2>&1
  # Save the status immediately: every later command, including the test in
  # the `if` itself, overwrites $?.
  train_status=$?

  # Abort on failure, reporting and propagating the trainer's real exit code.
  # POSIX `[` is used so the check also works when the script is run with `sh`.
  if [ "$train_status" -ne 0 ]; then
    echo "$train_status is bad." >&2
    exit "$train_status"
  fi
  86.  
  # Convert the final GIZA alignment file left in the training root into the
  # phrase-table file read by extract-sentences.py.
  zcat "giza.$L1-$L2/$L1-$L2.A3.final.gz" \
    | "$SCRIPTS/giza-to-moses.awk" > "$CORPUS.phrasetable.$L1-$L2"

  wc -l "$CORPUS.phrasetable.$L1-$L2"

  # Training is finished; park its inputs in the temp directory. They are
  # read from there again when the length checks are printed.
  mv -- "$CORPUS.ctoken.$L1" "$LOCALTEMP"
  mv -- "$CORPUS.ctoken.$L2" "$LOCALTEMP"
  mv -- "$CORPUS.ctoken.retained" "$LOCALTEMP"
  94.  
  ## Extract the sentences that we are going to process to make the rules

  python3 "$SCRIPTS/extract-sentences.py" \
    "$CORPUS.phrasetable.$L1-$L2" "$CORPUS.ctoken.$L1-$L2" \
    > "$CORPUS.candidates.$L1-$L2" 2> "$LOCALTEMP/extract.log"

  ## Split the sentences into train/test/dev
  # The helper is resolved through $SCRIPTS like every other helper script,
  # instead of a second hard-coded copy of the same directory.
  # NOTE(review): the meaning of the trailing 1000 is defined by
  # split-sentences.py -- presumably a set size; confirm.
  nohup time python3 "$SCRIPTS/split-sentences.py" "$CORPUS.candidates.$L1-$L2" 1000

  wc -l "$CORPUS.train.$L1-$L2"
  wc -l "$CORPUS.tst.$L1-$L2".*
  wc -l "$CORPUS.dev.$L1-$L2".*
  105.  
  ## Extract the test and dev sets from the original files.

  # Replay both filtering stages on the cleaned surface text of each side:
  # first against the .lines file, then against the .retained file.
  # NOTE(review): presumably biltrans-only-retained.py keeps the lines of its
  # first argument that are listed in its second -- confirm.
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.clean.$L1" "$LOCALTEMP/$CORPUS.lines" \
    > "$LOCALTEMP/$CORPUS.$L1.token"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L1.token" "$LOCALTEMP/$CORPUS.ctoken.retained" \
    > "$LOCALTEMP/$CORPUS.$L1.ctoken"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.clean.$L2" "$LOCALTEMP/$CORPUS.lines" \
    > "$LOCALTEMP/$CORPUS.$L2.token"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L2.token" "$LOCALTEMP/$CORPUS.ctoken.retained" \
    > "$LOCALTEMP/$CORPUS.$L2.ctoken"

  ## Check to see if the token and ctoken lengths match up

  # Each call prints the rebuilt surface file next to its tagged counterpart
  # so the two line counts can be compared by eye.
  wc -l "$LOCALTEMP/$CORPUS.$L1.token" "$LOCALTEMP/$CORPUS.token.$L1"
  wc -l "$LOCALTEMP/$CORPUS.$L2.token" "$LOCALTEMP/$CORPUS.token.$L2"
  wc -l "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.ctoken.$L1"
  wc -l "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.ctoken.$L2"
  120.  
  # The first space-separated field of each tst/dev source line is written to
  # a .lines file; the two lists are also concatenated into the "reserved"
  # list that is handed to strip-test-lines.py.
  cut -f1 -d' ' < "$CORPUS.tst.$L1-$L2.src" > "$LOCALTEMP/$CORPUS.tst.lines"
  cut -f1 -d' ' < "$CORPUS.dev.$L1-$L2.src" > "$LOCALTEMP/$CORPUS.dev.lines"
  cat "$LOCALTEMP/$CORPUS.tst.lines" "$LOCALTEMP/$CORPUS.dev.lines" \
    > "$LOCALTEMP/$CORPUS.reserved.lines"

  # Build the tst and dev files for both languages from the surface ctoken
  # files, using those line lists.
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.tst.lines" > "$CORPUS.tst.$L1"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.tst.lines" > "$CORPUS.tst.$L2"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.dev.lines" > "$CORPUS.dev.$L1"
  python3 "$SCRIPTS/biltrans-only-retained.py" \
    "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.dev.lines" > "$CORPUS.dev.$L2"

  wc -l "$CORPUS.tst".*
  wc -l "$CORPUS.dev".*
  132.  
  # Surface-form training sets for each language, built from the ctoken files
  # and the reserved (tst + dev) line list.
  # NOTE(review): presumably strip-test-lines.py drops the reserved lines --
  # confirm against the helper.
  python3 "$SCRIPTS/strip-test-lines.py" \
    "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.reserved.lines" > "$CORPUS.train.$L1"
  python3 "$SCRIPTS/strip-test-lines.py" \
    "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.reserved.lines" > "$CORPUS.train.$L2"

  # Frequency lexicon from the $L1-$L2 training file.
  python "$SCRIPTS/extract-freq-lexicon.py" "$CORPUS.train.$L1-$L2" > "$CORPUS.lexicon.$L1-$L2"

  # N-gram pattern counts; the helper's stderr is kept in the temp directory.
  python "$SCRIPTS/ngram-count-patterns.py" \
    "$CORPUS.lexicon.$L1-$L2" "$CORPUS.train.$L1-$L2" \
    2> "$LOCALTEMP/ngram-count.log" > "$CORPUS.ngrams.$L1-$L2"
  139.  
  # Build the basic language models of surface forms with IRSTLM.
  # Two models are built from the $L2 training text, one with -n 5 and one
  # with -n 1. Each is compiled to a .blm file, after which the intermediate
  # .lm.gz is moved to the temp directory.
  build-lm.sh -i "$CORPUS.train.$L2" -o "$CORPUS.$L2.5.lm.gz" -n 5 -b
  compile-lm "$CORPUS.$L2.5.lm.gz" "$CORPUS.$L2.5.blm"
  mv -- "$CORPUS.$L2.5.lm.gz" "$LOCALTEMP"

  build-lm.sh -i "$CORPUS.train.$L2" -o "$CORPUS.$L2.1.lm.gz" -n 1 -b
  compile-lm "$CORPUS.$L2.1.lm.gz" "$CORPUS.$L2.1.blm"
  mv -- "$CORPUS.$L2.1.lm.gz" "$LOCALTEMP"
  148.  
  # Extract the alignment defaults as rules.
  python3 "$SCRIPTS/extract-alig-lrx.py" "$CORPUS.lexicon.$L1-$L2" > "$CORPUS.$L1-$L2.alig.lrx"

  # Extract the biltrans of the candidate lines (for generating TLM best selections)
  python3 "$SCRIPTS/extract-biltrans-cand.py" "$CORPUS.train.$L1-$L2" \
    > "$CORPUS.train.$L1-$L2.biltrans"
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×