Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Corpus setup for Apertium lexical-selection rule training.
# Usage: sh setup-corpus.sh <corpus-prefix> <apertium-pair-dir> <l1-l2>
#   e.g. sh setup-corpus.sh oab /home/fran/source/apertium-br-fr/ br-fr

BILTRANS=en-es.autobil.bin                       # bilingual dictionary, looked up inside $DIR
LM=/home/fran/corpora/europarl/europarl.lm       # target-side LM handed to Moses train-model.perl
LOCALTEMP=temp/                                  # scratch dir where intermediate files are parked
SCRIPTS=/home/fran/source/apertium-lex-tools/scripts/
MOSESSCRIPTS=/home/fran/local/bin/scripts-20120109-1229/

CORPUS=$1   # corpus file prefix (clean/tagged/token files are derived from it)
DIR=$2      # path to the Apertium language-pair directory
# Split the "l1-l2" pair code (e.g. "br-fr") into its two language codes.
L1=$(echo "$3" | cut -f1 -d'-')
L2=$(echo "$3" | cut -f2 -d'-')
mkdir -p "$LOCALTEMP"

# Drop sentence pairs that are empty or longer than 40 tokens.
perl "$MOSESSCRIPTS"/training/clean-corpus-n.perl "$CORPUS" "$L1" "$L2" "$CORPUS.clean" 1 40

# Analyse and tag each side up to the pretransfer stage of the Apertium pipeline.
apertium-destxt < "$CORPUS.clean.$L1" | apertium -f none -d "$DIR" "$L1-$L2-pretransfer" > "$CORPUS.tagged.$L1"
apertium-destxt < "$CORPUS.clean.$L2" | apertium -f none -d "$DIR" "$L2-$L1-pretransfer" > "$CORPUS.tagged.$L2"
echo "0000 Clean and tagged corpus length:"
wc -l "$CORPUS".*
mv "$CORPUS.clean.$L1" "$LOCALTEMP"
mv "$CORPUS.clean.$L2" "$LOCALTEMP"

# We need to number the lines in the corpus so that later, when we discard lines,
# we can find out what the line numbers were in the original [surface form] corpus.
NUMLINES=$(wc -l < "$CORPUS.tagged.$L1")
seq 1 "$NUMLINES" > "$CORPUS.lines"
# Keep only lines where tagging produced analyses (they contain '<').  Pasting
# line numbers with both sides keeps the three files filtered in lockstep.
paste "$CORPUS.lines" "$CORPUS.tagged.$L1" "$CORPUS.tagged.$L2" | grep '<' | cut -f1 > "$CORPUS.lines.new"
paste "$CORPUS.lines" "$CORPUS.tagged.$L1" "$CORPUS.tagged.$L2" | grep '<' | cut -f2 > "$CORPUS.tagged.$L1.new"
paste "$CORPUS.lines" "$CORPUS.tagged.$L1" "$CORPUS.tagged.$L2" | grep '<' | cut -f3 > "$CORPUS.tagged.$L2.new"
# This is the number of lines after removing blank lines from tagging.
mv "$CORPUS.lines.new" "$CORPUS.lines"
mv "$CORPUS.tagged.$L1.new" "$CORPUS.tagged.$L1"
mv "$CORPUS.tagged.$L2.new" "$CORPUS.tagged.$L2"
echo "0001 Tagged and stripped corpus length:"
wc -l "$CORPUS".*
# Look up the tagged L1 corpus in the bilingual dictionary with lt-proc
# in biltrans mode (-b).
lt-proc -b "$DIR/$BILTRANS" < "$CORPUS.tagged.$L1" > "$CORPUS.biltrans.$L1-$L2"
echo "0002 Biltrans length:"
wc -l "$CORPUS".biltrans.*
# Per-corpus polysemy statistics; the script's stderr is noisy, so discard it.
python3 "$SCRIPTS"/calc-corpus-poly.py < "$CORPUS.biltrans.$L1-$L2" 2>/dev/null > "$LOCALTEMP/$CORPUS.biltrans.$L1-$L2.poly"
echo "0003 Corpus polysemy count:"
tail -1 "$LOCALTEMP/$CORPUS.biltrans.$L1-$L2.poly"
# NOTE(review): these three use "python" while the rest of the script uses
# "python3" — confirm whether the scripts are Python 2 before unifying.
python "$SCRIPTS"/process-tagger-output.py "$L1" < "$CORPUS.tagged.$L1" > "$CORPUS.token.$L1"
python "$SCRIPTS"/process-tagger-output.py "$L2" < "$CORPUS.tagged.$L2" > "$CORPUS.token.$L2"
python "$SCRIPTS"/process-biltrans-output.py < "$CORPUS.biltrans.$L1-$L2" > "$CORPUS.token.$L1-$L2"
echo "0004 Tokenised corpus length:"
wc -l "$CORPUS".token.*
mv "$CORPUS.lines" "$LOCALTEMP"
mv "$CORPUS.tagged.$L1" "$LOCALTEMP"
mv "$CORPUS.tagged.$L2" "$LOCALTEMP"
mv "$CORPUS.biltrans.$L1-$L2" "$LOCALTEMP"
# We clean the corpus again to remove lines which may have become too long or
# manky because of the processing with Apertium tools.  The extra trailing
# argument makes clean-corpus-n.perl record which line numbers survived
# (consumed below and again when rebuilding the surface-form sets).
perl "$MOSESSCRIPTS"/training/clean-corpus-n.perl "$CORPUS.token" "$L1" "$L2" "$CORPUS.ctoken" 1 40 "$CORPUS.ctoken.retained"
echo "0005 Cleaned and tokenised corpus length:"
wc -l "$CORPUS".ctoken.*
# Filter the biltrans tokens down to the retained lines so all three
# ctoken files stay parallel.
python3 "$SCRIPTS"/biltrans-only-retained.py "$CORPUS.token.$L1-$L2" "$CORPUS.ctoken.retained" > "$CORPUS.ctoken.$L1-$L2"
mv "$CORPUS.token.$L1" "$LOCALTEMP"
mv "$CORPUS.token.$L2" "$LOCALTEMP"
mv "$CORPUS.token.$L1-$L2" "$LOCALTEMP"
# Word-align the cleaned token corpus via the Moses training pipeline
# (GIZA++ under the hood); all output goes to ./log.
perl "$MOSESSCRIPTS"/training/train-model.perl -scripts-root-dir "$MOSESSCRIPTS" -root-dir . -corpus "$CORPUS.ctoken" -f "$L1" -e "$L2" -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:"$LM":0 >log 2>&1
# BUG FIX: the original echoed $? a second time inside the if-body, but by
# then $? held the status of the [[ ]] test (always 0), and the bare "exit"
# propagated the echo's status (also 0) — so failures were misreported AND
# the script exited successfully.  Capture the status once and exit with it.
rv=$?
if [[ $rv != 0 ]]; then
  echo "$rv is bad." >&2
  exit "$rv"
fi
# Convert GIZA's A3 alignment output into Moses phrase-table format.
zcat "giza.$L1-$L2/$L1-$L2.A3.final.gz" | "$SCRIPTS"/giza-to-moses.awk > "$CORPUS.phrasetable.$L1-$L2"
wc -l "$CORPUS.phrasetable.$L1-$L2"
mv "$CORPUS.ctoken.$L1" "$LOCALTEMP"
mv "$CORPUS.ctoken.$L2" "$LOCALTEMP"
mv "$CORPUS.ctoken.retained" "$LOCALTEMP"
## Extract the sentences that we are going to process to make the rules
python3 "$SCRIPTS"/extract-sentences.py "$CORPUS.phrasetable.$L1-$L2" "$CORPUS.ctoken.$L1-$L2" > "$CORPUS.candidates.$L1-$L2" 2>"$LOCALTEMP"/extract.log
## Split the sentences into train/test/dev.
# CONSISTENCY FIX: the original repeated the hard-coded
# /home/fran/source/apertium-lex-tools/scripts/ path here; use $SCRIPTS.
# (The 1000 presumably sizes the held-out sets — confirm in split-sentences.py.)
nohup time python3 "$SCRIPTS"/split-sentences.py "$CORPUS.candidates.$L1-$L2" 1000
wc -l "$CORPUS.train.$L1-$L2"
wc -l "$CORPUS.tst.$L1-$L2".*
wc -l "$CORPUS.dev.$L1-$L2".*
## Extract the test and dev sets from the original files.
# Replay the same two line filters on the surface-form corpus that were applied
# to the token corpus (first the tagging-blank filter via $CORPUS.lines, then
# the length-cleaning filter via ctoken.retained) so the files line up again.
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.clean.$L1" "$LOCALTEMP/$CORPUS.lines" > "$LOCALTEMP/$CORPUS.$L1.token"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L1.token" "$LOCALTEMP/$CORPUS.ctoken.retained" > "$LOCALTEMP/$CORPUS.$L1.ctoken"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.clean.$L2" "$LOCALTEMP/$CORPUS.lines" > "$LOCALTEMP/$CORPUS.$L2.token"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L2.token" "$LOCALTEMP/$CORPUS.ctoken.retained" > "$LOCALTEMP/$CORPUS.$L2.ctoken"
## Check to see if the token and ctoken lengths match up
wc -l "$LOCALTEMP/$CORPUS.$L1.token" "$LOCALTEMP/$CORPUS.token.$L1"
wc -l "$LOCALTEMP/$CORPUS.$L2.token" "$LOCALTEMP/$CORPUS.token.$L2"
wc -l "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.ctoken.$L1"
wc -l "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.ctoken.$L2"
# The first space-separated field of each .src line is its line number.
cut -f1 -d' ' "$CORPUS.tst.$L1-$L2.src" > "$LOCALTEMP/$CORPUS.tst.lines"
cut -f1 -d' ' "$CORPUS.dev.$L1-$L2.src" > "$LOCALTEMP/$CORPUS.dev.lines"
cat "$LOCALTEMP/$CORPUS.tst.lines" "$LOCALTEMP/$CORPUS.dev.lines" > "$LOCALTEMP/$CORPUS.reserved.lines"
# Pull the test and dev sentences out of the surface-form corpus...
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.tst.lines" > "$CORPUS.tst.$L1"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.tst.lines" > "$CORPUS.tst.$L2"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.dev.lines" > "$CORPUS.dev.$L1"
python3 "$SCRIPTS"/biltrans-only-retained.py "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.dev.lines" > "$CORPUS.dev.$L2"
wc -l "$CORPUS".tst.*
wc -l "$CORPUS".dev.*
# ...and everything that is NOT reserved for test/dev becomes the training set.
python3 "$SCRIPTS"/strip-test-lines.py "$LOCALTEMP/$CORPUS.$L1.ctoken" "$LOCALTEMP/$CORPUS.reserved.lines" > "$CORPUS.train.$L1"
python3 "$SCRIPTS"/strip-test-lines.py "$LOCALTEMP/$CORPUS.$L2.ctoken" "$LOCALTEMP/$CORPUS.reserved.lines" > "$CORPUS.train.$L2"
# Build the frequency lexicon from the training biltrans, then the n-gram
# context patterns around each entry.
# NOTE(review): "python" here vs "python3" elsewhere — confirm whether these
# two scripts are Python 2 before unifying the interpreter.
python "$SCRIPTS"/extract-freq-lexicon.py "$CORPUS.train.$L1-$L2" > "$CORPUS.lexicon.$L1-$L2"
python "$SCRIPTS"/ngram-count-patterns.py "$CORPUS.lexicon.$L1-$L2" "$CORPUS.train.$L1-$L2" 2>"$LOCALTEMP"/ngram-count.log > "$CORPUS.ngrams.$L1-$L2"
# Build the basic language models of surface forms with IRSTLM:
# a 5-gram model and a 1-gram (unigram) model, both compiled to binary form.
build-lm.sh -i "$CORPUS.train.$L2" -o "$CORPUS.$L2.5.lm.gz" -n 5 -b
compile-lm "$CORPUS.$L2.5.lm.gz" "$CORPUS.$L2.5.blm"
mv "$CORPUS.$L2.5.lm.gz" "$LOCALTEMP"
build-lm.sh -i "$CORPUS.train.$L2" -o "$CORPUS.$L2.1.lm.gz" -n 1 -b
compile-lm "$CORPUS.$L2.1.lm.gz" "$CORPUS.$L2.1.blm"
mv "$CORPUS.$L2.1.lm.gz" "$LOCALTEMP"
# Extract the alignment defaults as rules.
python3 "$SCRIPTS"/extract-alig-lrx.py "$CORPUS.lexicon.$L1-$L2" > "$CORPUS.$L1-$L2.alig.lrx"
# Extract the biltrans of the candidate lines (for generating TLM best selections)
python3 "$SCRIPTS"/extract-biltrans-cand.py "$CORPUS.train.$L1-$L2" > "$CORPUS.train.$L1-$L2.biltrans"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement