- ################################################
- ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
- ################################################
- [GENERAL]
- thisFile = /home/nlp/Downloads/SMALL
- ### directory in which experiment is run
- #
- working-dir = $thisFile/
- opt-dir = /home/nlp/opt
- # specification of the language pair
- input-extension = pl
- output-extension = en
- pair-extension = pl-en
- ### directories that contain tools and data
- #
- # moses
- moses-src-dir = $opt-dir/moses
- #
- # moses binaries
- moses-bin-dir = $moses-src-dir/bin
- #
- # moses scripts
- moses-script-dir = $moses-src-dir/scripts
- #
- # directory where the GIZA++/MGIZA programs reside
- external-bin-dir = $opt-dir/mgizapp/bin
- #
- # srilm
- srilm-dir = $opt-dir/srilm/bin/i686-m64
- #
- # irstlm
- #irstlm-dir = $opt-dir/irstlm/bin
- #
- # randlm
- #randlm-dir = $opt-dir/randlm/bin
- #
- # data
- wmt12-data = $thisFile
- ### basic tools
- #
- # moses decoder
- decoder = $moses-bin-dir/moses
- # conversion of phrase table into binary on-disk format
- #ttable-binarizer = $moses-bin-dir/processPhraseTable
- filter-settings = "-MinScore 2:0.0001,0:0.0001"
- # conversion of rule table into binary on-disk format
- ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
- input-tokenizer = "/home/nlp/opt/moses/scripts/tokenizer/normalize-punctuation.perl $input-extension | $moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
- output-tokenizer = "/home/nlp/opt/moses/scripts/tokenizer/normalize-punctuation.perl $output-extension | $moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
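- #
- # for reference, the tokenizer setting above corresponds roughly to this
- # manual pipeline (a sketch; the file name corpus.pl is only an example):
- #   $moses-script-dir/tokenizer/normalize-punctuation.perl pl < corpus.pl \
- #     | $moses-script-dir/tokenizer/tokenizer.perl -a -l pl > corpus.tok.pl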
- #input-truecaser = $moses-script-dir/recaser/truecase.perl
- #output-truecaser = $moses-script-dir/recaser/truecase.perl
- #detruecaser = $moses-script-dir/recaser/detruecase.perl
- ### generic parallelizer for cluster and multi-core machines
- # you may specify a script that allows the parallel execution of
- # parallelizable steps (see meta file). you also need to specify
- # the number of jobs (cluster) or cores (multi-core)
- #
- #generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
- generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl
- ### cluster settings (if run on a cluster machine)
- # number of jobs to be submitted in parallel
- #
- #jobs = 10
- # arguments to qsub when scheduling a job
- #qsub-settings = ""
- # project for privileges and usage accounting
- #qsub-project = iccs_smt
- # memory and time
- #qsub-memory = 4
- #qsub-hours = 48
- ### multi-core settings
- # when the generic parallelizer is used, the number of cores
- # is specified here
- cores = 2
- #input-splitter = $moses-script-dir/generic/compound-splitter.perl
- #################################################################
- ### PARALLEL CORPUS PREPARATION:
- ### create a tokenized, sentence-aligned corpus, ready for training
- #################################################################
- [CORPUS]
- ### long sentences are filtered out, since they slow down GIZA++
- # and are a less reliable source of data. set here the maximum
- # length of a sentence
- #
- max-sentence-length = 80
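- #
- # a rough standalone equivalent of this filtering step (a sketch,
- # assuming the raw files are named train.pl / train.en):
- #   $moses-script-dir/training/clean-corpus-n.perl train pl en train.clean 1 80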
- [CORPUS:tags]
- ### command to run to get raw corpus files
- #
- # get-corpus-script =
- ### raw corpus files (untokenized, but sentence aligned)
- #
- raw-stem = $wmt12-data/train
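- # note: with the language pair above, this stem resolves to the files
- # $wmt12-data/train.$input-extension and $wmt12-data/train.$output-extension
- # (i.e. train.pl and train.en)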
- ### tokenized corpus files (may contain long sentences)
- #
- #tokenized-stem =
- ### if sentence filtering should be skipped,
- # point to the clean training data
- #
- #clean-stem =
- ### if corpus preparation should be skipped,
- # point to the prepared training data
- #
- #lowercased-stem =
- #[CORPUS:inne]
- ### command to run to get raw corpus files
- #
- # get-corpus-script =
- ### raw corpus files (untokenized, but sentence aligned)
- #
- #raw-stem = $wmt12-data/inne
- #################################################################
- ### LANGUAGE MODEL TRAINING
- #################################################################
- [LM]
- ### tool to be used for language model training
- # srilm
- lm-training = $srilm-dir/ngram-count
- settings = "-interpolate -kndiscount -unk"
- # irstlm training
- # msb = modified kneser ney; p=0 no singleton pruning
- #lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
- #settings = "-s msb -p 0"
- # order of the language model
- order = 3
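- #
- # for reference, the settings above amount to roughly this standalone
- # SRILM call (a sketch; lm.en.tok and lm.en.arpa are example file names):
- #   $srilm-dir/ngram-count -order 3 -interpolate -kndiscount -unk \
- #     -text lm.en.tok -lm lm.en.arpa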
- ### tool to be used for training a randomized language model from scratch
- # (more commonly, an SRILM model is trained)
- #
- #rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"
- ### script to use for binary table format for irstlm or kenlm
- # (default: no binarization)
- # irstlm
- #lm-binarizer = $irstlm-dir/compile-lm
- # kenlm, also set type to 8
- lm-binarizer = $moses-bin-dir/build_binary
- type = 8
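- #
- # the binarizer above is KenLM's build_binary; run standalone it looks
- # roughly like this (a sketch; file names are examples only):
- #   $moses-bin-dir/build_binary lm.en.arpa lm.en.binlm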
- ### script to create quantized language model format (irstlm)
- # (default: no quantization)
- #
- #lm-quantizer = $irstlm-dir/quantize-lm
- ### script to use for converting into randomized table format
- # (default: no randomization)
- #
- #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
- ### each language model to be used has its own section here
- [LM:traintags]
- ### command to run to get raw corpus files
- #
- #get-corpus-script = ""
- ### raw corpus (untokenized)
- #
- raw-corpus = $wmt12-data/lm.$output-extension
- ### tokenized corpus files (may contain long sentences)
- #
- #tokenized-corpus =
- ### if corpus preparation should be skipped,
- # point to the prepared language model
- #
- #lm =
- #[LM:trainfr]
- #raw-corpus = $wmt12-data/trainfr.$output-extension
- #[LM:traintags=pos]
- #factors = "pos"
- #order = 6
- #settings = "-interpolate -unk"
- #raw-corpus = $wmt12-data/trainbigfr.$output-extension
- #################################################################
- ### FACTOR DEFINITION
- #################################################################
- #[INPUT-FACTOR]
- # also used for output factors
- #temp-dir = $working-dir/training/factor
- #[OUTPUT-FACTOR:pos]
- ### script that generates this factor
- #
- #mxpost = $opt-dir/moses/contrib/jmx
- #factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost -mx8096m $mxpost"
- #################################################################
- ### INTERPOLATING LANGUAGE MODELS
- #################################################################
- #[INTERPOLATED-LM]
- # if multiple language models are used, these may be combined
- # by optimizing perplexity on a tuning set
- # see, for instance [Koehn and Schwenk, IJCNLP 2008]
- ### script to interpolate language models
- # if commented out, no interpolation is performed
- #
- #script = $moses-script-dir/ems/support/interpolate-lm.perl
- ### tuning set
- # you may use the same set that is used for mert tuning (reference set)
- #
- #tuning-sgm = $wmt12-data/dev.$pair-extension.$output-extension.xml
- #raw-tuning =
- #tokenized-tuning =
- #factored-tuning =
- #lowercased-tuning =
- #split-tuning =
- ### group language models for hierarchical interpolation
- # (flat interpolation is limited to 10 language models)
- #group = "first,second fourth,fifth"
- ### script to use for binary table format for irstlm or kenlm
- # (default: no binarization)
- # irstlm
- #lm-binarizer = $irstlm-dir/compile-lm
- # kenlm, also set type to 8
- lm-binarizer = $moses-bin-dir/build_binary
- type = 8
- ### script to create quantized language model format (irstlm)
- # (default: no quantization)
- #
- #lm-quantizer = $irstlm-dir/quantize-lm
- ### script to use for converting into randomized table format
- # (default: no randomization)
- #
- #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
- #################################################################
- ### MODIFIED MOORE LEWIS FILTERING
- #################################################################
- #[MML]
- ### specifications for language models to be trained
- #
- #lm-training = $srilm-dir/ngram-count
- #lm-settings = "-interpolate -kndiscount -unk"
- #lm-binarizer = $moses-src-dir/bin/build_binary
- #lm-query = $moses-src-dir/bin/query
- #order = 5
- #type = 8
- ### in-/out-of-domain source/target corpora to train the 4 language models
- #
- #in-domain = [CORPUS:tags]
- #outdomain-stem = [CORPUS:inne]
- # ... or to two separate monolingual corpora
- #indomain-target = [LM:toy:lowercased-corpus]
- #raw-indomain-source = $wmt12-data/train.pl
- #raw-indomain-target = $wmt12-data/train.en
- # point to out-of-domain parallel corpus
- #outdomain-stem = $wmt12-data/inne
- # settings: number of lines sampled from the corpora to train each language model on
- # (if used at all, should be small as a percentage of corpus)
- #settings = "--line-count 100000"
- #################################################################
- ### TRANSLATION MODEL TRAINING
- #################################################################
- [TRAINING]
- ### training script to be used: either a legacy script or
- # current moses training script (default)
- #
- script = $moses-script-dir/training/train-model.perl
- ### general options
- # these are options that are passed on to train-model.perl, for instance
- # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
- # * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
- # * "-sort-parallel 8 -cores 8" to speed up phrase table building
- #
- training-options = "-mgiza -mgiza-cpus $cores -sort-buffer-size 2G -sort-compress gzip -sort-parallel $cores -cores $cores"
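- #
- # for orientation, EMS passes these options on to a train-model.perl call
- # that looks roughly like this (a sketch; paths are examples only):
- #   $moses-script-dir/training/train-model.perl -root-dir training \
- #     -corpus corpus/train.clean -f pl -e en \
- #     -alignment grow-diag-final-and -reordering msd-bidirectional-fe \
- #     -lm 0:3:/path/to/lm.en.binlm:8 -external-bin-dir $external-bin-dir \
- #     -mgiza -mgiza-cpus $cores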
- ### factored training: specify here which factors are used
- # if none specified, single factor training is assumed
- # (one translation step, surface to surface)
- #
- #input-factors = word
- #output-factors = word pos
- #alignment-factors = "word -> word"
- #translation-factors = "word -> word+pos"
- #reordering-factors = "word -> word"
- #generation-factors =
- #decoding-steps = "t0"
- ### parallelization of data preparation step
- # the two directions of the data preparation can be run in parallel
- # comment out if not needed
- #
- parallel = yes
- ### pre-computation for giza++
- # giza++ has a more efficient data structure that needs to be
- # initialized with snt2cooc. if run in parallel, this may reduce
- # memory requirements. set here the number of parts
- #
- #run-giza-in-parts = 5
- ### symmetrization method to obtain word alignments from giza output
- # (commonly used: grow-diag-final-and)
- #
- alignment-symmetrization-method = grow-diag-final-and
- ### use of berkeley aligner for word alignment
- #
- #use-berkeley = true
- #alignment-symmetrization-method = berkeley
- #berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
- #berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
- #berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
- #berkeley-java-options = "-server -mx30000m -ea"
- #berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
- #berkeley-process-options = "-EMWordAligner.numThreads 8"
- #berkeley-posterior = 0.5
- ### use of baseline alignment model (incremental training)
- #
- #baseline = 68
- #baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
- # $working-dir/training/prepared.$baseline/$output-extension.vcb \
- # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
- # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
- # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
- # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
- # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
- # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
- ### if word alignment should be skipped,
- # point to word alignment files
- #
- #word-alignment = $working-dir/model/aligned.1
- ### filtering some corpora with modified Moore-Lewis
- # specify corpora to be filtered and ratio to be kept, either before or after word alignment
- #mml-filter-corpora = inne
- #mml-before-wa = "-proportion 0.8"
- #mml-after-wa = "-proportion 0.8"
- ### create a bilingual concordancer for the model
- #
- #biconcor = $moses-script-dir/ems/biconcor/biconcor
- ### lexicalized reordering: specify orientation type
- # (default: only distance-based reordering model)
- #
- lexicalized-reordering = msd-bidirectional-fe
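- # (msd = monotone/swap/discontinuous orientation classes; bidirectional =
- # orientation w.r.t. both the previous and the following phrase; fe =
- # conditioned on both the source and the target phrase)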
- #operation-sequence-model = "yes"
- #operation-sequence-model-order = 5
- #operation-sequence-model-settings = ""
- ### hierarchical rule set
- #
- #hierarchical-rule-set = true
- domain-features = "sparse indicator"
- ### settings for rule extraction
- #
- #extract-settings = ""
- max-phrase-length = 5
- ### add extracted phrases from baseline model
- #
- #baseline-extract = $working-dir/model/extract.$baseline
- #
- # requires aligned parallel corpus for re-estimating lexical translation probabilities
- #baseline-corpus = $working-dir/training/corpus.$baseline
- #baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
- ### unknown word labels (target syntax only)
- # enables use of unknown word labels during decoding
- # label file is generated during rule extraction
- #
- #use-unknown-word-labels = true
- ### if phrase extraction should be skipped,
- # point to stem for extract files
- #
- # extracted-phrases =
- ### settings for rule scoring
- #
- score-settings = "--GoodTuring --MinScore 2:0.0001"
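- # (--GoodTuring applies Good-Turing discounting to the raw phrase counts;
- # --MinScore 2:0.0001 drops phrase pairs whose score in field 2 falls
- # below 0.0001)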
- ### include word alignment in phrase table
- #
- #include-word-alignment-in-rules = yes
- ### sparse lexical features
- #
- sparse-lexical-features = "target-word-insertion all, source-word-deletion all, word-translation all, phrase-length"
- #"target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length"
- ### domain adaptation settings
- # options: sparse, any of: indicator, subset, ratio
- #domain-features = "subset"
- ### if phrase table training should be skipped,
- # point to phrase translation table
- #
- # phrase-translation-table =
- ### if reordering table training should be skipped,
- # point to reordering table
- #
- # reordering-table =
- ### filtering the phrase table based on significance tests
- # Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
- # options: -n number of translations; -l 'a+e', 'a-e', or a positive real value -log prob threshold
- #salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
- #sigtest-filter = "-l a+e -n 50"
- ### if training should be skipped,
- # point to a configuration file that contains
- # pointers to all relevant model files
- #
- #config-with-reused-weights =
- #################################################################
- ### TUNING: finding good weights for model components
- #################################################################
- [TUNING]
- ### instead of tuning with this setting, old weights may be recycled
- # specify here an old configuration file with matching weights
- #
- #weight-config = $working-dir/tuning/moses.weight-reused.ini.1
- ### tuning script to be used
- #
- tuning-script = $moses-script-dir/training/mert-moses.pl
- tuning-settings = "-mertdir $moses-src-dir/bin --batch-mira --return-best-dev -maximum-iterations 2"
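- #
- # for orientation, EMS drives a mert-moses.pl call roughly like this
- # (a sketch; the tuning file names are examples only):
- #   $moses-script-dir/training/mert-moses.pl dev.tok.pl dev.tok.en \
- #     $decoder moses.ini --mertdir $moses-src-dir/bin \
- #     --batch-mira --return-best-dev --maximum-iterations 2 \
- #     --decoder-flags "-threads $cores"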
- ### specify the corpus used for tuning
- # it should contain 1000s of sentences
- #
- input-sgm = $wmt12-data/dev.$pair-extension.$input-extension.xml
- #raw-input =
- #tokenized-input =
- #factorized-input =
- #input =
- #
- reference-sgm = $wmt12-data/dev.$pair-extension.$output-extension.xml
- #raw-reference =
- #tokenized-reference =
- #factorized-reference =
- #reference =
- ### size of n-best list used (typically 100)
- #
- nbest = 100
- ### ranges for weights for random initialization
- # if not specified, the tuning script will use generic ranges
- # it is not clear if this matters
- #
- # lambda =
- ### additional flags for the filter script
- #
- filter-settings = ""
- ### additional flags for the decoder
- #
- decoder-settings = "-threads $cores -mp -search-algorithm 1 -cube-pruning-pop-limit 1000 -s 1000 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100"
- ### if tuning should be skipped, specify this here
- # and also point to a configuration file that contains
- # pointers to all relevant model files
- #
- #config =
- #################################################################
- ### RECASER: restore case, this part only trains the model
- #################################################################
- #[RECASING]
- #decoder = $moses-bin-dir/moses
- ### training data
- # raw input still needs to be tokenized;
- # alternatively, already tokenized input may be specified
- #
- #tokenized = [LM:traintags:tokenized-corpus]
- # recase-config =
- #lm-training = $srilm-dir/ngram-count
- #################################################################
- ### TRUECASER: train model to truecase corpora and input
- #################################################################
- #[TRUECASER]
- ### script to train truecaser models
- #
- #trainer = $moses-script-dir/recaser/train-truecaser.perl
- ### training data
- # data on which truecaser is trained
- # if no training data is specified, parallel corpus is used
- #
- # raw-stem =
- # tokenized-stem =
- ### trained model
- #
- # truecase-model =
- ##########################################################################
- ### EVALUATION: translating a test set using the tuned system and scoring it
- ##########################################################################
- [EVALUATION]
- ### number of jobs (if parallel execution on cluster)
- #
- #jobs = 10
- ### additional flags for the filter script
- #
- #filter-settings = ""
- ### additional decoder settings
- # switches for the Moses decoder
- # common choices:
- # "-threads N" for multi-threading
- # "-mbr" for MBR decoding
- # "-drop-unknown" for dropping unknown source words
- # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
- #
- decoder-settings = "-mbr -threads $cores -mp -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -feature-overwrite 'TranslationModel0 table-limit=100' -max-trans-opt-per-coverage 100"
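- #
- # for reference, the tuned system can also be run by hand with the same
- # switches (a sketch; the ini path and file names are examples only):
- #   $decoder -f $working-dir/tuning/moses.tuned.ini.1 -threads $cores \
- #     -search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 \
- #     < test.tok.pl > test.out.tok.en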
- #
- ### specify size of n-best list, if produced
- #
- #nbest = 100
- ### multiple reference translations
- #
- #multiref = yes
- ### prepare system output for scoring
- # this may include detokenization and wrapping output in sgm
- # (needed for nist-bleu, ter, meteor)
- #
- detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
- #recaser = $moses-script-dir/recaser/recase.perl
- wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
- #output-sgm =
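- #
- # run standalone, detokenization looks roughly like this (a sketch;
- # file names are examples only):
- #   $moses-script-dir/tokenizer/detokenizer.perl -l en < test.out.tok.en > test.out.en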
- ### BLEU
- #
- nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
- nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
- multi-bleu = $moses-script-dir/generic/multi-bleu.perl
- #ibm-bleu =
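- #
- # multi-bleu.perl can also be run by hand on tokenized output (a sketch;
- # file names are examples only):
- #   $moses-script-dir/generic/multi-bleu.perl test.tok.en < test.out.tok.en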
- ### TER: translation error rate (BBN metric) based on edit distance
- # not yet integrated
- #
- # ter =
- ### METEOR: gives credit to stem / WordNet synonym matches
- # not yet integrated
- #
- # meteor =
- ### Analysis: carry out various forms of analysis on the output
- #
- analysis = $moses-script-dir/ems/support/analysis.perl
- #
- # also report on input coverage
- analyze-coverage = yes
- #
- # also report on phrase mappings used
- report-segmentation = yes
- #
- # report precision of translations for each input word, broken down by
- # count of input word in corpus and model
- #report-precision-by-coverage = yes
- #
- # further precision breakdown by factor
- #precision-by-coverage-factor = pos
- #
- # visualization of the search graph in tree-based models
- #analyze-search-graph = yes
- [EVALUATION:traintags]
- ### input data
- #
- input-sgm = $wmt12-data/test.$pair-extension.$input-extension.xml
- # raw-input =
- # tokenized-input =
- # factorized-input =
- # input =
- ### reference data
- #
- reference-sgm = $wmt12-data/test.$pair-extension.$output-extension.xml
- # raw-reference =
- # tokenized-reference =
- # reference =
- ### analysis settings
- # may contain any of the general evaluation analysis settings
- # specific setting: base coverage statistics on earlier run
- #
- #precision-by-coverage-base = $working-dir/evaluation/test.analysis.5
- ### wrapping frame
- # for nist-bleu and other scoring scripts, the output needs to be wrapped
- # in sgm markup (typically like the input sgm)
- #
- wrapping-frame = $input-sgm
- #################################################################
- ### REPORTING: summarize evaluation scores
- #################################################################
- [REPORTING]
- ### currently no parameters for reporting section