[GENERAL]
home-dir = /zmifanva/moses_model
working-dir = $home-dir/tmp
moses-src-dir = /opt/moses
moses-script-dir = $moses-src-dir/scripts
moses-bin-dir = $moses-src-dir/bin
external-bin-dir = /opt/tools
corpus-dir = $home-dir/corpus
irstlm-dir = /opt/bin
ttable-binarizer = $moses-bin-dir/processPhraseTableMin
decoder = $moses-bin-dir/moses
input-tokenizer = "$home-dir/scripts/tokenize_jbo.py"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $output-extension"
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl
input-extension = jb
output-extension = en
pair-extension = jb-en
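### how to run: this file is consumed by the Moses EMS driver;
# a sketch of the invocation (assumption: launched from $home-dir):
# $moses-script-dir/ems/experiment.perl -config config-ems.jb-en.ini -exec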
#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training
[CORPUS]
max-sentence-length = 80
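# sentence pairs in which either side exceeds this length are
# dropped during corpus cleaning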
[CORPUS:project-zmifanva]
raw-stem = $corpus-dir/train/collection
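# the stem is expanded with the language extensions, so EMS expects
# $corpus-dir/train/collection.jb and $corpus-dir/train/collection.en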
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores 2 -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
settings = "-s msb -p 0"
order = 3
type = 8
lm-binarizer = $moses-bin-dir/build_binary
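### a possible alternative to IRSTLM is KenLM's lmplz, which ships
# with Moses; a sketch, assuming lmplz was built into $moses-bin-dir
# (-T sets the temp-file prefix, -S caps memory use):
# lm-training = "$moses-bin-dir/lmplz"
# settings = "-T $working-dir/tmp -S 20%"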
[LM:project-zmifanva]
raw-corpus = $corpus-dir/train/collection.$output-extension
#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
### training script to be used: either a legacy script or
# current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
#alignment-symmetrization-method = berkeley
alignment-symmetrization-method = grow-diag-final-and
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe
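# msd-bidirectional-fe models monotone/swap/discontinuous orientations
# in both directions, conditioned on source (f) and target (e) phrases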
### if word alignment (giza symmetrization) should be skipped,
# point to word alignment files
#
#word-alignment =
### if phrase extraction should be skipped,
# point to stem for extract files
#
#extracted-phrases =
### if phrase table training should be skipped,
# point to phrase translation table
#
#phrase-translation-table =
### if reordering table training should be skipped,
# point to reordering table
#
#reordering-table =
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
training-options = "-mgiza"
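# -mgiza selects the multi-threaded MGIZA aligner (expected under
# $external-bin-dir); the thread count could be raised, for example:
# training-options = "-mgiza -mgiza-cpus 4"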
### TUNING: finding good weights for model components
[TUNING]
### instead of tuning, previously tuned weights may be recycled
# (see the commented-out "config" setting at the end of this section)
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain hundreds, if not thousands, of sentences
#
raw-input = $corpus-dir/tune/collection.$input-extension
raw-reference = $corpus-dir/tune/collection.$output-extension
### size of n-best list used (typically 100)
#
nbest = 100
### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =
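# an illustrative value in the format of the stock Moses example
# configs (d: distortion, lm: language model, tm: translation model,
# w: word penalty):
# lambda = "d:1,0.5-1.5 lm:1,0.5-1.5 tm:0.3,0.25-0.75;0.2,0.25-0.75;0.2,0.25-0.75;0.3,0.25-0.75;0,-0.5-0.5 w:0,-0.5-0.5"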
### additional flags for the decoder
#
decoder-settings = "-threads 4"
### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#######################################################
## TRUECASER: train model to truecase corpora and input
[TRUECASER]
### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl
### training data
# raw input will still be tokenized;
# alternatively, already tokenized input may be specified
#
raw-stem = CORPUS:raw-stem
### trained model
#
#truecase-model =
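# the trained model can also be applied to tokenized text by hand;
# a sketch (file names are illustrative):
# $moses-script-dir/recaser/truecase.perl --model truecase-model.$output-extension < tokenized.txt > truecased.txt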
##################################
## EVALUATION: score system output
[EVALUATION]
### prepare system output for scoring
# this may include detokenization and wrapping output in SGML markup
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
decoder-settings = "-threads 4"
### should output be scored case-sensitively (default: no)?
#
# case-sensitive = yes
### BLEU
#
multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
# ibm-bleu =
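# multi-bleu.perl can also be run by hand against a tokenized
# reference; a sketch (file names are illustrative):
# $moses-script-dir/generic/multi-bleu.perl -lc reference.tok < output.tok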
### TER: translation error rate (BBN metric) based on edit distance
# (note: $edinburgh-script-dir is not defined in this config;
# define it in [GENERAL] before enabling the line below)
#
# ter = $edinburgh-script-dir/tercom_v6a.pl
### METEOR: gives credit to stem / WordNet synonym matches
#
# meteor =
[EVALUATION:zmifanva]
raw-input = $corpus-dir/eval/collection.$input-extension
raw-reference = $corpus-dir/eval/collection.$output-extension
[REPORTING]
### what to do with result (default: store in file evaluation/report)
#
# email = [email protected]