[GENERAL]
home-dir = /zmifanva/moses_model
working-dir = $home-dir/tmp
moses-src-dir = /opt/moses
moses-script-dir = $moses-src-dir/scripts
moses-bin-dir = $moses-src-dir/bin
external-bin-dir = /opt/tools
corpus-dir = $home-dir/corpus
irstlm-dir = /opt/bin
ttable-binarizer = $moses-bin-dir/processPhraseTableMin
decoder = $moses-bin-dir/moses
input-tokenizer = "$home-dir/scripts/tokenize_jbo.py"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $output-extension"
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl
input-extension = jb
output-extension = en
pair-extension = jb-en
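### how to run: this file is consumed by the Moses EMS driver;
# a sketch of the invocation (assumption: launched from $home-dir):
# $moses-script-dir/ems/experiment.perl -config config-ems.jb-en.ini -exec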
#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training
[CORPUS]
max-sentence-length = 80
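# sentence pairs in which either side exceeds this length are
# dropped during corpus cleaning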
[CORPUS:project-zmifanva]
raw-stem = $corpus-dir/train/collection
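# the stem is expanded with the language extensions, so EMS expects
# $corpus-dir/train/collection.jb and $corpus-dir/train/collection.en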
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores 2 -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
settings = "-s msb -p 0"
order = 3
type = 8
lm-binarizer = $moses-bin-dir/build_binary
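### a possible alternative to IRSTLM is KenLM's lmplz, which ships
# with Moses; a sketch, assuming lmplz was built into $moses-bin-dir
# (-T sets the temp-file prefix, -S caps memory use):
# lm-training = "$moses-bin-dir/lmplz"
# settings = "-T $working-dir/tmp -S 20%"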
[LM:project-zmifanva]
raw-corpus = $corpus-dir/train/collection.$output-extension
#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
### training script to be used: either a legacy script or
# current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl
### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
#alignment-symmetrization-method = berkeley
alignment-symmetrization-method = grow-diag-final-and
### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe
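# msd-bidirectional-fe models monotone/swap/discontinuous orientations
# in both directions, conditioned on source (f) and target (e) phrases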
### if word alignment (giza symmetrization) should be skipped,
# point to word alignment files
#
#word-alignment =
### if phrase extraction should be skipped,
# point to stem for extract files
#
#extracted-phrases =
### if phrase table training should be skipped,
# point to phrase translation table
#
#phrase-translation-table =
### if reordering table training should be skipped,
# point to reordering table
#
#reordering-table =
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =
training-options = "-mgiza"
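# -mgiza selects the multi-threaded MGIZA aligner (expected under
# $external-bin-dir); the thread count could be raised, for example:
# training-options = "-mgiza -mgiza-cpus 4"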
### TUNING: finding good weights for model components
[TUNING]
### instead of tuning, previously tuned weights may be recycled
# (see the commented-out "config" setting at the end of this section)
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"
### specify the corpus used for tuning
# it should contain hundreds, if not thousands, of sentences
#
raw-input = $corpus-dir/tune/collection.$input-extension
raw-reference = $corpus-dir/tune/collection.$output-extension
### size of n-best list used (typically 100)
#
nbest = 100
### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =
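# an illustrative value in the format of the stock Moses example
# configs (d: distortion, lm: language model, tm: translation model,
# w: word penalty):
# lambda = "d:1,0.5-1.5 lm:1,0.5-1.5 tm:0.3,0.25-0.75;0.2,0.25-0.75;0.2,0.25-0.75;0.3,0.25-0.75;0,-0.5-0.5 w:0,-0.5-0.5"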
### additional flags for the decoder
#
decoder-settings = "-threads 4"
### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =
#######################################################
## TRUECASER: train model to truecase corpora and input
[TRUECASER]
### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl
### training data
# raw input will still be tokenized;
# alternatively, already tokenized input may be specified
#
raw-stem = CORPUS:raw-stem
### trained model
#
#truecase-model =
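# the trained model can also be applied to tokenized text by hand;
# a sketch (file names are illustrative):
# $moses-script-dir/recaser/truecase.perl --model truecase-model.$output-extension < tokenized.txt > truecased.txt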
##################################
## EVALUATION: score system output
[EVALUATION]
### prepare system output for scoring
# this may include detokenization and wrapping output in SGML markup
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
decoder-settings = "-threads 4"
### should output be scored case-sensitively (default: no)?
#
# case-sensitive = yes
### BLEU
#
multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
# ibm-bleu =
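# multi-bleu.perl can also be run by hand against a tokenized
# reference; a sketch (file names are illustrative):
# $moses-script-dir/generic/multi-bleu.perl -lc reference.tok < output.tok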
### TER: translation error rate (BBN metric) based on edit distance
# (note: $edinburgh-script-dir is not defined in this config;
# define it in [GENERAL] before enabling the line below)
#
# ter = $edinburgh-script-dir/tercom_v6a.pl
### METEOR: gives credit to stem / WordNet synonym matches
#
# meteor =
[EVALUATION:zmifanva]
raw-input = $corpus-dir/eval/collection.$input-extension
raw-reference = $corpus-dir/eval/collection.$output-extension
[REPORTING]
### what to do with result (default: store in file evaluation/report)
#
# email = [email protected]