forked from ixa-ehu/ixa-pipe-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
trainParams.properties
215 lines (178 loc) · 9.67 KB
/
trainParams.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Template machine learning properties file.
# Algorithm: choose between MAXENT and PERCEPTRON.
Algorithm=PERCEPTRON
# NOTE(review): Iterations, Cutoff and Threads are not documented in this file;
# presumably the number of training iterations, the minimum feature frequency
# and the number of training threads -- confirm against the trainer.
Iterations=500
Cutoff=0
Threads=4
##################################################
#### Custom parameters added by ixa-pipe-ml ####
##################################################
# Languages supported: ca, de, en, es, eu, fr, gl, it, nl, pt, ru
Language=en
# ClearTrainingFeatures / ClearEvaluationFeatures: specify whether adaptive
# features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence whereas the 'docstart' option will look for -DOCSTART-
# marks in the data to clear the adaptive features.
# NOTE(review): it is unclear from this description how 'no' differs from
# 'yes' -- confirm the exact behaviour of 'no'.
# Cross-validation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out, both values default to 'no'.
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no
# TrainSet / TestSet: training and test corpora.
# NOTE: the values below are machine-specific absolute paths
# (/home/ragerri/...); replace them with local paths before use.
TrainSet=/home/ragerri/experiments/nerc/conll03/eng.train
TestSet=/home/ragerri/experiments/nerc/conll03/eng.testb
# CorpusFormat: format of the training corpus.
#   CoNLL 2002 format: http://www.clips.uantwerpen.be/conll2002/ner/
#   CoNLL 2003 format: http://www.clips.uantwerpen.be/conll2003/ner/
#   Tabulated format: two columns, word and class, tab separated, without BIO spans
#   Lemmatizer format: two columns, word and lemma, tab separated, without BIO spans
# NOTE that every training corpus format contains only two columns: the token and
# the predicted class. To include other information (POS tag, lemma, etc.)
# contained in gold standard corpora, see the Morphological Features section below.
# Options: conll02, conll03, lemmatizer, tabulated
# NOTE(review): TrainSet/TestSet point into a 'conll03' directory while the
# format selected here is conll02 -- confirm that this is intended.
CorpusFormat=conll02
# OutputModel: name of the file the trained model is saved to. If commented
# out, ixa-pipe-ml will save the model with the name of this properties file.
OutputModel=trainParams.bin
# Types: sequence types (if applicable). If not active, all sequence types in
# the training corpus are used. Otherwise, separate them with commas,
# e.g., location,organization,person,misc.
# NOTE: the name of the sequence type/class needs to be exact; namely, if B-ORG
# appears in the corpus, then ORG needs to appear in this parameter, not
# organization, and so on.
#Types=location,organization,person,misc
# BeamSize: a beam size of 1 amounts to greedy search.
BeamSize=3
# SequenceCodec: codec used to encode the spans of sequences. Choose between
# BIO and BILOU. If commented out, it defaults to BILOU.
#SequenceCodec=BIO
##################
#### FEATURES ####
##################
# Window: left and right window range from the current token. TokenFeatures
# and TokenClassFeatures depend on the window range specified here. If
# commented out, it defaults to 2:2.
Window=2:2
# TokenFeatures: include the current token (both in original and lowercase form).
# TokenFeaturesRange: indicates whether to lowercase the tokens for feature
# generation. If commented out, it defaults to 'lower'.
TokenFeatures=yes
TokenFeaturesRange=lower
# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.; see the TokenClassFeatureGenerator class for details).
# TokenClassFeaturesRange: whether to lowercase the tokens and whether to
# provide wordAndClass (wac) joint features. To cancel out an option, just
# write 'no,wac', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
TokenClassFeatures=yes
TokenClassFeaturesRange=lower,wac
# WordShapeSuperSenseFeatures: token shape features as implemented by
# Ciaramita and Altun (2006).
WordShapeSuperSenseFeatures=yes
# OutcomePriorFeatures: maps the underlying previous outcomes.
OutcomePriorFeatures=yes
# PreviousMapFeatures: takes previous decisions into account and adds them as
# features.
PreviousMapFeatures=yes
# SentenceFeatures: adds the first and last words of the sentence as features.
# Use the Begin and End options to pick and choose combinations.
# NOTE: Begin and End are written as true/false in this file, unlike the
# yes/no values used by the other feature switches; keep them as they are
# unless the parser is confirmed to accept both spellings.
SentenceFeatures=yes
SentenceFeaturesBegin=true
SentenceFeaturesEnd=false
# PrefixFeatures: uses prefixes (leading characters) of the current token as
# features; as configured below, the prefix range is 3 to 4.
# Modify the Begin and End values to get other prefix ranges.
# NOTE(review): whether Begin/End are lengths or offsets, and whether End is
# inclusive, is not stated here -- confirm against the feature generator.
PrefixFeatures=yes
PrefixFeaturesBegin=3
PrefixFeaturesEnd=4
# SuffixFeatures: takes the last 4 characters of the current token as features.
# Modify the Begin and End values to get other suffix ranges.
SuffixFeatures=yes
SuffixFeaturesBegin=0
SuffixFeaturesEnd=4
# BigramClassFeatures: adds bigram features based on tokens and their class
# features.
BigramClassFeatures=yes
# TrigramClassFeatures: adds trigram features based on tokens and their class
# features.
TrigramClassFeatures=no
# FourgramClassFeatures: adds fourgram features based on tokens and their
# class features.
FourgramClassFeatures=no
# FivegramClassFeatures: adds fivegram features based on tokens and their class
# features.
FivegramClassFeatures=no
# CharNgramFeatures: character n-grams of the current token. If the value is
# 'yes', specify the desired range in CharNgramFeaturesRange.
# CharNgramFeaturesRange: minimum and maximum length of the character n-grams,
# written as min:max. If commented out, it defaults to 2:5 when
# CharNgramFeatures is 'yes'.
CharNgramFeatures=no
CharNgramFeaturesRange=2:5
# DictionaryFeatures: adds features if the token is found in some gazetteers.
# Comment it out to deactivate this feature. Note that every file in the
# directory provided as parameter will be taken to be a dictionary. The
# dictionary format needs to be 'sequence\tclass' (tab separated).
DictionaryFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries
# BrownClusterFeatures: adds features using Brown clusters.
# Comment it out to deactivate this feature. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
BrownClusterFeatures=/home/ragerri/javacode/ixa-pipe-nerc/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt
# ClarkClusterFeatures: adds features using Clark (2003) clusters. If the value
# is uncommented, specify the location of the clustering lexicon in Clark format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
ClarkClusterFeatures=/home/ragerri/resources/reuters-rcv1/clark/reuters-rcv1.tok.punct.lower.300
# Word2VecClusterFeatures: adds features using word2vec clusters. If the value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
Word2VecClusterFeatures=/home/ragerri/clusters.large.txt
############################
## Morphological Features ##
############################
# POSTagModelFeatures: adds a POS tagging model trained with ixa-pipe-ml.
# POSTagModelFeaturesRange: specifies the combination of features to be used.
# Options are 'pos' and 'posclass', in that strict order. For example, if
# 'pos,posclass' is chosen then both types of features will be used. If
# 'pos,no' is chosen, then only POS tag features are active. If 'no,posclass'
# is chosen, then only POS tag class features are active.
# If POSTagModelFeatures is commented out, none of these features are used.
POSTagModelFeatures=/home/ragerri/javacode/en-pos-model.bin
POSTagModelFeaturesRange=pos,posclass
# LemmaModelFeatures: adds a lemmatizer model trained with ixa-pipe-ml.
LemmaModelFeatures=/home/ragerri/javacode/en-lemma-model.bin
# LemmaDictionaryFeatures: adds lemma features from a dictionary.
# It is required to provide a POS model trained with ixa-pipe-ml
# and a plain text word\tlemma\tpostag dictionary; in the value below the
# two paths are given in that order, separated by a comma.
LemmaDictionaryFeatures=/home/ragerri/javacode/en-pos-model.bin,/home/ragerri/resources/lemmatizer-dicts/en-lemmatizer.txt
# MFSFeatures: adds Most Frequent Sense information as features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and SuperSense features; therefore,
# DO NOT COMBINE THEM with POSTAG, LEMMA OR SUPERSENSE FEATURES!!!
# NOTE(review): this template leaves POSTagModelFeatures, LemmaModelFeatures,
# LemmaDictionaryFeatures, MFSFeatures and SuperSenseFeatures all active at the
# same time, which contradicts the warning above -- presumably every option is
# shown for reference only; comment out the conflicting ones before training.
MFSFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
MFSFeaturesRange=pos,posclass,lemma,mfs,monosemic
# SuperSenseFeatures: adds Ciaramita and Altun (2006) super sense tagging features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and MFS features; therefore,
# DO NOT COMBINE THEM with MORPHO OR MFS FEATURES!!!
SuperSenseFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
SuperSenseFeaturesRange=mfs,monosemic
#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################
# Folds: number of cross-validation folds. If commented out, it defaults to
# 10 folds.
Folds=5
# EvaluationType: choose between 'detailed' and 'error'; only used for
# cross-validation. It defaults to detailed evaluation.
EvaluationType=detailed