# trainParams.properties

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Template machine learning properties file.
# Algorithm: choose between MAXENT and PERCEPTRON.
# NOTE(review): Iterations, Cutoff and Threads look like the standard Apache
# OpenNLP training parameters -- confirm their exact meaning in the OpenNLP docs.
Algorithm=PERCEPTRON
Iterations=500
Cutoff=0
Threads=4

##################################################
#### Custom parameters added by ixa-pipe-ml ####
##################################################

# Language: supported values are ca, de, en, es, eu, fr, gl, it, nl, pt, ru.
Language=en

# ClearTrainingFeatures / ClearEvaluationFeatures: specify whether adaptive
# features are cleared in the training and/or evaluation data.
# Options are: 'yes', 'no', 'docstart'. The first two will reset the
# features every sentence, whereas the 'docstart' option will look for
# -DOCSTART- marks in the data to clear the adaptive features.
# NOTE(review): "the first two" literally includes 'no'; confirm whether 'no'
# really resets the features every sentence.
# Cross-validation only works if ClearTrainingFeatures is set to 'yes'.
# If commented out, both values default to 'no'.
ClearTrainingFeatures=yes
ClearEvaluationFeatures=no

# TrainSet / TestSet: paths to the training and test corpora.
# The values below are absolute paths under /home/ragerri; presumably they
# must be adapted to the local environment.
TrainSet=/home/ragerri/experiments/nerc/conll03/eng.train
TestSet=/home/ragerri/experiments/nerc/conll03/eng.testb

# CorpusFormat: format of the training corpus.
# CoNLL 2002 format: http://www.clips.uantwerpen.be/conll2002/ner/
# CoNLL 2003 format: http://www.clips.uantwerpen.be/conll2003/ner/
# Tabulated format: two columns, word and class, tab separated, without BIO spans.
# Lemmatizer format: two columns, word and lemma, tab separated, without BIO spans.
# NOTE that every training corpus format contains only two columns: the token and
# the predicted class. To include other information (POS tag, lemma, etc.)
# contained in gold standard corpora, see MorphologicalFeatures below.
# Options: conll02, conll03, lemmatizer, tabulated
# NOTE(review): TrainSet/TestSet point into a 'conll03' directory while the
# format selected here is conll02 -- confirm that this is intended.
CorpusFormat=conll02

# OutputModel: if commented out, ixa-pipe-ml will save the model with the
# name of this properties file.
OutputModel=trainParams.bin

# Types: sequence types to use (if applicable). If commented out, all sequence
# types in the training corpus are used.
# Otherwise, separate them with commas, e.g., location,organization,person,misc.
# NOTE: the name of the sequence type/class needs to be exact; namely, if B-ORG
# appears in the corpus, then ORG must appear in the parameter, not organization,
# and so on.
#Types=location,organization,person,misc

# BeamSize: a beam size of 1 amounts to greedy search.
BeamSize=3

# SequenceCodec: codec used to encode spans of sequences. Choose between BIO and BILOU.
# If commented out, it defaults to BILOU.
#SequenceCodec=BIO

##################
#### FEATURES ####
##################

# Window: left and right window range from the current token. TokenFeatures
# and TokenClassFeatures depend on the window range specified here. If
# commented out, it defaults to 2:2.
Window=2:2

# TokenFeatures: include the current token (both in original and lowercase form).
# TokenFeaturesRange indicates whether to lowercase the tokens for feature generation.
# If commented out, it defaults to 'lower'.
TokenFeatures=yes
TokenFeaturesRange=lower

# TokenClassFeatures: include token shape features (capitalization, digits,
# etc.; see the TokenClassFeatureGenerator class for details).
# TokenClassFeaturesRange specifies whether to lowercase the tokens and whether
# to provide wordAndClass (wac) joint features. To cancel out an option, just
# write 'no,wac', 'lower,no' or 'no,no'.
# If commented out, it defaults to 'lower,wac'.
TokenClassFeatures=yes
TokenClassFeaturesRange=lower,wac

# WordShapeSuperSenseFeatures: token shape features as implemented by
# Ciaramita and Altun (2006).
WordShapeSuperSenseFeatures=yes

# OutcomePriorFeatures: maps the underlying previous outcomes.
OutcomePriorFeatures=yes

# PreviousMapFeatures: takes into account previous decisions and adds them as
# features.
PreviousMapFeatures=yes

# SentenceFeatures: add the first and last words of the sentence as features.
# Use the Begin and End options to pick and choose combinations.
SentenceFeatures=yes
SentenceFeaturesBegin=true
SentenceFeaturesEnd=false

# PrefixFeatures: takes the first 3 and 4 characters of the current token as features.
# Modify the values to get other prefix ranges.
PrefixFeatures=yes
PrefixFeaturesBegin=3
PrefixFeaturesEnd=4

# SuffixFeatures: takes the last 4 characters of the current token as features.
# Modify the options to get other suffix ranges.
SuffixFeatures=yes
SuffixFeaturesBegin=0
SuffixFeaturesEnd=4

# BigramClassFeatures: adds bigram features based on tokens and their class
# features.
BigramClassFeatures=yes

# TrigramClassFeatures: adds trigram features based on tokens and their class
# features.
TrigramClassFeatures=no

# FourgramClassFeatures: adds fourgram features based on tokens and their
# class features.
FourgramClassFeatures=no

# FivegramClassFeatures: adds fivegram features based on tokens and their class
# features.
FivegramClassFeatures=no

# CharNgramFeatures: minimum and maximum length for character ngrams of the
# current token. If the value is yes, specify the desired range in
# CharNgramFeaturesRange. If Range is commented out, it defaults to 2:5 when
# this feature is "yes".
CharNgramFeatures=no
CharNgramFeaturesRange=2:5

# DictionaryFeatures: add features if the token is found in some gazetteers.
# Comment it out to deactivate this feature. Note that every file in the
# directory provided as parameter will be taken to be a dictionary. The
# dictionary format needs to be 'sequence\tclass' (sequence and class
# separated by a tab).
DictionaryFeatures=/home/ragerri/javacode/ixa-pipe-nerc/nerc-resources/en/dictionaries

# BrownClusterFeatures: add features using Brown clusters.
# Comment it out to deactivate this feature. NOTE: you can add multiple
# clustering lexicons by chaining them with a comma.
BrownClusterFeatures=/home/ragerri/javacode/ixa-pipe-nerc/brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt

# ClarkClusterFeatures: add features using Clark (2003) clusters. If the value
# is uncommented, specify the location of the clustering lexicon in Clark format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
ClarkClusterFeatures=/home/ragerri/resources/reuters-rcv1/clark/reuters-rcv1.tok.punct.lower.300

# Word2VecClusterFeatures: add features using word2vec clusters. If the value is
# uncommented, specify the location of the clustering lexicon in word2vec format.
# NOTE: you can add multiple clustering lexicons by chaining them with a comma.
Word2VecClusterFeatures=/home/ragerri/clusters.large.txt

############################
## Morphological Features ##
############################

# NOTE(review): as written, this file activates POSTagModelFeatures,
# LemmaModelFeatures, LemmaDictionaryFeatures, MFSFeatures and
# SuperSenseFeatures at the same time, which contradicts the DO NOT COMBINE
# warnings further down in this section. Presumably the unwanted ones should
# be commented out before training -- confirm.

# POSTagModelFeatures: add a POS tagging model trained with ixa-pipe-ml.
# POSTagModelFeaturesRange specifies the combination of features to be used: options are 'pos'
# and 'posclass', in that strict order. For example, if 'pos,posclass' is
# chosen, then both types of features will be used. If 'pos,no' is chosen, then
# only pos tag features are active. If 'no,posclass', then only the pos tag class is used.
# If POSTagModelFeatures is commented out, none of these features are used.
POSTagModelFeatures=/home/ragerri/javacode/en-pos-model.bin
POSTagModelFeaturesRange=pos,posclass

# LemmaModelFeatures: add a lemmatizer model trained with ixa-pipe-ml.
LemmaModelFeatures=/home/ragerri/javacode/en-lemma-model.bin

# LemmaDictionaryFeatures: add lemma features from a dictionary.
# It is required to provide a POS model trained with ixa-pipe-ml
# and a plain text word\tlemma\tpostag dictionary.
LemmaDictionaryFeatures=/home/ragerri/javacode/en-pos-model.bin,/home/ragerri/resources/lemmatizer-dicts/en-lemmatizer.txt

# MFSFeatures: add Most Frequent Sense as features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and SuperSense features, therefore,
# DO NOT COMBINE THEM with POSTAG, LEMMA OR SUPERSENSE FEATURES!!!
MFSFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
MFSFeaturesRange=pos,posclass,lemma,mfs,monosemic

# SuperSenseFeatures: add Ciaramita and Altun (2006) super sense tagging features.
# It is required to provide an ixa-pipe-pos model, a plain text word\tlemma\tpostag
# dictionary and a lexicon containing the most frequent sense information, where
# each entry is of the form word#pos\tfreq#sense. For example, house#n\t1098#noun.artifact.
# These features include Morphological and MFS features, therefore,
# DO NOT COMBINE THEM with MORPHO OR MFS FEATURES!!!
SuperSenseFeatures=/home/ragerri/javacode/ixa-pipe-pos/pos-models-1.3.0/en/en-maxent-100-c5-baseline-dict-penn.bin,/home/ragerri/resources/pos-resources/lemmatizer-dicts/freeling/en-lemmatizer.txt,/home/ragerri/resources/supersense/supersenses.wn20
SuperSenseFeaturesRange=mfs,monosemic

#####################################
#### CROSS VALIDATION PARAMETERS ####
#####################################

# Folds: number of cross-validation folds; if commented out, it defaults to
# 10 folds.
Folds=5
# EvaluationType: choose between 'detailed' and 'error'; only used for
# cross-validation. It defaults to detailed evaluation.
EvaluationType=detailed