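"""Run Dutch CoNLL-2002 NER experiments with a structured perceptron.

Features can be augmented with HMM-based word representations
(-rep/--rep_path), word embeddings (--embed), or Brown clusters
(-brown/--brown_cluster_file); with none of these, only the baseline
feature set is used. Predictions are written in CoNLL format and scored
with the official perl evaluation script.

Illustrative invocation (the rep path is a placeholder):

    python ned_ner_run.py -o ../data/Conll2002/data/output/I20 -rep path/to/hmm_reps -d viterbi
"""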
import argparse
import os
import logging
import shutil
import sys
import subprocess
from eval.ner.PrepareEmbedRep import PrepareEmbedRep
from eval.ner.PrepareHmmRep import PrepareHmmRep
from eval.ner.readers.Conll2002NerCorpus import Conll2002NerCorpus, ned_test, ned_dev, ned_train
import eval.ner.sequences.extended_feature as exfc
import eval.ner.sequences.structured_perceptron as spc
def setup_logging(dirname, level):
    """Set up file logging in the given output directory and return a logger.

    :param dirname: directory in which the "log" file is created
    :param level: logging.INFO or logging.DEBUG, etc.
    """
logfile = "{}/log".format(dirname)
logging.basicConfig(filename=logfile, level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
return logging.getLogger(__name__)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--output_dir",
help="output directory, e.g. ../data/Conll2002/data/output/I20")
parser.add_argument("-rep", "--rep_path",
help="directory containing (hmm) word representations files")
parser.add_argument("-d", "--decoding",
choices=["viterbi", "max-emission", "max-product", "posterior", "posterior_cont",
"posterior_cont_type"],
help="method used for decoding: viterbi, posterior,...")
parser.add_argument("-brown", "--brown_cluster_file",
help="path to file with brown clusters")
parser.add_argument("--rel_spec", action='store_true', default=False,
help="if wordreps are based on specific syntactic relations")
parser.add_argument("--ignore_rel",
help="dependency relation name to ignore when decoding. Makes sense only together with rel_spec")
parser.add_argument("--embed", help="path to file with word embeddings")
parser.add_argument("--embed_v", help="path to vocabulary file of the text used for inducing word embeddings")
args = parser.parse_args()
    if args.ignore_rel is not None and not args.rel_spec:
        sys.exit("--ignore_rel only makes sense together with --rel_spec.")
    # output dir for files and reports
    if args.output_dir is not None:
        outdir = args.output_dir
    else:
        sys.exit("Output directory path missing!")
    if "Conll2002" not in outdir:
        sys.exit("Unexpected output directory: expected a path containing 'Conll2002'.")
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    else:
        # prevent overwriting old output: create a fresh, timestamped directory
        import datetime
        now = datetime.datetime.now().isoformat().replace(":", "-")
        outdir = "{}_{}".format(outdir.rstrip("/"), now)
        print("Output directory already exists, appending timestamp: {}".format(outdir))
        os.makedirs(outdir)
logger = setup_logging(outdir, logging.DEBUG)
brown_cluster_file = args.brown_cluster_file
hmm_rep_path = args.rep_path
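    # The representation variant is inferred from substrings of the rep path:
    # "tree" selects tree-based reps, while "_rel_" or "_lr_" selects
    # relation-based reps (the lr flag is passed through to PrepareHmmRep).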
lr = ("_lr_" in hmm_rep_path) if hmm_rep_path is not None else False
if hmm_rep_path is not None and "tree" in hmm_rep_path:
use_wordrep_tree = True
use_wordrep_rel = False
logger.info("Using tree representations.")
elif hmm_rep_path is not None and ("_rel_" in hmm_rep_path or lr):
use_wordrep_rel = True
use_wordrep_tree = False
logger.info("Using tree representations.")
else:
use_wordrep_tree = False
use_wordrep_rel = False
decoding = args.decoding
# use hmm-based wordrep
if hmm_rep_path is not None:
logger.info("Loading corpora. Decoding word representations.")
hmmrep = PrepareHmmRep(hmm_rep_path, lang="nl", decoding=decoding, use_wordrep_tree=use_wordrep_tree,
use_wordrep_rel=use_wordrep_rel, eval_spec_rel=args.rel_spec, logger=logger,
ignore_rel=args.ignore_rel, lr=lr)
corpus = hmmrep.ner_corpus
train_seq = hmmrep.train_seq
dev_seq = hmmrep.dev_seq
test_seq = hmmrep.test_seq
elif args.embed is not None:
logger.info("Loading embeddings.")
embrep = PrepareEmbedRep(embed=args.embed, embed_v=args.embed_v, lang="nl", logger=logger)
corpus = embrep.ner_corpus
train_seq = embrep.train_seq
dev_seq = embrep.dev_seq
test_seq = embrep.test_seq
else:
logger.info("Loading corpora.")
corpus = Conll2002NerCorpus()
train_seq = corpus.read_sequence_list_conll(ned_train)
dev_seq = corpus.read_sequence_list_conll(ned_dev)
test_seq = corpus.read_sequence_list_conll(ned_test)
logger.info("Extracting features.")
#logger.info("Training on dev !!")
#feature_mapper = exfc.ExtendedFeatures(dev_seq)
feature_mapper = exfc.ExtendedFeatures(train_seq, brown_cluster_file)
# baseline features
feature_mapper.set_baseline_features()
# other/wordrep features
    # Brown cluster id features for the current token and its +/-2 context window
    use_brown = brown_cluster_file is not None
    feature_mapper.brown_id = use_brown
    feature_mapper.brown_id_plus1 = use_brown
    feature_mapper.brown_id_plus2 = use_brown
    feature_mapper.brown_id_minus1 = use_brown
    feature_mapper.brown_id_minus2 = use_brown
    feature_mapper.brown_prefix = False  # prefix length features; same for all brown_id
    feature_mapper.brown_prefix_lengths = []
    if feature_mapper.brown_prefix and not feature_mapper.brown_prefix_lengths:
        sys.exit("Brown prefix lengths not defined.")
    # Word representation id features (hmm-based reps or embeddings); only the
    # current and the following token are used
    use_rep = hmm_rep_path is not None or args.embed is not None
    feature_mapper.rep_id = use_rep
    feature_mapper.rep_id_plus1 = use_rep
    feature_mapper.rep_id_plus2 = False
    feature_mapper.rep_id_minus1 = False
    feature_mapper.rep_id_minus2 = False
feature_mapper.build_features()
logger.info("Training./Loading model.")
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp.num_epochs = 20
sp.train_supervised(train_seq, dev_seq)
logger.info("Testing on dev.")
pred_dev = sp.viterbi_decode_corpus(dev_seq)
logger.info("Writing conll eval format.")
corpus.write_conll_instances(dev_seq, pred_dev, "{}/dev.txt".format(outdir))
logger.info("Testing on test.")
pred_test = sp.viterbi_decode_corpus(test_seq)
logger.info("Writing conll eval format.")
corpus.write_conll_instances(test_seq, pred_test, "{}/test.txt".format(outdir))
logger.info("Saving model, writing the settings.")
with open("{}/setting".format(outdir), "w") as setting_file:
setting_file.write("Train file: {}\n".format(ned_train))
setting_file.write("Dev file: {}\n".format(ned_dev))
setting_file.write("Test file: {}\n".format(ned_test))
setting_file.write("Output directory: {}\n".format(outdir))
setting_file.write("Loaded model parameters: {}\n".format(sp.loaded_model))
setting_file.write("Number of features: {}\n".format(feature_mapper.get_num_features()))
setting_file.write("Features used:\n")
for f in sorted(list(feature_mapper.features_used)):
setting_file.write("\t{}\n".format(f))
setting_file.write("Number of labels: {}\n".format(sp.get_num_states()))
setting_file.write("Classifier: see the experimental py file in the folder\n")
setting_file.write("Classification task decoder: see the experimental py file in the folder\n")
setting_file.write("Number of epochs: {}\n".format(sp.num_epochs))
setting_file.write("Learning rate: {}\n".format(sp.learning_rate))
setting_file.write("Averaged classifier: {}\n".format(sp.averaged))
if brown_cluster_file:
setting_file.write("Brown cluster file: {}\n".format(brown_cluster_file))
if hmm_rep_path is not None:
setting_file.write("Word rep (hmm) file: {}\n".format(hmm_rep_path))
setting_file.write("Word rep decoder: {}\n".format(decoding))
setting_file.write("Syntactic relation to ignore when decoding: {}\n".format(args.ignore_rel))
# Save the model
sp.save_model(outdir)
    # Copy this script into the output directory so the settings file's
    # "see the experimental py file" references resolve
curr_file = os.path.realpath(__file__)
shutil.copy(curr_file, outdir)
logger.info("Evaluating with official perl script.")
    # Run the official CoNLL evaluation perl script on each prediction file.
    # Despite the .txt extension, conlleval.txt is a perl script and is
    # expected in the current working directory.
    dev_file = "{}/dev.txt".format(outdir)
    test_file = "{}/test.txt".format(outdir)
    eval_script = "conlleval.txt"
    with open(dev_file) as dev_in:
        dev_result = subprocess.check_output(["perl", eval_script], stdin=dev_in).decode()
    with open(test_file) as test_in:
        test_result = subprocess.check_output(["perl", eval_script], stdin=test_in).decode()
with open("{}/dev.result".format(outdir), "w") as dev_out, \
open("{}/test.result".format(outdir), "w") as test_out:
dev_out.write("{}".format(dev_result))
test_out.write("{}".format(test_result))
# extract f-score
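    # (the second line of conlleval output looks like
    # "accuracy: ...; precision: ...; recall: ...; FB1: <score>",
    # so the last token of the last ";"-separated field is the overall F-score)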
dev_score = dev_result.split("\n")[1].split(";")[-1].split(" ")[-1]
test_score = test_result.split("\n")[1].split(";")[-1].split(" ")[-1]
logger.info("F-score dev: {}".format(dev_score))
logger.info("F-score test: {}".format(test_score))