-
Notifications
You must be signed in to change notification settings - Fork 6
/
evaluate.py
executable file
·354 lines (316 loc) · 14.1 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# Modified by Max Friedrich, 2018
###############################################################################
#
# Copyright 2014 Christopher Kotfila
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# i2b2 2014 Evaluation Scripts
#
# This script is distributed as a part of the i2b2 2014 Cardiac Risk and
# Personal Health-care Information (PHI) tasks. It is intended to be used via
# command line:
#
# $> python evaluate.py [cr|phi] [FLAGS] SYSTEM GOLD
#
# Where 'cr' produces Precision, Recall and F1 (P/R/F1) measure for the
# cardiac risk task and 'phi' produces P/R/F1 for the PHI task. SYSTEM and GOLD
# may be individual files representing system output in the case of SYSTEM and
# the gold standard in the case of GOLD. SYSTEM and GOLD may also be
# directories in which case all files in SYSTEM will be compared to files in
# the GOLD directory based on their file names. File names MUST be of the form:
# XXX-YY.xml where XXX is the patient id, and YY is the document id. See the
# README.md file for more details.
#
# Basic Flags:
# -v, --verbose :: Print document by document P/R/F1 for each document instead
# of summary statistics for an entire set of documents.
#
# Basic Examples:
#
# $> python evaluate.py cr system.xml gold.xml
#
# Evaluate the single system output file 'system.xml' against the gold
# standard file 'gold.xml' for the Cardiac Risk Task (the 'cr' argument).
# Please note:
# file to file comparisons are made available for testing purposes, systems
# output will be evaluated using the "batch" system/ gold/ examples as shown
# below.
#
# $> python evaluate.py cr system/ gold/
#
# Evaluate the set of system outputs in the folder system/ against the set of
# gold standard annotations in gold/ using the cardiac risk task evaluation.
#
# $> python evaluate.py phi system/ gold/
#
# Evaluate the set of system outputs in the folder system against the set of
# gold standard annotations in gold/ using the PHI task evaluation.
#
#
#
# Advanced Usage:
#
# Some additional functionality is made available for testing and error
# analysis. This functionality is provided AS IS with the hopes that it will
# be useful. It should be considered 'experimental' at best, may be bug prone
# and will not be explicitly supported, though, bug reports and pull requests
# are welcome.
#
# Advanced Flags:
#
# --filter [TAG ATTRIBUTES] :: run P/R/F1 measures in either summary or verbose
# mode (see -v) for the list of attributes defined
# by TAG ATTRIBUTES. This may be a comma separated
# list of tag names and attribute values. For more
# see Advanced Examples.
# --conjunctive :: If multiple values are passed to filter as a comma separated
# list, treat them as a series of AND based filters instead of
# a series of OR based filters
# --invert :: run P/R/F1 on the inverted set of tags defined by TAG ATTRIBUTES
# in the --filter tag (see --filter).
#
# Advanced Examples:
#
# $> python evaluate.py cr --filter MEDICATION system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# only MEDICATION tags
#
# $> python evaluate.py cr --filter CAD,OBESE system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# only CAD or OBESE tags. Comma separated lists to the --filter flag are con-
# joined via OR.
#
# $> python evaluate.py cr --filter "CAD,before DCT" system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# only CAD *OR* tags with a time attribute of before DCT. This is probably
# not what you want when filtering, see the next example
#
# $> python evaluate.py cr --conjunctive \
# --filter "CAD,before DCT" system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# CAD tags *AND* tags with a time attribute of before DCT.
#
# $> python evaluate.py cr --invert \
# --filter MEDICATION system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# any tag which is NOT a MEDICATION tag.
#
# $> python evaluate.py cr --invert \
# --conjunctive \
# --filter "CAD,before DCT" system/ gold/
#
# Evaluate system output in system/ folder against gold/ folder considering
# any tag which is NOT CAD and with a time attribute of 'before DCT'
import argparse
import os
from collections import defaultdict
from .classes import StandoffAnnotation, Evaluate, CombinedEvaluation, \
PHITrackEvaluation, CardiacRiskTrackEvaluation
from .tags import DocumentTag, PHITag, MEDICAL_TAG_CLASSES
# This function is 'experimental' as in it works for my use cases
# But is not generally well documented or a part of the expected
# workflow.
def get_predicate_function(arg, tag):
    """Build a predicate that tells whether a tag matches a filter value.

    ``arg`` is either a tag name (a key of ``tag.tag_types``, e.g.
    "MEDICATION") or an attribute value (e.g. "before DCT"). For attribute
    values, every class in MEDICAL_TAG_CLASSES is inspected and each
    ``valid_<attribute>`` collection that contains ``arg`` contributes
    ``<attribute>`` to the set of attributes the predicate will check. This
    allows for overlap - ie. "ACE Inhibitor" is a valid type1 and a valid
    type2 attribute value, so the predicate returns True if a tag has
    "ACE Inhibitor" set for either attribute.

    Currently attribute values are ONLY looked up in MEDICAL_TAG_CLASSES;
    this could be extended to work with PHI tag classes.

    Args:
        arg: Tag name or attribute value to filter on.
        tag: Tag class exposing a ``tag_types`` mapping keyed by tag name.

    Returns:
        A one-argument function taking a tag and returning True when the tag
        matches ``arg``. If ``arg`` is not recognised at all, a warning is
        printed and the returned function accepts every tag.
    """
    attrs = set()

    if arg in tag.tag_types.keys():
        # A tag name (ie. MEDICATION) is matched against the tag's 'name'.
        attrs.add("name")
    else:
        valid_attributes = ("valid_type1", "valid_type2", "valid_indicator",
                            "valid_status", "valid_time", "valid_type")
        for cls in MEDICAL_TAG_CLASSES:
            for valid_attr in valid_attributes:
                try:
                    if arg in getattr(cls, valid_attr):
                        # Strip the "valid_" prefix to get the instance
                        # attribute name. This relies on the classes
                        # following the valid_ATTRIBUTE convention.
                        attrs.add(valid_attr.replace("valid_", ""))
                except AttributeError:
                    # Not every tag class defines every valid_* collection.
                    continue

    if not attrs:
        print('WARNING: could not find valid class attribute for '
              '"{}", skipping.'.format(arg))
        return lambda t: True

    arg_lower = arg.lower()

    # attrs and arg are scoped into the closure, which is the whole point of
    # get_predicate_function.
    def matchp(t):
        for attr in attrs:
            if attr == "name" and t.name == arg:
                return True
            try:
                # Attribute values (and names that did not match exactly)
                # are compared case-insensitively.
                if getattr(t, attr).lower() == arg_lower:
                    return True
            except (AttributeError, KeyError):
                # Tag lacks the attribute, or its value is not a string.
                pass
        return False

    return matchp
def get_document_dict_by_system_id(system_dirs):
    """Load every ``.xml`` file found directly inside the given directories.

    Each file is parsed into a StandoffAnnotation and stored in a two-level
    mapping: first by system id (StandoffAnnotation.sys_id), then by
    annotation id (StandoffAnnotation.id). The system id is whatever trails
    the XXX-YY file id. For example:

        301-01foo.xml
        patient id:  301
        document id: 01
        system id:   foo

    When nothing trails the document id, the sys_id is the empty
    string ('').

    Args:
        system_dirs: Iterable of directory paths to scan.

    Returns:
        A defaultdict of defaultdicts: ``result[sys_id][id]`` is the
        StandoffAnnotation loaded for that system and document.
    """
    by_system = defaultdict(lambda: defaultdict(int))
    for directory in system_dirs:
        for entry in os.listdir(directory):
            if not entry.endswith('.xml'):
                continue
            annotation = StandoffAnnotation(os.path.join(directory, entry))
            by_system[annotation.sys_id][annotation.id] = annotation
    return by_system
def evaluate(system, gs, eval_class, **kwargs):
    """Evaluate system output against the gold standard with ``eval_class``.

    This function mostly just formats arguments for ``eval_class``: it loads
    the StandoffAnnotation documents, indexes them by annotation id and hands
    them to the evaluation class.

    Args:
        system: List containing either one file, or one or more directories
            of system output.
        gs: Gold standard file or directory.
        eval_class: Subclass of Evaluate or CombinedEvaluation used to score
            the documents.
        **kwargs: Passed through to ``eval_class``. ``verbose`` is removed
            first because it is not a keyword of the evaluation classes; it
            is only forwarded to ``print_report`` in directory mode.

    Returns:
        The evaluation object when exactly one evaluation was run, otherwise
        the list of evaluation objects (one per system id).

    Raises:
        AssertionError: If ``eval_class`` is of the wrong type or a path does
            not exist.
        ValueError: If the arguments are neither "file file" nor
            "directory [directory ...] directory".
    """
    assert issubclass(eval_class, (Evaluate, CombinedEvaluation)), \
        "Must pass in EvaluatePHI or EvaluateCardiacRisk classes to evaluate()"

    # verbose is not a keyword to our eval classes' __init__() functions.
    verbose = kwargs.pop('verbose', False)

    assert os.path.exists(gs), "{} does not exist!".format(gs)
    for path in system:
        assert os.path.exists(path), "{} does not exist!".format(path)

    evaluations = []

    if os.path.isfile(system[0]) and os.path.isfile(gs):
        # Two files were passed on the command line.
        gold_doc = StandoffAnnotation(gs)
        system_doc = StandoffAnnotation(system[0])
        evaluation = eval_class({system_doc.id: system_doc},
                                {gold_doc.id: gold_doc},
                                **kwargs)
        evaluation.print_docs()
        evaluations.append(evaluation)
    elif os.path.isdir(gs) and all(os.path.isdir(path) for path in system):
        # 'gs' is a directory and 'system' is a list of directories. For a
        # single system this is a little overkill, but it lets us run
        # multiple systems against the gold standard and get numbers for
        # each one (useful for annotator agreement and final system
        # evaluations). Checking that the directories hold consistent files
        # is left to the evaluation class.
        gold_sa = {}
        for fn in os.listdir(gs):
            if fn.endswith('.xml'):
                sa = StandoffAnnotation(os.path.join(gs, fn))
                gold_sa[sa.id] = sa

        for system_sa in get_document_dict_by_system_id(system).values():
            evaluation = eval_class(system_sa, gold_sa, **kwargs)
            evaluation.print_report(verbose=verbose)
            evaluations.append(evaluation)
    else:
        raise ValueError("Must pass file.xml file.xml or [directory/]+ "
                         "directory/ on command line!")

    return evaluations[0] if len(evaluations) == 1 else evaluations
def main():
    """Parse the command line and run the requested track evaluation.

    The first argument selects the track: 'phi' for the PHI task or 'cr' for
    the cardiac risk task. Both tracks accept the same options and
    positional arguments (system output followed by the gold standard).
    The parsed values are handed to evaluate(); filter predicates are only
    built, and --invert / --conjunctive only forwarded, when --filter is
    given.
    """
    parser = argparse.ArgumentParser(
        description="Compute precision, recall and F1 of system output "
                    "against gold standard annotations.")
    subparsers = parser.add_subparsers(dest="sp",
                                       help="evaluation track to run")
    # Without this, Python 3 accepts a command line with no sub-command and
    # the attribute lookups below fail with an AttributeError.
    subparsers.required = True

    tracks = (("phi", "evaluate the PHI task"),
              ("cr", "evaluate the cardiac risk task"))
    for track, track_help in tracks:
        sub = subparsers.add_parser(track, help=track_help)
        sub.add_argument('--filter',
                         help="comma separated tag names and attribute "
                              "values to filter on, use with --invert and "
                              "--conjunctive")
        sub.add_argument('--conjunctive',
                         help="if multiple filters are applied, combine "
                              "them with 'and' instead of 'or'",
                         action="store_true")
        sub.add_argument('--invert',
                         help="invert the list of filters, match only tags "
                              "that do not match filter functions",
                         action="store_true")
        sub.add_argument('-v', '--verbose',
                         help="list full document by document scores",
                         action="store_true")
        sub.add_argument("from_dirs",
                         metavar="SYSTEM",
                         help="system output file, or one or more "
                              "directories of system output",
                         nargs="+")
        sub.add_argument("to_dir",
                         metavar="GOLD",
                         help="gold standard file or directory")

    args = parser.parse_args()

    is_phi = args.sp == "phi"
    eval_class = PHITrackEvaluation if is_phi else CardiacRiskTrackEvaluation

    eval_kwargs = {"verbose": args.verbose}
    if args.filter:
        tag_class = PHITag if is_phi else DocumentTag
        eval_kwargs.update(
            invert=args.invert,
            conjunctive=args.conjunctive,
            filters=[get_predicate_function(value, tag_class)
                     for value in args.filter.split(",")])

    evaluate(args.from_dirs, args.to_dir, eval_class, **eval_kwargs)
# Run the command line interface when this module is executed rather than
# imported.
# NOTE(review): the module uses package-relative imports (from .classes,
# from .tags), so this presumably has to be launched as part of its package
# (e.g. with `python -m`) rather than as a bare script - confirm.
if __name__ == "__main__":
    main()