forked from tommasoc80/EventStoryLine
-
Notifications
You must be signed in to change notification settings - Fork 9
/
baseline_OP.py
132 lines (90 loc) · 3.78 KB
/
baseline_OP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import sys
import os
import os.path
from lxml import etree
import collections
from create_gold_document import create_folder
from itertools import combinations
"""
Baseline system V1 for PLOT_LINK detection
- assume that all events are correctly identified and classified
- assume relations only in same sentences
- PLOT_LINK exists between every pair of events in the same sentence, excluding classes (NEG_)ACTION_REPORTING
and (NEG_)ACTION_CAUSTIVE (NEG_)ACTION_ASPECTUAL
"""
d = dict()
def check_path(filepath):
if os.path.isdir(filepath):
if filepath[-1] != '/':
filepath += '/'
return filepath
def extract_event_CAT(etreeRoot):
event_dict = collections.defaultdict(list)
not_good_events = ['ACTION_REPORTING', 'NEG_ACTION_REPORTING', 'ACTION_CAUSATIVE', 'NEG_ACTION_CAUSATIVE', 'ACTION_ASPECTUAL', 'NEG_ACTION_ASPECTUAL']
for elem in etreeRoot.findall('Markables/'):
for token_id in elem.findall('token_anchor'):
if elem.tag.startswith("ACTION") or elem.tag.startswith(("NEG_ACTION")):
if elem.tag not in not_good_events:
event_mention_id = elem.get('m_id', 'nothing')
token_mention_id = token_id.get('t_id', 'nothing')
event_dict[event_mention_id].append(token_mention_id)
return event_dict
def event_sentence(etreeRoot, d):
"""
Identify events in the same sentence ONLY
:param etreeRoot: CAT file ECB+
:param d: dictionary annotated event mentions ECB+; key = markable id; v = token ids
:return:
"""
event_sentence_dict = collections.defaultdict(list)
for elem in etreeRoot.findall('token'):
sentence_id = elem.attrib.get("sentence", "null")
token_match = elem.attrib.get("t_id", "null")
for k, v in d.items():
token_id = v[0]
if token_match == token_id:
event_sentence_dict[sentence_id].append(k)
return event_sentence_dict
def generate_event_pairs(d):
same_sentence_event_pairs = {}
for k, v in d.items():
if len(v) >= 2:
# same_sentence_pairs = ["\t".join(map(str, comb)) for comb in combinations(v, 2)]
same_sentence_pairs = [tuple(map(str, comb)) for comb in combinations(v, 2)]
same_sentence_event_pairs[k] = same_sentence_pairs
return same_sentence_event_pairs
def produce_output(inputf, outfile):
ecbplus = etree.parse(inputf, etree.XMLParser(remove_blank_text=True))
root_ecbplus = ecbplus.getroot()
root_ecbplus.getchildren()
event_mentions = extract_event_CAT(ecbplus)
event_per_sentence = event_sentence(ecbplus, event_mentions)
event_pairs = generate_event_pairs(event_per_sentence)
# print(event_mentions)
for k, v in event_pairs.items():
for i in v:
output = open(outfile, "a")
output.writelines("_".join(event_mentions[i[0]]) + "\t" + "_".join(event_mentions[i[1]]) + "\tPRECONDITION" + "\n")
output.close()
def baseline_v1(input, outdir):
input_dir = check_path(input)
ecb_subfolder = os.path.dirname(input_dir).split("/")[-1]
final_outdir = os.path.join(outdir, ecb_subfolder)
if final_outdir[-1] != '/':
final_outdir += '/'
create_folder(final_outdir)
output_dir = check_path(final_outdir)
file_names_ecbplus = [(input_dir, f) for f in os.listdir(input_dir)]
for f in file_names_ecbplus:
if f[1].endswith("plus.xml"):
outfile = output_dir + f[1].split(".xml")[0] + ".base.out"
produce_output(input_dir + f[1], outfile)
def main(argv = None):
if argv is None:
argv = sys.argv
if len(argv) < 3:
print("Usage python3 baseline_v1.py ECBplus outfolder")
else:
baseline_v1(argv[1], argv[2])
if __name__ == '__main__':
main()