-
Notifications
You must be signed in to change notification settings - Fork 0
/
deduce-test.py
72 lines (62 loc) · 2.31 KB
/
deduce-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python3
"""
deduce-test.py: test the de-identification program deduce
usage: deduce-test.py first_names initials surname given_name < file.ego
note: deduce on github: https://github.com/vmenger/deduce
20190121 erikt(at)xs4all.nl
"""
import deduce
import re
import sys
# Program name, removed from sys.argv at import time so that only the
# positional arguments remain in the list later handed to main().
COMMAND = sys.argv.pop(0)
# Message shown when the argument count is wrong; a space separates the
# program name from the first argument name.
USAGE = "usage: "+COMMAND+" first_names initials surname given_name < file"
# Number of positional arguments main() requires.
ARGVLEN = 4
# Tag printed next to tokens that are not inside any annotation.
OUTSIDENE = "O"
# Maps the label found in an annotation tag to the tag printed for the
# tokens of that annotation. MAIL and URL map to the same value as OUTSIDENE.
CONVERT = {
    "DATUM": "DATE",
    "INSTELLING": "ORG",
    "LEEFTIJD": "NUM",
    "LOCATIE": "LOC",
    "MAIL": "O",
    "PATIENT": "PER",
    "PERSOON": "PER",
    "TELEFOONNUMMER": "NUM",
    "URL": "O",
}
def readTextFromStdin():
    """Read everything available on standard input.

    Returns:
        The complete contents of sys.stdin as one string, line breaks
        included; the empty string when stdin holds no data.
    """
    # One read() call yields the same text as concatenating the lines one
    # by one, without repeatedly rebuilding the growing string.
    return sys.stdin.read()
def annotationStart(token):
    """Check whether a token opens an annotation.

    A token opens an annotation when it begins with "<" directly followed
    by an upper-case letter A-Z.

    Returns:
        The regular expression match object when the token qualifies,
        otherwise None.
    """
    opener = re.compile(r"^<[A-Z]")
    return opener.search(token)
def annotationEnd(token):
    """Check whether a token closes an annotation.

    A token qualifies when it contains ">" preceded by at least one
    other character; anything after the ">" is part of the match.

    Returns:
        The regular expression match object when the token qualifies,
        otherwise None.
    """
    closer = re.compile(r".>.*$")
    return closer.search(token)
def addMissingSpaces(line):
    """Separate angle brackets from text that touches them.

    A space is inserted before every "<" that directly follows a
    non-space character, and after every ">" that is directly followed
    by a non-space character, so that splitting on spaces isolates the
    opening and closing parts of an annotation.

    Returns:
        The line with the extra spaces inserted.
    """
    spacedBeforeOpen = re.sub(r"([^ ])<", r"\1 <", line)
    spacedAfterClose = re.sub(r">([^ ])", r"> \1", spacedBeforeOpen)
    return spacedAfterClose
def printResults(annotatedText):
    """Print the annotated text as one "token<TAB>tag" line per token.

    The text is split into paragraphs on newlines and each paragraph into
    tokens on single spaces. Tokens between an opening "<LABEL" token and
    the token holding the closing ">" are printed with CONVERT[LABEL];
    all other non-empty tokens are printed with OUTSIDENE. An empty line
    is printed after every paragraph.

    Exits the program via sys.exit with an error message when a paragraph
    ends before an opened annotation is closed. A label that is not a key
    of CONVERT raises KeyError.
    """
    for paragraph in annotatedText.split("\n"):
        paragraph = addMissingSpaces(paragraph)
        words = paragraph.split(" ")
        total = len(words)
        pos = 0
        while pos < total:
            word = words[pos]
            if not annotationStart(word):
                # Ordinary token; empty strings come from repeated spaces.
                if word != "":
                    print(word, OUTSIDENE, sep="\t")
                pos += 1
                continue
            # The opening token carries only the label, e.g. "<PERSOON".
            label = re.sub(r"<", r"", word)
            pos += 1
            # Tokens strictly inside the annotation.
            while pos < total and not re.search(r">", words[pos]):
                print(words[pos], CONVERT[label], sep="\t")
                pos += 1
            if pos >= total:
                sys.exit(COMMAND+": entity not closed on line: "+paragraph)
            # Closing token: keep only what precedes the ">".
            lastWord = re.sub(r">.*$", r"", words[pos])
            if lastWord != "":
                print(lastWord, CONVERT[label], sep="\t")
            pos += 1
        print("")
def main(argv):
    """Annotate the text on standard input with deduce and print the result.

    Args:
        argv: list holding exactly ARGVLEN values, in the order
            first_names, initials, surname, given_name.

    Exits the program via sys.exit with the usage message when argv does
    not hold exactly ARGVLEN values. Returns None.
    """
    if len(argv) != ARGVLEN:
        sys.exit(USAGE)
    first_names, initials, surname, given_name = argv
    text = readTextFromStdin()
    # Every option is passed as True, in the same order as before.
    options = {
        "names": True,
        "locations": True,
        "institutions": True,
        "dates": True,
        "ages": True,
        "patient_numbers": True,
        "phone_numbers": True,
        "urls": True,
        "flatten": True,
    }
    annotatedText = deduce.annotate_text(
        text, first_names, initials, surname, given_name, **options
    )
    printResults(annotatedText)
# Script entry point. The program name was already popped off sys.argv
# when COMMAND was defined, so main() receives only the positional
# arguments; its return value is passed to sys.exit as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))