pattern.py (forked from Klim314/Quetzalcoatl)
397 lines (329 loc) · 12.1 KB
#!/usr/bin/env python3
from modules import sent_tokenize as st
from nltk.stem.snowball import SnowballStemmer
from modules import papers
from modules import modfile
import os
import re
from modules import paperparse as pp
import copy
"""
pattern.py
Takes an .sp file produced by pubcrawl, checks if any of the preloaded patterns are present. All members of patterns must
be present and in order. Patterns may be spaced out
ALL PATTERNS USED
Activity A against B
containing A inhibited B
A decreased B
bacteriocins A inhibit B
A compete B
Antagonistic A on B
A antimicrobial B
A antibacterial B
A antagonistic B
A bacteriocin against B
A inhibit B
"""
"""
GLOBALS
"""
#Word Stemmer
stemmer = SnowballStemmer('english')
#Output Directory
outDir = "output/pattern/"
if not os.path.exists(outDir):
    os.mkdir(outDir)
"""""""""""""""""""""
#####################
# PATTERNS #
#####################
"""""""""""""""""""""
#Single-line Patterns
patterns = ["Activity sjA against sjB",
"containing sjA inhibited sjB",
"sjA decreased sjB",
"bacteriocin sjA against sjB",
"bacteriocin sjA inhibit sjB",
"sjA compet sjB",
"Antagonistic sjA on sjB",
"sjA antimicrobial sjB",
"sjA antibacterial sjB",
"sjA antagonistic sjB",
"sjA bacteriocin sjB",
"sjA inhibit sjB",
"sjA inhibitory effect sjB",
"sjA inhibitory activity sjB",
"inhibit sjA by sjB"]
#nPatterns: every pattern in a sub-list must be present for the nPattern to hit
nPatterns = [
    ["bacteriocins produced by sja", "antagonistic effect sjb"],
    ["bacteriocins produced by sja", "inhibit sjb"]
]
#antiPatterns: antibiotic/drug names; a hit on these clears the paper's pattern hits
antiPatterns = [
"(Fluoroquinolone|Lipoglycopeptides|Cephalosporin|Macrocyclics|Penicillins|Amoxicillin|Ampicillin|Bacampicillin|Carbenicillin|Cloxacillin|Dicloxacillin|Flucloxacillin|Mezlocillin|Nafcillin|Oxacillin|Penicillin|Penicillin|Piperacillin|Pivampicillin|Pivmecillinam|Ticarcillin|Cefacetrile|Cefadroxil|Cefalexin|Cefaloglycin|Cefalonium|Cefaloridine|Cefalotin|Cefapirin|Cefatrizine|Cefazaflur|Cefazedone|Cefazolin|Cefradine|Cefroxadine|Ceftezole|Cefaclor|Cefamandole|Cefmetazole|Cefonicid|Cefotetan|Cefoxitin|Cefprozil|Cefuroxime|Cefuzonam|Cefcapene|Cefdaloxime|Cefdinir|Cefditoren|Cefetamet|Cefixime|Cefmenoxime|Cefodizime|Cefotaxime|Cefpimizole|Cefpodoxime|Cefteram|Ceftibuten|Ceftiofur|Ceftiolene|Ceftizoxime|Ceftriaxone|Cefoperazone|Ceftazidime|Cefclidine|Cefepime|Cefluprenam|Cefoselis|Cefozopran|Cefpirome|Cefquinome|Ceftobiprole|Ceftaroline|Cefaclomezine|Cefaloram|Cefaparole|Cefcanel|Cefedrolor|Cefempidone|Cefetrizole|Cefivitril|Cefmatilen|Cefmepidium|Cefovecin|Cefoxazole|Cefrotil|Cefsumide|Cefuracetime|Ceftioxide|Aztreonam|Imipenem|cilastatin|Doripenem|Meropenem|Ertapenem|Azithromycin|Erythromycin|Clarithromycin|Dirithromycin|Roxithromycin|Ketolides|Telithromycin|Lincosamides|Clindamycin|Lincomycin|Pristinamycin|Quinupristin|dalfopristin|Amikacin|Gentamicin|Kanamycin|Neomycin|Netilmicin|Paromomycin|Streptomycin|Tobramycin|Flumequine|Nalidixic|Oxolinic|Piromidic|Pipemidic|Rosoxacin|Ciprofloxacin|Enoxacin|Lomefloxacin|Nadifloxacin|Norfloxacin|Ofloxacin|Pefloxacin|Rufloxacin|Balofloxacin|Gatifloxacin|Grepafloxacin|Levofloxacin|Moxifloxacin|Pazufloxacin|Sparfloxacin|Temafloxacin|Tosufloxacin|Besifloxacin|Clinafloxacin|Gemifloxacin|Sitafloxacin|Trovafloxacin|Prulifloxacin|Sulfonamides|Sulfamethizole|Sulfamethoxazole|Sulfisoxazole|Trimethoprim-Sulfamethoxazole|Demeclocycline|Doxycycline|Minocycline|Oxytetracycline|Tetracycline|Glycylcyclines|Tigecycline|Chloramphenicol|Metronidazole|Tinidazole|Nitrofurantoin|Glycopeptides|Vancomycin|Teicoplanin|Lipoglycopeptides|Telavancin|Oxazolidinones|Linezolid|Cycloserine|Rifamycins|Rifampin|Rifabutin|Rifapentine|Polypeptides|Bacitracin|Polymyxin|Tuberactinomycins|Viomycin|Capreomycin)"
]
#lowercase them all
patterns = [i.lower() for i in patterns]
nPatterns = [[i.lower() for i in j]for j in nPatterns]
antiPatterns = [i.lower() for i in antiPatterns]
"""""""""""""""""""""
#####################
# CLASSES #
#####################
"""""""""""""""""""""
"""
Paper():
Class representation of a single paper. Contains the title, abstract, and their respective stemmed and tokenized forms
"""
class Paper():
    def __init__(self, spFilePaper, spSet = frozenset()):
        #empty default; Pair() passes in the species-name set for the pair
        self.spSet = spSet
        try:
            self.title = spFilePaper["TI "]
        except Exception:
            print("ERRORSPFILE: ", spFilePaper)
            raise
        self.abstract = spFilePaper["AB "]
        self.sTitle = self.tokStem(self.title)
        self.sAbstract = self.tokStem(self.abstract)
    def tokStem(self, paragraph):
        #sentence-tokenize, stem every word, then rejoin each sentence
        temp = st.preprocess(paragraph, self.spSet)
        temp = [[stemmer.stem(i) for i in j] for j in temp]
        return [" ".join(i) for i in temp]
    def export(self):
        print("-------PAPER--------")
        print(self.spSet)
        print(self.title)
        print(self.abstract)
        print(self.sTitle)
        print(self.sAbstract)
        print("-------PAPER--------")
"""
Pair():
Class contained of paper objects for all papers for a species pair.
Takes in a filePath to a set of line-separated, initalizer-tagged papers
from that pair and packages them into Paper() objects
"""
class Pair():
    def __init__(self, filePath):
        #initialize the species sets
        sja, sjb = pp.getNames(filePath)[0], pp.getNames(filePath)[1]
        self.spSet1 = set(sja)
        self.spSet2 = set(sjb)
        self.spSet = self.spSet1.union(self.spSet2)
        #Load the papers
        self.spFile = pp.spFile(filePath, purge = True)
        #unified is a list of tuples: (spFile.papers[i], Paper)
        self.unified = [(i, Paper(i, self.spSet)) for i in self.spFile.papers]
    def test(self, unifiedObject, patternList, antiPatternList):
        flag = 0
        for pattern in patternList:
            titleCheck = pattern.pCheck(unifiedObject[1].sTitle)
            if titleCheck:
                unifiedObject[0]['TIHT'] += ":#:".join(["^#^" + pattern.text] + [i.group(0) for i in titleCheck])
                flag = 1
            abstractCheck = pattern.pCheck(unifiedObject[1].sAbstract)
            if abstractCheck:
                unifiedObject[0]['ABHT'] += ":#:".join(["^#^" + pattern.text] + [i.group(0) for i in abstractCheck])
                flag = 1
        #antiPatternChecker: any antiPattern hit voids this paper's hits
        for antiPattern in antiPatternList:
            if antiPattern.pCheck(unifiedObject[1].sTitle):
                unifiedObject[0]['TIHT'] = ""
                flag = 0
            if antiPattern.pCheck(unifiedObject[1].sAbstract):
                unifiedObject[0]['ABHT'] = ""
                flag = 0
        return flag
    def testAll(self, patternList, antiPatternList, outPath):
        for unifiedObject in self.unified:
            isTrue = self.test(unifiedObject, patternList, antiPatternList)
            if isTrue:
                self.spFile.summary["INT "] = '1'
                self.spFile.summary["NEG "] = '1'
        self.spFile.writeSpFileHits(outPath)
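#Hit annotation format (read off test() above): each pattern hit appends
#"^#^<stemmed pattern text>:#:<matched span>[:#:<matched span>...]" to the paper's
#'TIHT' (title hits) or 'ABHT' (abstract hits) field; a later antiPattern hit
#blanks that field again.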
"""
pattern:
Takes in a pattern sentence. Establishes a regex from the pattern and the initialization variables and uses it to detect informative patterns
in the data.
check(sentence):
takes in a sentence and, using the precompiled regexes, attempts to detect a pattern. returns True if detected, false otherwise
Initialize(sja, sjb):
Compiles multiple regex variations of the pattern sentence from two input species
"""
class Pattern():
    def __init__(self, text):
        self.text = text
        self.regexes = []
    def export(self):
        return self.text
    def initialize(self, sja, sjb):
        #cleanup
        self.regexes = []
        sja, sjb = sja.lower(), sjb.lower()
        #full and abbreviated forms of each species name
        sp1 = [sja, abb(sja, regex = 1)]
        sp2 = [sjb, abb(sjb, regex = 1)]
        #base template tokens; "sja" and "sjb" are the placeholders to substitute
        baseFlags = self.text.split(' ')
        #create forward matches (sja ... sjb)
        for j in sp1:
            for k in sp2:
                #work on a fresh copy so every name combination gets substituted
                flags = list(baseFlags)
                for i in range(len(flags)):
                    if flags[i] == "sja":
                        flags[i] = j
                    elif flags[i] == "sjb":
                        flags[i] = k
                try:
                    #tokens may be separated by space/punctuation and any intervening text
                    self.regexes.append(re.compile("(" + "[ -\\.].*".join(flags) + ")"))
                except Exception:
                    print("RegexError: ", flags)
                    print("sja: ", sja)
                    print("sjb: ", sjb)
                    raise
        #create reverse matches (sjb ... sja)
        for j in sp2:
            for k in sp1:
                flags = list(baseFlags)
                for i in range(len(flags)):
                    if flags[i] == "sja":
                        flags[i] = j
                    elif flags[i] == "sjb":
                        flags[i] = k
                try:
                    self.regexes.append(re.compile("(" + "[ -\\.].*".join(flags) + ")"))
                except Exception:
                    print("RegexError: ", flags)
                    print("sja: ", sja)
                    print("sjb: ", sjb)
                    raise
        return
    def check(self, sentence):
        holder = []
        for regex in self.regexes:
            # print("REGEX: ", regex)
            temp = regex.search(sentence)
            if temp:
                holder.append(temp)
                # print("HIT")
        return holder
    def pCheck(self, paragraph):
        # print(paragraph)
        # print(self.regexes)
        holder = []
        for sentence in paragraph:
            # print("CHECKING: ", sentence)
            temp = self.check(sentence)
            if temp:
                holder.extend(temp)
        return holder
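#Editorial usage sketch: not part of the original module and never called; the
#species names below are hypothetical placeholders used only for illustration.
def _examplePatternUsage():
    p = Pattern("sja inhibit sjb")
    p.initialize("lactobacillus plantarum", "escherichia coli")
    #check() returns a list of regex match objects (here the full-name regex hits)
    hits = p.check("lactobacillus plantarum inhibit growth of escherichia coli")
    return [h.group(0) for h in hits]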
class nPattern():
    """A conjunction of Patterns: every sub-pattern must hit for pCheck to report anything."""
    def __init__(self, textList):
        self.patterns = [Pattern(i) for i in textList]
        self.text = "|||".join([pattern.text for pattern in self.patterns])
    def initialize(self, sja, sjb):
        for pattern in self.patterns:
            pattern.initialize(sja, sjb)
    def pCheck(self, paragraph):
        #temp tracks the sub-patterns that have not matched yet
        temp = [i for i in self.patterns]
        holder = [[] for i in self.patterns]
        for index, pattern in enumerate(self.patterns):
            checkData = pattern.pCheck(paragraph)
            if checkData:
                holder[index] = checkData[0]
                temp.remove(pattern)
        # print("HOLDER", holder)
        # print(temp)
        if len(temp) == 0:
            return holder
        return []
    def export(self):
        for i in self.patterns:
            print(i.regexes)
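#Editorial note: an nPattern only reports a hit when all of its sub-patterns
#matched somewhere in the paragraph, e.g. the nPatterns entry
#["bacteriocins produced by sja", "inhibit sjb"] requires both clauses, possibly
#in different sentences; otherwise pCheck returns [].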
"""
abb
Takes in a string of format A B, where A is the genus and B the Species. Returns abbreviated species name
"""
def abb(spec, regex = 0):
    temp = spec.split(' ')
    if regex:
        spec = temp[0][0] + '. ' + temp[1]
        return spec.replace(".", "\\.")
    return temp[0][0] + '. ' + temp[1]
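#Examples (editorial): abb("escherichia coli") -> "e. coli";
#abb("escherichia coli", regex = 1) -> "e\. coli" (period escaped for regex use).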
"""
makeName
"""
def makeName(sja, sjb):
sja = '_'.join(sja.split(" "))
sjb = '_'.join(sjb.split(" "))
return sja+'#' + sjb + ".sp"
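#Example (editorial): makeName("escherichia coli", "lactobacillus acidophilus")
#returns "escherichia_coli#lactobacillus_acidophilus.sp".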
"""
makePatterns(patternStringList):
takes in a list of pattern strings and processes them (tokenizing, stemming) into Pattern objects
"""
def makePatterns(patternStringList):
patterns = [st.preprocess(i)[0] for i in patternStringList]
patterns = [[stemmer.stem(j) for j in i ] for i in patterns]
patterns = [" ".join(i) for i in patterns]
return [Pattern(i) for i in patterns]
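#Editorial sketch (assumes st.preprocess word-tokenizes the single pattern string
#it is given): a lowercased template such as "sja inhibitory effect sjb" would
#come back from makePatterns as a Pattern whose .text is approximately
#"sja inhibitori effect sjb", i.e. the same Snowball stems that Paper.tokStem
#produces for the paper text, which is what lets the regexes line up.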
def makeNpatterns(nPatternStringList):
    #same preprocessing as makePatterns, applied to every member of each sub-list
    patterns = [[st.preprocess(i)[0] for i in j] for j in nPatternStringList]
    # print(patterns)
    patterns = [[[stemmer.stem(word) for word in sentence] for sentence in sentenceTup] for sentenceTup in patterns]
    patterns = [[" ".join(i) for i in j] for j in patterns]
    return [nPattern(i) for i in patterns]
def test(paperObject, patternList):
    #True if any pattern hits the title or the abstract
    for pattern in patternList:
        if pattern.pCheck(paperObject.sTitle) or pattern.pCheck(paperObject.sAbstract):
            return True
    return False
"""
execute(filePath):
Summary script. Loads all papers in the filepath and tests them with the preloaded patternStringList.
TODO:
allow customization of output folder
"""
def execute(filePath, postOutDir = ""):
    #Extraction of subject names
    names = pp.getNames(filePath)
    sja, sjb = stemmer.stem(names[0][0]), stemmer.stem(names[1][0])
    #Creation of the output path
    if postOutDir != "" and postOutDir[-1] != "/":
        postOutDir += "/"
    outPath = outDir + postOutDir + makeName(names[0][0], names[1][0])
    #Build and initialize the pattern objects for this species pair
    patternList = makePatterns(patterns)
    patternList += makeNpatterns(nPatterns)
    antiPatternList = makePatterns(antiPatterns)
    # reverseList = makePatterns(patterns)
    for pattern in patternList:
        pattern.initialize(sja, sjb)
    for pattern in antiPatternList:
        pattern.initialize(sja, sjb)
    pairPapers = Pair(filePath)
    pairPapers.testAll(patternList, antiPatternList, outPath)
def debug(filePath):
    #Extraction of subject names
    names = pp.getNames(filePath)
    sja, sjb = names[0][0], names[1][0]
    #patternList = makePatterns(patterns)
    patternList = makeNpatterns(nPatterns)
    # reverseList = makePatterns(patterns)
    for pattern in patternList:
        pattern.initialize(sja, sjb)
    pairPapers = Pair(filePath)
    print(pairPapers.spFile.papers[0])
    print(pairPapers.unified[0][1].sAbstract)
    print(patternList[0].pCheck(pairPapers.unified[0][1].sAbstract))
if __name__ == "__main__":
    #target = "annotated/patternscan/Escherichia_coli#Lactobacillus_acidophilus.sp"
    target = "annotated/patternscan/precision/200_samples_set_2/evaluate/missed_hits/Lactobacillus_fermentum#Gardnerella_vaginalis.sp"
    #target = "annotated/patternscan/precision/200_samples_set_2/evaluate/false_hits/acinetobacter_sp.#acinetobacter_baumannii.sp"
    #debug(target)
    execute(target)