-
Notifications
You must be signed in to change notification settings - Fork 0
/
PyPubmedText.py
executable file
·416 lines (303 loc) · 11.5 KB
/
PyPubmedText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import MySQLdb
from Bio import Entrez
from Bio import Medline
import xml.parsers.expat
from cStringIO import StringIO
from ConfigParser import SafeConfigParser
import optparse
import ReadConfig
import sys
class NcbiArticle:
    """Plain record holding one PubMed or PMC article.

    Every field starts out as an empty string and is filled in by the
    loader functions of this module (from the local database or from a
    PubMed fetch).
    """

    def __init__(self):
        self.id_ext = ''         # PMID or PMCID
        self.source = ''         # PubMed or PMC
        self.xml = ''            # the xml format of the article
        self.id_map = ''         # only used for the PMC articles
        self.text_title = ''
        self.text_abstract = ''
        self.text_body = ''
        self.journal = ''
        self.id_issn = ''
        self.volume = ''
        self.issue = ''
        self.pages = ''
        self.date = ''
        self.affiliation = ''
        self.authors = ''        # '|'-separated author names (a string, not a list)
        self.pubType = ''        # publication type
        self.mesh_terms = ''
        self.supplMesh = ''

    def displayArticle(self):
        """Print the external id and the title, separated by one space."""
        # Single formatted argument: same output as `print a, b`, and the
        # statement also parses under Python 3.
        print('%s %s' % (self.id_ext, self.text_title))

    def getText(self):
        """Return the title immediately followed by the abstract.

        A field that is None (e.g. a NULL database column) is treated as
        an empty string instead of raising TypeError.
        """
        return (self.text_title or '') + (self.text_abstract or '')
def getArticlesById(idList, articleType, db, dbTables):
cur = db.cursor()
if cur is None:
print 'Database connection not valid. Please check'
tableName = dbTables['MEDLINE'] # for PubMed articles
meshTable = dbTables['MESH_MEDLINE']
if articleType == 'pmc': # for PMC articles
tableName = dbTables['PMC']
meshTable = dbTables['MESH_PMC']
artMap = {}
cnt = 0
unicode_cnt = 0
for id in idList:
sql = "select * from " + tableName + " where id_ext = '%s';" % id
cur.execute(sql) #execute many queries in a batch
row = cur.fetchone()
if row is None:
continue
article = NcbiArticle()
article.xml = row[1]
article.id_ext = row[2]
article.source = row[3]
article.text_title = row[5]
article.text_abstract = row[6]
article.text_body = row[7]
article.pubType = row[9]
article.authors = row[11]
article.date = row[12]
article.journal = row[13]
article.id_issn = row[14]
article.volume = row[15]
article.issue = row[16]
article.pages = row[17]
if article.text_abstract is None or len(article.text_abstract) == 0:
continue
medline_citation = parseXml(article.xml)
if medline_citation is not None:
mesh_terms = getMesh(medline_citation)
article.mesh_terms = '|'.join(mesh_terms)
article.supplMesh = '|'.join(getSupplMesh(medline_citation))
article.affiliation = getAffiliation(medline_citation)
else:
unicode_cnt = unicode_cnt + 1
continue
for k,v in article.__dict__.items():
if not k.startswith("__"):
if isinstance(v, unicode):
setattr(article, k, v.encode('utf-8'))
artMap[article.id_ext] = article
cnt = cnt + 1
if cnt % 100 == 0:
print cnt, ' records processed.'
cur.close()
print unicode_cnt, ' records contains unicode'
return artMap
def parseXml(xml):
    """Parse one stored citation XML document with Bio.Entrez.

    Before parsing, the text "MedlineCitationSet" is replaced by
    "PubmedArticle" and a PubMed DOCTYPE declaration is appended to the
    XML declaration, then the result is handed to Entrez.read().

    Returns the object produced by Entrez.read(), or None when `xml` is
    None/empty or cannot be parsed.
    """
    if not xml:
        # Nothing to parse (e.g. NULL column); report it like a parse failure.
        return None

    xml = xml.replace("MedlineCitationSet", "PubmedArticle")
    xml = xml.replace('<?xml version="1.0" encoding="UTF-8"?>', '<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st May 2013//EN" "http://www.ncbi.nlm.nih.gov/corehtml/query/DTD/pubmed_130501.dtd">')
    try:
        # There is only one citation record per document.
        return Entrez.read(StringIO(xml))
    except Exception:
        # Best effort: callers skip unparseable records. Catching Exception
        # (not a bare except) lets Ctrl-C and SystemExit propagate.
        return None
def getMesh(medline_citation):
    """Return the MeSH terms of a parsed citation as a list of strings.

    Each heading yields 'Descriptor' when it has no (non-empty) qualifier,
    otherwise one 'Descriptor/Qualifier' entry per qualifier. A trailing
    '*' is added when the MajorTopicYN attribute of the descriptor
    (unqualified case) or of the qualifier is 'Y'. An empty list is
    returned when the citation has no 'MeshHeadingList'.
    """
    citation = medline_citation['MedlineCitation']
    if 'MeshHeadingList' not in citation:
        return []

    terms = []
    for heading in citation['MeshHeadingList']:
        descriptor = heading['DescriptorName']
        descriptor_major = descriptor.attributes['MajorTopicYN']

        qualifiers = [q for q in heading['QualifierName'] if len(q) > 0]
        # Keyed by qualifier text: a repeated qualifier takes the flag of
        # its last occurrence.
        qualifier_major = {}
        for q in qualifiers:
            qualifier_major[q] = q.attributes['MajorTopicYN']

        if not qualifiers:
            terms.append(descriptor + '*' if descriptor_major == 'Y' else descriptor)
            continue

        for q in qualifiers:
            term = descriptor + '/' + q
            if qualifier_major[q] == 'Y':
                term = term + '*'
            terms.append(term)
    return terms
def getSupplMesh(medline_citation):
    """Return the supplementary MeSH names of a parsed citation.

    Each entry of 'SupplMeshList' becomes 'Name[Type]', where Type is the
    entry's 'Type' attribute. Returns an empty list when the citation has
    no 'SupplMeshList'.
    """
    citation = medline_citation['MedlineCitation']
    if 'SupplMeshList' not in citation:
        return []
    return [name + '[%s]' % name.attributes['Type']
            for name in citation['SupplMeshList']]
def getAffiliation(medline_citation):
    """Return the article-level 'Affiliation' of a parsed citation.

    Returns '' when the 'Article' element has no 'Affiliation' entry.
    """
    article = medline_citation['MedlineCitation']['Article']
    return article['Affiliation'] if 'Affiliation' in article else ''
def getArticlesFromPubmed(idList, email):
artMap = {}
batch_size = 200
cnt = 0
#[seq[start:start + 20] for start in range(0, len(seq), 20)]
for sublist in [idList[start:start + batch_size] for start in range(0, len(idList), batch_size)]:
#Now retrieving the articles from PubMed (using PMIDs in the sublist)
Entrez.email = email
handle=Entrez.efetch(db='pubmed',id=sublist, retmode='xml')
records = Entrez.read(handle)
for record in records:
'''
for absEle in record["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
print absEle.encode('utf-8').strip()
'''
if 'MedlineCitation' not in record:
continue
articleEle = record["MedlineCitation"]["Article"]
article = NcbiArticle()
pmid = record["MedlineCitation"]["PMID"]
article.id_ext = pmid
article.mesh_terms = '|'.join(getMesh(record))
article.supplMesh = '|'.join(getSupplMesh(record))
article.text_title = articleEle["ArticleTitle"].encode('utf-8')
article.text_abstract = ''
if 'Abstract' in articleEle:
for absEle in articleEle["Abstract"]["AbstractText"]:
article.text_abstract = article.text_abstract + absEle.encode('utf-8').strip()
article.pubType = '|'.join(articleEle["PublicationTypeList"])
if 'Affiliation' in articleEle:
article.affiliation = articleEle['Affiliation'].encode('utf-8')
#print article.pubType
if 'AuthorList' in articleEle:
authors = []
#print articleEle['AuthorList']
for author in articleEle['AuthorList']:
if 'CollectiveName' in author:
authorName = author['CollectiveName']
else:
lastName = ''
foreName = ''
initials = ''
if 'LastName' in author:
lastName = author['LastName']
if 'ForeName' in author:
foreName = author['ForeName']
if 'Initials' in author:
initials = author['Initials']
authorName = ' '.join([lastName, foreName])
authors.append(authorName)
article.authors = '|'.join(authors).encode('utf-8')
journalEle = articleEle['Journal']
if 'ISSN' in journalEle:
journalISSN = journalEle['ISSN']
journalIssueEle = journalEle['JournalIssue']
journalTitle = journalEle['Title'].encode('utf-8')
article.journal = journalTitle
article.id_issn = journalISSN
if 'Volume' in journalIssueEle:
article.volume = journalIssueEle['Volume']
if 'Issue' in journalIssueEle:
article.issue = journalIssueEle['Issue']
if 'ArticleDate' in articleEle:
articleDate = articleEle['ArticleDate']
dateStr = ''
if len(articleDate) > 0:
# articleDate is a single element list
year = articleDate[0]['Year']
month = articleDate[0]['Month']
day = articleDate[0]['Day']
dateStr = '-'.join([year, month, day])
else:
dateStr = '-'.join(journalIssueEle['PubDate'].values())
else:
#month = journalIssueEle['PubDate']['Month']
#Some articles don't have 'Month', they have 'Season'
dateStr = '-'.join(journalIssueEle['PubDate'].values())
article.date = dateStr
article.pages = articleEle['Pagination']['MedlinePgn']
artMap[pmid] = article
cnt = cnt + 1
if cnt % 100 == 0:
print cnt, 'records fetched from PubMed'
print len(artMap), 'records fetched from PubMed in total.'
return artMap
def insert2DB(artMap, db, dbTables):
insert_cnt = 0
supplMesh_cnt = 0
dbCur = db.cursor()
if dbCur is None:
print 'Database connection not valid. Please check'
for pmid in artMap.keys():
article = artMap[pmid]
insert_sql = 'REPLACE into ' + dbTables['CORPUS_TEXT_TABLE'] + """ (id_ext, text_title, text_body, xml, text_abstract, authors, date, article_type, mesh_terms, journal, affiliation, text_raw) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s, NULL)"""
try:
dbCur.execute(insert_sql, (article.id_ext, article.text_title, article.text_body, article.xml, article.text_abstract, article.authors, article.date, article.pubType, article.mesh_terms, article.journal, article.affiliation))
except Exception, err:
sys.stderr.write('ERROR: %s\n' % str(err))
print 'Error-causing document ID:', pmid
exit(-1)
insert_cnt = insert_cnt + 1
if insert_cnt % 100 == 0:
print insert_cnt, 'records inserted'
supplMesh = article.supplMesh
if len(supplMesh) > 0:
#Need to insert to a different table
supplMesh_sql = 'REPLACE INTO ' + dbTables['SUPPL_MESH_TABLE'] + ' (id_ext, supplMesh) values(%s,%s)'
dbCur.execute(supplMesh_sql, (pmid, supplMesh))
supplMesh_cnt = supplMesh_cnt + 1
dbCur.close()
print supplMesh_cnt, 'supplementary concepts found'
print 'Insert2DB finished!'
def rebuildCorpus(corpus, db, dbTables, email, useLocal):
pmidList = []
if type(corpus).__name__ == 'list':
pmidList = corpus
if type(corpus).__name__ == 'str':
pmidList = [line.strip() for line in open(corpus)]
#Clear the corpus first
if db is None:
print 'You need to have a valid DB connection first!'
return
else:
print 'Now rebuilding thet thyroid cancer corpus'
dbCur = db.cursor()
clear_sql = "delete from %s" % dbTables['CORPUS_TABLE']
dbCur.execute(clear_sql)
print 'Corpus table cleared. '
clear_sql = "delete from %s" % dbTables['CORPUS_TEXT_TABLE']
dbCur.execute(clear_sql)
print 'Thyroid cancer text table cleared'
for pmid in pmidList:
insert_sql = "insert into %s (id_ext) values(\'%s\')" % (dbTables['CORPUS_TABLE'], pmid)
dbCur.execute(insert_sql)
print len(pmidList), 'records inserted to the thyroid cancer corpus'
dbCur.close()
print 'Now fetching PubMed articles'
dbArtMap = {}
if useLocal == 'Y':
print 'from local DB first'
dbArtMap = getArticlesById(pmidList, 'pubmed', db, dbTables)
print len(dbArtMap), 'articles fetched from local database'
#Insert
insert2DB(dbArtMap, db, dbTables)
print 'Now fetching directly from PubMed'
waitList = []
for pmid in set(pmidList) ^ set(dbArtMap.keys()):
waitList.append(pmid)
print len(waitList), 'articles to be fetched from PubMed'
artMap = getArticlesFromPubmed(waitList, email)
insert2DB(artMap, db, dbTables)
if __name__ == "__main__":
(corpus, dbConfig, dbTables,miscConfig) = ReadConfig.config(sys.argv)
db = MySQLdb.connect(host=dbConfig['dbHost'], # your host, usually localhost
user=dbConfig['dbUser'], # your username
passwd=dbConfig['dbPass'] , # your password
db=dbConfig['dbSchema'],
charset=dbConfig['dbCharset'],
use_unicode=dbConfig['dbUnicode']) # name of the data base
print 'Database connected.'
rebuildCorpus(corpus, db, dbTables, miscConfig['email'], miscConfig['use_local'])
db.close()