-
Notifications
You must be signed in to change notification settings - Fork 0
/
PubMedParse.py
33 lines (29 loc) · 1.46 KB
/
PubMedParse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# import HTTP requests module
import requests
# import XML DOM parser
from xml.dom import minidom
import ParseException
# PubMed records web-service parser
class PubMedParser:
    """Fetch a single PubMed record from a web service and extract its abstract.

    The record is requested as XML and the text of its one ``AbstractText``
    element is returned. Records with no ``AbstractText`` element, or with
    more than one, are rejected with ``ParseException.ParseException``.
    """

    # Fallback used by instances created without running __init__.
    timeout = 30

    # construct parser with default web URL
    def __init__(self, webserviceurl="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", timeout=30):
        """Create a parser bound to a web-service endpoint.

        Args:
            webserviceurl: URL the record request is POSTed to.
            timeout: Value passed to ``requests.post`` as ``timeout`` (seconds,
                or a ``(connect, read)`` tuple). ``None`` disables the timeout
                and waits indefinitely.
        """
        self.ws_url = webserviceurl
        self.timeout = timeout

    # Obtain and parse data from remote
    def parse(self, PMID):
        """Download the record for ``PMID`` and return its abstract text.

        Args:
            PMID: Identifier sent to the web service as the ``id`` form field.

        Returns:
            The abstract as a string, including text nested inside inline
            child elements of ``AbstractText``.

        Raises:
            ParseException.ParseException: If the response holds no
                ``AbstractText`` element, more than one, or an empty one.
            requests.exceptions.RequestException: If the HTTP request fails
                or exceeds the configured timeout.
        """
        # send post request to PubMed web-service; the timeout keeps a stalled
        # connection from blocking the caller forever
        pub_med_response = requests.post(
            self.ws_url,
            data={'db': 'PubMed', 'retmode': 'XML', 'id': PMID},
            timeout=self.timeout,
        )
        # The raw bytes are handed to the XML parser, which decodes them
        # itself; overriding ``response.encoding`` would only affect
        # ``response.text``, which is not used here.
        return self._extract_abstract(pub_med_response.content, PMID)

    @staticmethod
    def _extract_abstract(xml_payload, PMID):
        """Return the abstract text found in an XML payload for ``PMID``."""
        # construct DOM tree from the response
        pub_med_article = minidom.parseString(xml_payload)
        # get all elements for AbstractText field
        abstract_text_nodes = pub_med_article.getElementsByTagName("AbstractText")
        # throw an exception if there are no abstracts in the response
        if not abstract_text_nodes:
            raise ParseException.ParseException("There is no abstract for the PMID {}.".format(PMID))
        # throw an exception if there are too many abstracts to parse
        if len(abstract_text_nodes) > 1:
            raise ParseException.ParseException("Too many abstracts in the response for PMID {}.".format(PMID))
        # Collect every text fragment, not only the first child: reading just
        # ``firstChild.nodeValue`` truncates the abstract at the first inline
        # tag and fails outright when the element is empty.
        abstract = PubMedParser._node_text(abstract_text_nodes[0])
        if not abstract:
            # An element with no text at all carries no abstract.
            raise ParseException.ParseException("There is no abstract for the PMID {}.".format(PMID))
        return abstract

    @staticmethod
    def _node_text(node):
        """Concatenate, in document order, all text beneath a DOM node."""
        fragments = []
        for child in node.childNodes:
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
                fragments.append(child.data)
            elif child.nodeType == child.ELEMENT_NODE:
                fragments.append(PubMedParser._node_text(child))
        return "".join(fragments)