-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubmed.py
78 lines (67 loc) · 2.7 KB
/
pubmed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import datetime
import json
import os
import re

from Bio import Entrez
def save_data(papers, filename='data/papers_data.json'):
    """Write a reduced view of each PubMed record to a JSON file.

    Every record is reduced to its title, abstract, author names, publication
    date, PMID and whatever is stored under ``PubmedData -> ArticleIdList``
    (saved under the ``pmcid`` key).

    Args:
        papers: Iterable of mapping-like records. Each must provide
            ``MedlineCitation`` containing ``Article`` and ``PMID``.
        filename: Destination path. Missing parent directories are created.

    Raises:
        KeyError: If a record lacks ``MedlineCitation``, ``Article``, ``PMID``
            or the ``Journal -> JournalIssue -> PubDate`` chain.
        OSError: If the directory or file cannot be created or written.
    """
    formatted_papers = []
    for paper in papers:
        citation = paper['MedlineCitation']
        article = citation['Article']

        # Authors with neither ForeName nor LastName produce an empty string
        # and are left out.
        candidate_names = (
            ' '.join([author.get('ForeName', ''), author.get('LastName', '')]).strip()
            for author in article.get('AuthorList', [])
        )
        formatted_authors = [name for name in candidate_names if name]

        # An abstract may be split into several sections; keep all of them
        # rather than only the first, and tolerate an empty section list.
        sections = article.get('Abstract', {}).get('AbstractText', [])
        if isinstance(sections, str):
            sections = [sections]
        section_texts = (str(section).strip() for section in sections)
        abstract = ' '.join(text for text in section_texts if text)

        formatted_papers.append({
            'title': article.get('ArticleTitle'),
            'abstract': abstract or 'No abstract available',
            'authors': formatted_authors,
            'pub_date': article['Journal']['JournalIssue']['PubDate'],
            'pmid': citation['PMID'],
            'pmcid': paper.get('PubmedData', {}).get('ArticleIdList', []),
        })

    # The default path lives in a sub-directory that may not exist yet.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save to a JSON file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(formatted_papers, f, indent=4)
def clean_text(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()
def search(query, start_year=2001):
    """Run an ESearch against the ``pubmed`` database and return the parsed reply.

    The request asks for relevance-sorted results, at most 5000 of them,
    limited to the range 1 January of ``start_year`` through 31 December of
    the current year.

    Args:
        query: Search term passed to the service as ``term``.
        start_year: First year of the date range.

    Returns:
        Whatever ``Entrez.read`` produces for the response; the script at the
        bottom of this file reads its ``'IdList'`` entry.
    """
    # NOTE(review): this literal is not a valid e-mail address; it should be
    # replaced with a real contact address before use.
    Entrez.email = '[email protected]'
    current_year = datetime.datetime.now().year
    # NOTE(review): no ``datetype`` is passed, so the date field the range
    # applies to is left to the service default -- confirm that is intended.
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='5000',
                            retmode='xml',
                            term=query,
                            mindate=f"{start_year}/01/01",
                            maxdate=f"{current_year}/12/31")
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()
def fetch_details(id_list):
    """Fetch PubMed records for the given IDs and keep those that have a PMCID.

    For every kept record, ``paper['PubmedData']['ArticleIdList']`` is
    overwritten with a single value: the first article ID whose string form
    starts with ``'PMC'``.

    Args:
        id_list: Iterable of PubMed IDs. Items are converted with ``str``.

    Returns:
        A list of the fetched records that carry at least one PMC ID; an
        empty list when ``id_list`` is empty (no request is sent).
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    if not ids:
        # Nothing to look up; avoid sending a request with an empty ``id``.
        return []

    # NOTE(review): this literal is not a valid e-mail address; it should be
    # replaced with a real contact address before use.
    Entrez.email = '[email protected]'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()

    papers_with_pmcid = []
    for paper in results['PubmedArticle']:
        pubmed_data = paper.get('PubmedData')
        if not pubmed_data:
            # No ID section at all, so there cannot be a PMCID.
            continue
        pmcids = [
            article_id
            for article_id in pubmed_data.get('ArticleIdList', [])
            if str(article_id).startswith('PMC')
        ]
        if pmcids:
            # Only include the first PMCID (in case there are multiple)
            pubmed_data['ArticleIdList'] = pmcids[0]
            papers_with_pmcid.append(paper)
    return papers_with_pmcid
def main():
    """Search PubMed for the example query and save the papers that have a PMCID."""
    # Example usage
    query = 'dictyostelium discoideum'
    results = search(query)
    id_list = results['IdList']
    # With no search hits there is nothing to fetch; still write an (empty)
    # output file and report the count.
    papers = fetch_details(id_list) if id_list else []
    save_data(papers)
    print(f"Found {len(papers)} papers with PMCID.")


# Run the example only when executed as a script, so that importing this
# module does not trigger network requests or file writes.
if __name__ == '__main__':
    main()