-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPubMedSqliteIterator.py
35 lines (30 loc) · 1.28 KB
/
PubMedSqliteIterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import sqlite3
def fetch_pubmed_abstracts_for_embedding(sqlite_path: str, batch_size: int = 100000):
    """
    Iterate over the ``pubmed_articles`` table in batches of rows.

    Each row carries a single text field assembled in SQL from the title,
    abstract, MeSH terms, keywords and chemicals columns. Columns that are
    NULL are skipped, so a row with no textual content has an empty string
    as its text. The text is aimed at creating embedding vectors for
    retrieval.

    This is a generator: the database is opened when the first batch is
    requested, and the cursor and connection are closed when the generator
    is exhausted, closed early, or an error is raised while fetching.

    :param sqlite_path: The path to the SQLite database file
    :param batch_size: The maximum number of rows per yielded batch
    :return: Yields lists of ``(pmid, text, publication_date)`` row tuples.
        Every batch holds up to ``batch_size`` rows; the final batch may be
        shorter. ``publication_date`` is returned exactly as stored in the
        table.
    """
    # NOTE: this is a regular (non-raw) Python string, so each \n below is
    # turned into a real newline character before SQLite sees the statement.
    sql = """
        SELECT pmid,
               CASE WHEN title IS NULL THEN '' ELSE title || '\n\n' END ||
               CASE WHEN abstract IS NULL THEN '' ELSE abstract || '\n\n' END ||
               CASE WHEN mesh_terms IS NULL THEN '' ELSE 'MeSH terms:\n' || mesh_terms || '\n\n' END ||
               CASE WHEN keywords IS NULL THEN '' ELSE 'Keywords:\n' || keywords || '\n\n' END ||
               CASE WHEN chemicals IS NULL THEN '' ELSE 'Chemicals:\n' || chemicals || '\n\n' END AS text,
               publication_date
        FROM pubmed_articles;
    """
    connection = sqlite3.connect(sqlite_path)
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            while True:
                records = cursor.fetchmany(batch_size)
                if not records:
                    break
                yield records
        finally:
            # Runs on normal exhaustion, on errors, and on GeneratorExit when
            # the consumer stops iterating early.
            cursor.close()
    finally:
        connection.close()