Skip to content

Commit

Permalink
added auto download to umls
Browse files Browse the repository at this point in the history
  • Loading branch information
DylanWelzel committed Apr 18, 2024
1 parent 7ad851f commit 5d126b0
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 23 deletions.
42 changes: 32 additions & 10 deletions src/hub/dataload/sources/umls/umls_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,52 @@ def get_latest_release(self):
res = self.client.get(self.__class__.HOMEPAGE_URL)
# Raise error if status is not 200
res.raise_for_status()
html = bs4.BeautifulSoup(res.text, 'lxml')
html = bs4.BeautifulSoup(res.text, "lxml")
# Get the table of metathesaurus release files
table = html.find("table", attrs={"class": "usa-table margin-bottom-4"})
rows = table.find_all('tr')
table = html.find(
"table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
)
rows = table.find_all("tr")
# The header of the fifth column should be 'Date'
assert rows[0].find_all('th')[4].text.strip() == 'Date', "Could not parse version from html table."
version = rows[1].find_all('td')[4].text
assert (
rows[0].find_all("th")[4].text.strip() == "Date"
), "Could not parse version from html table."
version = rows[1].find_all("td")[4].text
try:
latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
latest = datetime.date.strftime(
dtparser.parse(version), "%Y-%m-%d")
return latest
except Exception as e:
raise DumperException("Can't find or parse date from table field {}: {}" % (version, e))
raise DumperException(
"Can't find or parse date from table field {}: {}" % (
version, e)
)

def create_todump_list(self, force=True):
    """Decide whether a new dump is needed and register the files to fetch.

    Sets ``self.release`` to the latest release date scraped from the NLM
    site, and when a dump is warranted, creates the versioned data folder
    and queues a single dummy file (the release notes page) so the dump is
    marked successful and the uploader is triggered.

    Args:
        force: when True (default), always re-dump regardless of the
            previously recorded release.
    """
    self.release = self.get_latest_release()
    # Re-dump when forced, when no previous dump is recorded, or when the
    # recorded release is older than the latest one on the NLM site.
    # NOTE: `not self.src_doc` already short-circuits, so no extra
    # truthiness guard is needed before the .get() chain.
    if (
        force
        or not self.src_doc
        or self.src_doc.get("download", {}).get("release") < self.release
    ):
        self.logger.info(
            "Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
        )
        # Create data folder (idempotent)
        local = os.path.join(self.SRC_ROOT_FOLDER, self.release)
        os.makedirs(local, exist_ok=True)
        # Dump a dummy file, to mark dump as successful and trigger uploader
        release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html"
        self.to_dump.append(
            {
                "remote": release_notes,
                "local": os.path.join(local, "release_notes.html"),
            }
        )

def post_dump(self, *args, **kwargs):
self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
Expand Down
108 changes: 95 additions & 13 deletions src/hub/dataload/sources/umls/umls_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
import glob
import os
import re
import time
import urllib
import zipfile
from collections import defaultdict

import bs4
import requests
from biothings_client import get_client

from .umls_secret import UMLS_API_KEY

try:
from biothings import config

logger = config.logger
except ImportError:
import logging

logger = logging.getLogger(__name__)

CHEM_CLIENT = get_client('chem')


class ParserException(Exception):
    """Raised when UMLS release metadata cannot be located or parsed."""


# list of UMLS semantic types belonging to chemical is based on
# https://www.nlm.nih.gov/research/umls/META3_current_semantic_types.html
UMLS_CHEMICAL_SEMANTIC_TYPES = [
Expand Down Expand Up @@ -121,20 +144,79 @@ def unlist(l):
return l


def get_download_url():
    """Scrape the NLM UMLS page and return an authenticated download URL.

    Finds the latest full metathesaurus release link in the release table
    and wraps it in the UTS download endpoint together with the UMLS API
    key, so it can be fetched without an interactive login.

    Returns:
        str: the ``https://uts-ws.nlm.nih.gov/download?...`` URL.

    Raises:
        ParserException: if the download link cannot be found or parsed.
        requests.HTTPError: if the page request fails.
    """
    res = requests.get(
        "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
    )
    # Raise error if status is not 200
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, "lxml")
    # Get the table of metathesaurus release files
    table = html.find(
        "table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
    )
    rows = table.find_all("tr")
    # The header of the first column should be 'Release'
    assert (
        rows[0].find_all("th")[0].text.strip() == "Release"
    ), "Could not parse url from html table."
    # Pre-bind so the except clause can reference it even when the lookup
    # itself raises (the original code could NameError here, masking the
    # real failure).
    url = None
    try:
        # Get the url from the link
        url = rows[2].find_all("td")[0].a["href"]
        logger.info(f"Found UMLS download url: {url}")
        # Build the authenticated url using the API key.
        # Kept on one line: a newline inside an f-string replacement field
        # is a SyntaxError before Python 3.12 (PEP 701).
        return f"https://uts-ws.nlm.nih.gov/download?url={url}&apiKey={UMLS_API_KEY}"
    except Exception as e:
        raise ParserException(
            f"Can't find or parse url from table field {url}: {e}"
        ) from e


def load_data(data_folder):
mrsat_file = os.path.join(data_folder, 'MRSTY.RRF')
mrconso_file = os.path.join(data_folder, 'MRCONSO.RRF')
if not os.path.exists(mrsat_file):
raise FileNotFoundError(
"""Could not find 'MRSTY.RRF' in {}.
Please download UMLS Metathesaurus files manually and extract to folder.
""".format(data_folder))
if not os.path.exists(mrconso_file):
raise FileNotFoundError(
"""Could not find 'MRCONSO.RRF' in {}.
Please download manually and extract to folder.""".format(data_folder))
chem_umls = fetch_chemical_umls_cuis(mrsat_file)
cui_map, mesh_ids, names = parse_umls(mrconso_file, chem_umls)
try:
metathesaurus_file = glob.glob(
os.path.join(data_folder, "*metathesaurus-release.zip")
)[0]
except IndexError:
url = get_download_url()
# Use re.sub to replace all characters after "apiKey=" with asterisks
pii_url = re.sub(
r"(apiKey=).*",
r"\1" + "*" * len(re.search(r"(apiKey=)(.*)", url).group(2)),
url,
)
logger.info(
"""Could not find metathesaurus archive in {}.
Downloading UMLS Metathesaurus file automatically:
{}
""".format(
data_folder, pii_url
)
)
# Download UMLS file to data folder
urllib.request.urlretrieve(
url, os.path.join(data_folder, "metathesaurus-release.zip")
)
# Get the downloaded file path
metathesaurus_file = glob.glob(
os.path.join(data_folder, "*metathesaurus-release.zip")
)[0]
file_list = zipfile.ZipFile(metathesaurus_file, mode="r").namelist()
logger.info(
"Found the following files in the metathesaurus file: {}".format(
file_list)
)
try:
mrsty_path = [f for f in file_list if f.endswith("MRSTY.RRF")][0]
except IndexError:
raise FileNotFoundError("Could not find MRSTY.RRF in archive.")
try:
mrconso_path = [f for f in file_list if f.endswith("MRCONSO.RRF")][0]
except IndexError:
raise FileNotFoundError("Could not find MRCONSO.RRF in archive.")
chem_umls = fetch_chemical_umls_cuis(mrsty_path)
cui_map, mesh_ids, names = parse_umls(mrconso_path, chem_umls)
name_mapping = query_drug_name(names)
time.sleep(200)
mesh_id_mapping = query_mesh(mesh_ids)
Expand Down

0 comments on commit 5d126b0

Please sign in to comment.