Skip to content

Commit

Permalink
added auto download to umls
Browse files Browse the repository at this point in the history
  • Loading branch information
DylanWelzel committed Apr 18, 2024
1 parent 7ad851f commit 5d126b0
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 23 deletions.
42 changes: 32 additions & 10 deletions src/hub/dataload/sources/umls/umls_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,52 @@ def get_latest_release(self):
res = self.client.get(self.__class__.HOMEPAGE_URL)
# Raise error if status is not 200
res.raise_for_status()
html = bs4.BeautifulSoup(res.text, 'lxml')
html = bs4.BeautifulSoup(res.text, "lxml")
# Get the table of metathesaurus release files
table = html.find("table", attrs={"class": "usa-table margin-bottom-4"})
rows = table.find_all('tr')
table = html.find(
"table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
)
rows = table.find_all("tr")
# The header of the fifth column should be 'Date'
assert rows[0].find_all('th')[4].text.strip() == 'Date', "Could not parse version from html table."
version = rows[1].find_all('td')[4].text
assert (
rows[0].find_all("th")[4].text.strip() == "Date"
), "Could not parse version from html table."
version = rows[1].find_all("td")[4].text
try:
latest = datetime.date.strftime(dtparser.parse(version), "%Y-%m-%d")
latest = datetime.date.strftime(
dtparser.parse(version), "%Y-%m-%d")
return latest
except Exception as e:
raise DumperException("Can't find or parse date from table field {}: {}" % (version, e))
raise DumperException(
"Can't find or parse date from table field {}: {}" % (
version, e)
)

def create_todump_list(self, force=True):
    """Decide whether a new dump is needed and register the files to fetch.

    Sets ``self.release`` to the latest release date scraped from the NLM
    site, and when a dump is warranted, creates the versioned data folder
    and queues a single dummy file (the release notes page) so the dump is
    marked successful and the uploader is triggered.

    Args:
        force: when True (default), always re-dump regardless of the
            previously recorded release.
    """
    self.release = self.get_latest_release()
    # Re-dump when forced, when no previous dump is recorded, or when the
    # recorded release is older than the latest one on the NLM site.
    # NOTE: `not self.src_doc` already short-circuits, so no extra
    # truthiness guard is needed before the .get() chain.
    if (
        force
        or not self.src_doc
        or self.src_doc.get("download", {}).get("release") < self.release
    ):
        self.logger.info(
            "Manually download from: https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
        )
        # Create data folder (idempotent)
        local = os.path.join(self.SRC_ROOT_FOLDER, self.release)
        os.makedirs(local, exist_ok=True)
        # Dump a dummy file, to mark dump as successful and trigger uploader
        release_notes = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/notes.html"
        self.to_dump.append(
            {
                "remote": release_notes,
                "local": os.path.join(local, "release_notes.html"),
            }
        )

def post_dump(self, *args, **kwargs):
self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
Expand Down
108 changes: 95 additions & 13 deletions src/hub/dataload/sources/umls/umls_parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
import glob
import os
import re
import time
import urllib
import zipfile
from collections import defaultdict

import bs4
import requests
from biothings_client import get_client

from .umls_secret import UMLS_API_KEY

try:
from biothings import config

logger = config.logger
except ImportError:
import logging

logger = logging.getLogger(__name__)

CHEM_CLIENT = get_client('chem')


class ParserException(Exception):
    """Raised when UMLS release metadata cannot be located or parsed."""


# list of UMLS semantic types belonging to chemical is based on
# https://www.nlm.nih.gov/research/umls/META3_current_semantic_types.html
UMLS_CHEMICAL_SEMANTIC_TYPES = [
Expand Down Expand Up @@ -121,20 +144,79 @@ def unlist(l):
return l


def get_download_url():
    """Scrape the NLM UMLS page and return an authenticated download URL.

    Finds the latest full metathesaurus release link in the release table
    and wraps it in the UTS download endpoint together with the UMLS API
    key, so it can be fetched without an interactive login.

    Returns:
        str: the ``https://uts-ws.nlm.nih.gov/download?...`` URL.

    Raises:
        ParserException: if the download link cannot be found or parsed.
        requests.HTTPError: if the page request fails.
    """
    res = requests.get(
        "https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html"
    )
    # Raise error if status is not 200
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, "lxml")
    # Get the table of metathesaurus release files
    table = html.find(
        "table", attrs={"class": "usa-table border-base-lighter margin-bottom-4"}
    )
    rows = table.find_all("tr")
    # The header of the first column should be 'Release'
    assert (
        rows[0].find_all("th")[0].text.strip() == "Release"
    ), "Could not parse url from html table."
    # Pre-bind so the except clause can reference it even when the lookup
    # itself raises (the original code could NameError here, masking the
    # real failure).
    url = None
    try:
        # Get the url from the link
        url = rows[2].find_all("td")[0].a["href"]
        logger.info(f"Found UMLS download url: {url}")
        # Build the authenticated url using the API key.
        # Kept on one line: a newline inside an f-string replacement field
        # is a SyntaxError before Python 3.12 (PEP 701).
        return f"https://uts-ws.nlm.nih.gov/download?url={url}&apiKey={UMLS_API_KEY}"
    except Exception as e:
        raise ParserException(
            f"Can't find or parse url from table field {url}: {e}"
        ) from e


def load_data(data_folder):
mrsat_file = os.path.join(data_folder, 'MRSTY.RRF')
mrconso_file = os.path.join(data_folder, 'MRCONSO.RRF')
if not os.path.exists(mrsat_file):
raise FileNotFoundError(
"""Could not find 'MRSTY.RRF' in {}.
Please download UMLS Metathesaurus files manually and extract to folder.
""".format(data_folder))
if not os.path.exists(mrconso_file):
raise FileNotFoundError(
"""Could not find 'MRCONSO.RRF' in {}.
Please download manually and extract to folder.""".format(data_folder))
chem_umls = fetch_chemical_umls_cuis(mrsat_file)
cui_map, mesh_ids, names = parse_umls(mrconso_file, chem_umls)
try:
metathesaurus_file = glob.glob(
os.path.join(data_folder, "*metathesaurus-release.zip")
)[0]
except IndexError:
url = get_download_url()
# Use re.sub to replace all characters after "apiKey=" with asterisks
pii_url = re.sub(
r"(apiKey=).*",
r"\1" + "*" * len(re.search(r"(apiKey=)(.*)", url).group(2)),
url,
)
logger.info(
"""Could not find metathesaurus archive in {}.
Downloading UMLS Metathesaurus file automatically:
{}
""".format(
data_folder, pii_url
)
)
# Download UMLS file to data folder
urllib.request.urlretrieve(
url, os.path.join(data_folder, "metathesaurus-release.zip")
)
# Get the downloaded file path
metathesaurus_file = glob.glob(
os.path.join(data_folder, "*metathesaurus-release.zip")
)[0]
file_list = zipfile.ZipFile(metathesaurus_file, mode="r").namelist()
logger.info(
"Found the following files in the metathesaurus file: {}".format(
file_list)
)
try:
mrsty_path = [f for f in file_list if f.endswith("MRSTY.RRF")][0]
except IndexError:
raise FileNotFoundError("Could not find MRSTY.RRF in archive.")
try:
mrconso_path = [f for f in file_list if f.endswith("MRCONSO.RRF")][0]
except IndexError:
raise FileNotFoundError("Could not find MRCONSO.RRF in archive.")
chem_umls = fetch_chemical_umls_cuis(mrsty_path)
cui_map, mesh_ids, names = parse_umls(mrconso_path, chem_umls)
name_mapping = query_drug_name(names)
time.sleep(200)
mesh_id_mapping = query_mesh(mesh_ids)
Expand Down

0 comments on commit 5d126b0

Please sign in to comment.