fixed date, id, and xref parsing

biothings · Jun 28, 2024 · 8c6f81c · 8c6f81c
1 parent c788513
commit 8c6f81c
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 22 deletions.
diff --git a/src/hub/dataload/sources/gsrs/gsrs_parser.py b/src/hub/dataload/sources/gsrs/gsrs_parser.py
@@ -1,22 +1,58 @@
 import gzip
 import json
+import os
 from datetime import datetime, timezone
-from typing import Tuple
+from typing import Any, List
 
 from biothings import config
-from biothings.utils.dataload import dict_convert, dict_sweep
+from biothings.utils.dataload import dict_convert, dict_sweep, dict_traverse
 
 logging = config.logger
 
 process_key = lambda key: key.replace(" ", "_").lower()
+recognized_code_systems = [  # matched against the space-stripped, lowercased "codeSystem"
+    "cas",
+    "chebi",
+    "chembl",
+    "drugbank",
+    "fdaunii",  # stored under the "unii" key by parse_xrefs
+    "mesh",
+    "pubchem",  # stored as "pubchem_cid" / "pubchem_sid" depending on the entry URL
+]
+date_cols = ("documentDate", "deprecatedDate")  # keys whose values timestamp_to_date reformats
 
 
-def timestamp_to_date(d: dict, keys: Tuple[str]):
-    for key in keys:
-        if key in d.keys():
-            date_obj = datetime.fromtimestamp(int(d[key]), tz=timezone.utc)
-            d.update({key: date_obj.strftime("%Y-%m-%d")})
-    return d
+def timestamp_to_date(k: str, v: Any):
+    """
+    check if a key-value pair needs date formatting and do so.
+
+    Takes a single key-value pair and returns the (possibly updated)
+    pair, so it can be passed to `dict_traverse` as the callback.
+
+    NOTE: v is expected to be a UNIX millisecond timestamp for
+    date-bearing keys (those listed in `date_cols`); it is converted to
+    a UTC "YYYY-MM-DD" string. Values of all other keys are returned
+    untouched.
+
+    Empty values ("" or None) under a date-bearing key are returned
+    as-is instead of raising from int(): the traversal runs before
+    `dict_sweep`, which is what removes such values from the record.
+    Any other non-numeric value still raises (ValueError / TypeError).
+    """
+
+    if k in date_cols and v not in ("", None):
+        # timestamps are in milliseconds; fromtimestamp expects seconds
+        date_obj = datetime.fromtimestamp(int(v) / 1000.0, tz=timezone.utc)
+        v = date_obj.strftime("%Y-%m-%d")
+    return k, v
+
+
+def parse_xrefs(codes: List[dict]):
+    """
+    build a cross-reference dict from a list of code entries.
+
+    Each entry is a dict read through its "codeSystem" and "code" keys
+    (plus "url" for PubChem entries). The code system name is normalized
+    (spaces removed, lowercased) and the entry is kept only when that
+    name is in `recognized_code_systems`:
+
+    - "fdaunii" is stored under the key "unii"
+    - "pubchem" is stored under "pubchem_cid" or "pubchem_sid" depending
+      on whether the entry URL contains "compound" or "substance";
+      entries where this cannot be determined are skipped
+
+    When several entries resolve to the same key, the last one wins.
+
+    Returns a dict mapping xref key -> code value.
+    Raises KeyError if an entry lacks "codeSystem", or if a kept entry
+    lacks "code".
+    """
+    xrefs = {}
+    for code in codes:
+        code_system = code["codeSystem"].replace(" ", "").lower()
+        if code_system not in recognized_code_systems:
+            continue
+
+        if code_system == "fdaunii":
+            code_system = "unii"
+        elif code_system == "pubchem":
+            # the URL is the only hint for telling compound IDs (cid)
+            # from substance IDs (sid); skip the entry when the URL is
+            # missing or names neither, rather than emitting an
+            # ambiguous bare "pubchem" key
+            url = code.get("url") or ""
+            if "compound" in url:
+                code_system = "pubchem_cid"
+            elif "substance" in url:
+                code_system = "pubchem_sid"
+            else:
+                continue
+
+        xrefs[code_system] = code["code"]
+    return xrefs
 
 
 def load_substances(file_name: str):
@@ -25,8 +61,14 @@ def load_substances(file_name: str):
         for raw_line in fd:
             record = json.loads(raw_line.decode("utf-8").strip())
             record = dict_convert(record, keyfn=process_key)
+            if "codes" in record.keys():
+                record["xrefs"] = parse_xrefs(record["codes"])
+
+            # parse dates in `date_cols` only
+            dict_traverse(record, timestamp_to_date, traverse_list=True)
             record = dict_sweep(record, vals=["", None], remove_invalid_list=True)
-            record = timestamp_to_date(record, ("documentDate", "deprecatedDate"))
 
-            _id = record.pop("uuid")
+            _id = f"gsrs.uuid:{record['uuid']}"
+            if "approvalid" in record.keys():
+                _id = f"unii:{record['approvalid']}"
             yield {"_id": _id, "gsrs": record}
diff --git a/src/hub/dataload/sources/gsrs/gsrs_upload.py b/src/hub/dataload/sources/gsrs/gsrs_upload.py
@@ -173,7 +173,10 @@ def get_mapping(cls):
                                         "type": "keyword",
                                     },
                                     "nameOrg": {"type": "text"},
-                                    "deprecatedDate": {"type": "date"},
+                                    "deprecatedDate": {
+                                        "type": "date",
+                                        "format": "yyyy-MM-dd",
+                                    },
                                 }
                             },
                             "nameJurisdiction": {
@@ -457,7 +460,7 @@ def get_mapping(cls):
                             },
                             "publicDomain": {"type": "boolean"},
                             "tags": {"type": "text"},
-                            "documentDate": {"type": "date"},
+                            "documentDate": {"type": "date", "format": "yyyy-MM-dd"},
                             "uploadedFile": {
                                 "normalizer": "keyword_lowercase_normalizer",
                                 "type": "keyword",
@@ -672,10 +675,7 @@ def get_mapping(cls):
                                 "normalizer": "keyword_lowercase_normalizer",
                                 "type": "keyword",
                             },
-                            "smiles": {
-                                "normalizer": "keyword_lowercase_normalizer",
-                                "type": "keyword",
-                            },
+                            "smiles": {"type": "keyword"},
                             "formula": {
                                 "normalizer": "keyword_lowercase_normalizer",
                                 "type": "keyword",
@@ -721,10 +721,7 @@ def get_mapping(cls):
                                 "normalizer": "keyword_lowercase_normalizer",
                                 "type": "keyword",
                             },
-                            "smiles": {
-                                "normalizer": "keyword_lowercase_normalizer",
-                                "type": "keyword",
-                            },
+                            "smiles": {"type": "keyword"},
                             "formula": {
                                 "normalizer": "keyword_lowercase_normalizer",
                                 "type": "keyword",
@@ -922,7 +919,6 @@ def get_mapping(cls):
                                     },
                                     "molfile": {"type": "text"},
                                     "smiles": {
-                                        "normalizer": "keyword_lowercase_normalizer",
                                         "type": "keyword",
                                     },
                                     "formula": {
@@ -962,7 +958,6 @@ def get_mapping(cls):
                                     },
                                     "molfile": {"type": "text"},
                                     "smiles": {
-                                        "normalizer": "keyword_lowercase_normalizer",
                                         "type": "keyword",
                                     },
                                     "formula": {