From 8c6f81ce7648cd2db2066e911a2a419359311d60 Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:01:12 -0400 Subject: [PATCH] fixed date, id, and xref parsing --- src/hub/dataload/sources/gsrs/gsrs_parser.py | 62 ++++++++++++++++---- src/hub/dataload/sources/gsrs/gsrs_upload.py | 19 +++--- 2 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/hub/dataload/sources/gsrs/gsrs_parser.py b/src/hub/dataload/sources/gsrs/gsrs_parser.py index 715d225..91632f5 100644 --- a/src/hub/dataload/sources/gsrs/gsrs_parser.py +++ b/src/hub/dataload/sources/gsrs/gsrs_parser.py @@ -1,22 +1,58 @@ import gzip import json +import os from datetime import datetime, timezone -from typing import Tuple +from typing import Any, List from biothings import config -from biothings.utils.dataload import dict_convert, dict_sweep +from biothings.utils.dataload import dict_convert, dict_sweep, dict_traverse logging = config.logger process_key = lambda key: key.replace(" ", "_").lower() +recognized_code_systems = [ + "cas", + "chebi", + "chembl", + "drugbank", + "fdaunii", + "mesh", + "pubchem", +] +date_cols = ("documentDate", "deprecatedDate") -def timestamp_to_date(d: dict, keys: Tuple[str]): - for key in keys: - if key in d.keys(): - date_obj = datetime.fromtimestamp(int(d[key]), tz=timezone.utc) - d.update({key: date_obj.strftime("%Y-%m-%d")}) - return d +def timestamp_to_date(k: str, v: Any): + """ + check if a key-value pair needs date formatting and do so. + NOTE: val is expected to be a UNIX millisecond timestamp for + date-bearing keys + """ + + if k in date_cols: + date_obj = datetime.fromtimestamp(int(v) / 1000.0, tz=timezone.utc) + v = date_obj.strftime("%Y-%m-%d") + return k, v + + +def parse_xrefs(codes: List[str]): + xrefs = {} + for code in codes: + code_system = code["codeSystem"].replace(" ", "").lower() + if code_system in recognized_code_systems: + if code_system == "fdaunii": + code_system = "unii" + + if code_system == "pubchem": + if "url" not in code.keys(): # cannot determine cid or sid + continue + if "compound" in code["url"]: + code_system += "_cid" + elif "substance" in code["url"]: + code_system += "_sid" + + xrefs[code_system] = code["code"] + return xrefs def load_substances(file_name: str): @@ -25,8 +61,14 @@ def load_substances(file_name: str): for raw_line in fd: record = json.loads(raw_line.decode("utf-8").strip()) record = dict_convert(record, keyfn=process_key) + if "codes" in record.keys(): + record["xrefs"] = parse_xrefs(record["codes"]) + + # parse dates in `date_cols` only + dict_traverse(record, timestamp_to_date, traverse_list=True) record = dict_sweep(record, vals=["", None], remove_invalid_list=True) - record = timestamp_to_date(record, ("documentDate", "deprecatedDate")) - _id = record.pop("uuid") + _id = f"gsrs.uuid:{record['uuid']}" + if "approvalid" in record.keys(): + _id = f"unii:{record['approvalid']}" yield {"_id": _id, "gsrs": record} diff --git a/src/hub/dataload/sources/gsrs/gsrs_upload.py b/src/hub/dataload/sources/gsrs/gsrs_upload.py index 8968408..e91c83a 100644 --- a/src/hub/dataload/sources/gsrs/gsrs_upload.py +++ b/src/hub/dataload/sources/gsrs/gsrs_upload.py @@ -173,7 +173,10 @@ def get_mapping(cls): "type": "keyword", }, "nameOrg": {"type": "text"}, - "deprecatedDate": {"type": "date"}, + "deprecatedDate": { + "type": "date", + "format": "yyyy-MM-dd", + }, } }, "nameJurisdiction": { @@ -457,7 +460,7 @@ def get_mapping(cls): }, "publicDomain": {"type": "boolean"}, "tags": {"type": "text"}, - "documentDate": {"type": "date"}, + "documentDate": {"type": "date", "format": "yyyy-MM-dd"}, "uploadedFile": { "normalizer": "keyword_lowercase_normalizer", "type": "keyword", @@ -672,10 +675,7 @@ def get_mapping(cls): "normalizer": "keyword_lowercase_normalizer", "type": "keyword", }, - "smiles": { - "normalizer": "keyword_lowercase_normalizer", - "type": "keyword", - }, + "smiles": {"type": "keyword"}, "formula": { "normalizer": "keyword_lowercase_normalizer", "type": "keyword", @@ -721,10 +721,7 @@ def get_mapping(cls): "normalizer": "keyword_lowercase_normalizer", "type": "keyword", }, - "smiles": { - "normalizer": "keyword_lowercase_normalizer", - "type": "keyword", - }, + "smiles": {"type": "keyword"}, "formula": { "normalizer": "keyword_lowercase_normalizer", "type": "keyword", @@ -922,7 +919,6 @@ def get_mapping(cls): }, "molfile": {"type": "text"}, "smiles": { - "normalizer": "keyword_lowercase_normalizer", "type": "keyword", }, "formula": { @@ -962,7 +958,6 @@ def get_mapping(cls): }, "molfile": {"type": "text"}, "smiles": { - "normalizer": "keyword_lowercase_normalizer", "type": "keyword", }, "formula": {