Skip to content

Commit

Permalink
fixed date, id, and xref parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
NeuralFlux committed Jun 28, 2024
1 parent c788513 commit 8c6f81c
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 22 deletions.
62 changes: 52 additions & 10 deletions src/hub/dataload/sources/gsrs/gsrs_parser.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,58 @@
import gzip
import json
import os
from datetime import datetime, timezone
from typing import Tuple
from typing import Any, List

from biothings import config
from biothings.utils.dataload import dict_convert, dict_sweep
from biothings.utils.dataload import dict_convert, dict_sweep, dict_traverse

# Logger object supplied by the biothings config.
# NOTE: the name `logging` would shadow the stdlib module of the same name
# if that module were ever imported in this file.
logging = config.logger

def process_key(key):
    """Normalise a record key: spaces become underscores, text is lower-cased."""
    return key.replace(" ", "_").lower()


# Code-system names (spaces removed, lower-cased) that parse_xrefs keeps
# when building the xrefs mapping.
recognized_code_systems = [
    "cas",
    "chebi",
    "chembl",
    "drugbank",
    "fdaunii",
    "mesh",
    "pubchem",
]
# Keys whose values timestamp_to_date rewrites as formatted dates.
date_cols = ("documentDate", "deprecatedDate")


def timestamp_to_date(d: dict, keys: Tuple[str]):
    """Rewrite timestamp values of ``d`` as ``YYYY-MM-DD`` strings (UTC).

    Only entries whose key appears in ``keys`` are touched; keys absent from
    ``d`` are ignored. The value is passed through ``int()`` and handed to
    ``datetime.fromtimestamp``. ``d`` is modified in place and also returned.
    """
    for name in keys:
        if name not in d:
            continue
        moment = datetime.fromtimestamp(int(d[name]), tz=timezone.utc)
        d[name] = moment.strftime("%Y-%m-%d")
    return d
def timestamp_to_date(k: str, v: Any):
    """Format the value of a date-bearing key as a ``YYYY-MM-DD`` string (UTC).

    Keys that are not listed in ``date_cols`` are passed through untouched.

    NOTE: for date-bearing keys ``v`` is expected to be a UNIX timestamp in
    milliseconds (anything ``int()`` accepts).

    Returns:
        The ``(k, v)`` pair, with ``v`` replaced by the formatted date when
        ``k`` is a date-bearing key that actually holds a value.
    """
    # A missing date (None / empty string) is returned as-is rather than
    # crashing int(); the caller can sweep such values out afterwards.
    if k not in date_cols or v is None or v == "":
        return k, v

    # Milliseconds -> seconds before handing the value to fromtimestamp.
    date_obj = datetime.fromtimestamp(int(v) / 1000.0, tz=timezone.utc)
    return k, date_obj.strftime("%Y-%m-%d")


def parse_xrefs(codes: List[dict]):
    """Build a cross-reference mapping from a list of code entries.

    Each entry is a dict read for its ``codeSystem`` and ``code`` keys, plus
    ``url`` for PubChem entries. The code system is normalised by removing
    spaces and lower-casing; entries whose system is not in
    ``recognized_code_systems`` are ignored.

    Key naming:
      * ``fdaunii`` is stored under ``unii``;
      * ``pubchem`` is stored under ``pubchem_cid`` when the url contains
        "compound" and ``pubchem_sid`` when it contains "substance";
      * every other recognized system is stored under its normalised name.

    Returns:
        dict mapping xref key -> code. When several entries resolve to the
        same key, the last one wins.
    """
    xrefs = {}
    for code in codes:
        code_system = code["codeSystem"].replace(" ", "").lower()
        if code_system not in recognized_code_systems:
            continue

        if code_system == "fdaunii":
            code_system = "unii"
        elif code_system == "pubchem":
            # A missing *or null* url gives no way to tell a cid from a sid.
            url = code.get("url")
            if url is None:
                continue
            if "compound" in url:
                code_system += "_cid"
            elif "substance" in url:
                code_system += "_sid"
            # A url with neither marker keeps the plain "pubchem" key.

        xrefs[code_system] = code["code"]
    return xrefs


def load_substances(file_name: str):
Expand All @@ -25,8 +61,14 @@ def load_substances(file_name: str):
for raw_line in fd:
record = json.loads(raw_line.decode("utf-8").strip())
record = dict_convert(record, keyfn=process_key)
if "codes" in record.keys():
record["xrefs"] = parse_xrefs(record["codes"])

# parse dates in `date_cols` only
dict_traverse(record, timestamp_to_date, traverse_list=True)
record = dict_sweep(record, vals=["", None], remove_invalid_list=True)
record = timestamp_to_date(record, ("documentDate", "deprecatedDate"))

_id = record.pop("uuid")
_id = f"gsrs.uuid:{record['uuid']}"
if "approvalid" in record.keys():
_id = f"unii:{record['approvalid']}"
yield {"_id": _id, "gsrs": record}
19 changes: 7 additions & 12 deletions src/hub/dataload/sources/gsrs/gsrs_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ def get_mapping(cls):
"type": "keyword",
},
"nameOrg": {"type": "text"},
"deprecatedDate": {"type": "date"},
"deprecatedDate": {
"type": "date",
"format": "yyyy-MM-dd",
},
}
},
"nameJurisdiction": {
Expand Down Expand Up @@ -457,7 +460,7 @@ def get_mapping(cls):
},
"publicDomain": {"type": "boolean"},
"tags": {"type": "text"},
"documentDate": {"type": "date"},
"documentDate": {"type": "date", "format": "yyyy-MM-dd"},
"uploadedFile": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -672,10 +675,7 @@ def get_mapping(cls):
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {"type": "keyword"},
"formula": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -721,10 +721,7 @@ def get_mapping(cls):
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {"type": "keyword"},
"formula": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -922,7 +919,6 @@ def get_mapping(cls):
},
"molfile": {"type": "text"},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"formula": {
Expand Down Expand Up @@ -962,7 +958,6 @@ def get_mapping(cls):
},
"molfile": {"type": "text"},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"formula": {
Expand Down

0 comments on commit 8c6f81c

Please sign in to comment.