Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/biothings/mychem.info int…
Browse files Browse the repository at this point in the history
…o update_gtopdb
  • Loading branch information
DylanWelzel committed Jul 12, 2024
2 parents 007d21d + f2c885c commit 94f7f15
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 53 deletions.
62 changes: 52 additions & 10 deletions src/hub/dataload/sources/gsrs/gsrs_parser.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,58 @@
import gzip
import json
import os
from datetime import datetime, timezone
from typing import Tuple
from typing import Any, List

from biothings import config
from biothings.utils.dataload import dict_convert, dict_sweep
from biothings.utils.dataload import dict_convert, dict_sweep, dict_traverse

logging = config.logger

def process_key(key):
    """Normalize a key: replace spaces with underscores and lowercase it."""
    return key.replace(" ", "_").lower()


# Code systems that parse_xrefs copies into a record's xrefs. Entries are
# compared against the source "codeSystem" value after spaces are removed
# and the text is lowercased.
recognized_code_systems = [
    "cas",
    "chebi",
    "chembl",
    "drugbank",
    "fdaunii",
    "mesh",
    "pubchem",
]
# Keys whose values timestamp_to_date rewrites as "YYYY-MM-DD" strings.
date_cols = ("documentDate", "deprecatedDate")


def timestamp_to_date(d: dict, keys: Tuple[str, ...]):
    """
    Rewrite UNIX-timestamp values of `d` as UTC "YYYY-MM-DD" strings, in place.

    :param d: dictionary to update
    :param keys: keys whose values are converted when present in `d`
    :return: the same dictionary object `d`

    Values are handed to ``datetime.fromtimestamp`` unscaled, i.e. they are
    read as seconds since the epoch.
    """
    for key in keys:
        value = d.get(key)
        if value is None or value == "":
            # absent or empty: nothing to convert, and int() would raise
            continue
        date_obj = datetime.fromtimestamp(int(value), tz=timezone.utc)
        d[key] = date_obj.strftime("%Y-%m-%d")
    return d
def timestamp_to_date(k: str, v: Any):
    """
    Check if a key-value pair needs date formatting and do so.

    When `k` is listed in `date_cols`, `v` is converted to a UTC
    "YYYY-MM-DD" string; every other pair is returned unchanged.

    NOTE: `v` is expected to be a UNIX millisecond timestamp for
    date-bearing keys

    :param k: key of the pair
    :param v: value of the pair
    :return: the ``(key, value)`` pair, with the value possibly reformatted
    """
    # load_substances applies this before dict_sweep strips "" / None values,
    # so empty values can still arrive here; int() would raise on them.
    if v is None or v == "":
        return k, v

    if k in date_cols:
        date_obj = datetime.fromtimestamp(int(v) / 1000.0, tz=timezone.utc)
        v = date_obj.strftime("%Y-%m-%d")
    return k, v


def parse_xrefs(codes: List[dict]):
    """
    Build a cross-reference dictionary from a list of code entries.

    Each entry is a dict read through its "codeSystem", "code" and
    (for PubChem) "url" keys. Only systems listed in
    `recognized_code_systems` are kept; "fdaunii" is stored as "unii", and
    "pubchem" is stored as "pubchem_cid" or "pubchem_sid" depending on
    whether the url mentions "compound" or "substance".

    :param codes: list of code-entry dictionaries
    :return: dict mapping code system name to code value; when several
        entries share a system, the last one wins
    """
    xrefs = {}
    for code in codes:
        raw_system = code.get("codeSystem")
        if not raw_system or "code" not in code:
            # incomplete entry: nothing usable to record
            continue

        code_system = raw_system.replace(" ", "").lower()
        if code_system not in recognized_code_systems:
            continue

        if code_system == "fdaunii":
            code_system = "unii"
        elif code_system == "pubchem":
            url = code.get("url")
            if url is None:  # cannot determine cid or sid
                continue
            if "compound" in url:
                code_system += "_cid"
            elif "substance" in url:
                code_system += "_sid"

        xrefs[code_system] = code["code"]
    return xrefs


def load_substances(file_name: str):
Expand All @@ -25,8 +61,14 @@ def load_substances(file_name: str):
for raw_line in fd:
record = json.loads(raw_line.decode("utf-8").strip())
record = dict_convert(record, keyfn=process_key)
if "codes" in record.keys():
record["xrefs"] = parse_xrefs(record["codes"])

# parse dates in `date_cols` only
dict_traverse(record, timestamp_to_date, traverse_list=True)
record = dict_sweep(record, vals=["", None], remove_invalid_list=True)
record = timestamp_to_date(record, ("documentDate", "deprecatedDate"))

_id = record.pop("uuid")
_id = record['uuid']
if "approvalid" in record.keys():
_id = record['approvalid']
yield {"_id": _id, "gsrs": record}
28 changes: 14 additions & 14 deletions src/hub/dataload/sources/gsrs/gsrs_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import biothings.hub.dataload.storage as storage

from hub.dataload.uploader import BaseDrugUploader
from hub.datatransform.keylookup import MyChemKeyLookup

from .gsrs_parser import load_substances

Expand All @@ -24,13 +25,17 @@ class GSRSUploader(BaseDrugUploader):

name = "gsrs"
__metadata__ = {"src_meta": SRC_META}
storage_class = storage.RootKeyMergerStorage
keylookup = MyChemKeyLookup(
[('smiles', 'gsrs.smiles')])

def load_data(self, data_folder):
"""load_data method"""
self.logger.info("Load data from '%s'" % data_folder)
input_file = os.path.join(data_folder, "dump-public-2023-12-14.gsrs")
assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
return load_substances(input_file)
assert os.path.exists(
input_file), "Can't find input file '%s'" % input_file
return self.keylookup(load_substances)(input_file)

@classmethod
def get_mapping(cls):
Expand Down Expand Up @@ -173,7 +178,10 @@ def get_mapping(cls):
"type": "keyword",
},
"nameOrg": {"type": "text"},
"deprecatedDate": {"type": "date"},
"deprecatedDate": {
"type": "date",
"format": "yyyy-MM-dd",
},
}
},
"nameJurisdiction": {
Expand Down Expand Up @@ -457,7 +465,7 @@ def get_mapping(cls):
},
"publicDomain": {"type": "boolean"},
"tags": {"type": "text"},
"documentDate": {"type": "date"},
"documentDate": {"type": "date", "format": "yyyy-MM-dd"},
"uploadedFile": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -672,10 +680,7 @@ def get_mapping(cls):
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {"type": "keyword"},
"formula": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -721,10 +726,7 @@ def get_mapping(cls):
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"smiles": {"type": "keyword"},
"formula": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
Expand Down Expand Up @@ -922,7 +924,6 @@ def get_mapping(cls):
},
"molfile": {"type": "text"},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"formula": {
Expand Down Expand Up @@ -962,7 +963,6 @@ def get_mapping(cls):
},
"molfile": {"type": "text"},
"smiles": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword",
},
"formula": {
Expand Down
48 changes: 19 additions & 29 deletions src/hub/datatransform/keylookup.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
import networkx as nx
from biothings.hub.datatransform import (
CIMongoDBEdge,
DataTransformMDB,
MongoDBEdge,
)
from biothings.hub.datatransform import CIMongoDBEdge, DataTransformMDB, MongoDBEdge

graph_mychem = nx.DiGraph()

###############################################################################
# PharmGKB Nodes and Edges
###############################################################################
graph_mychem.add_node("inchi")
graph_mychem.add_node("chebi")
graph_mychem.add_node("cas")
graph_mychem.add_node("chembl")
graph_mychem.add_node("drugbank")
graph_mychem.add_node("drugcentral")
graph_mychem.add_node("drugname")
graph_mychem.add_node("inchi")
graph_mychem.add_node("inchikey")
graph_mychem.add_node("ndc")
graph_mychem.add_node("pharmgkb")
graph_mychem.add_node("pubchem")
graph_mychem.add_node("rxnorm")
graph_mychem.add_node("smiles")
graph_mychem.add_node("unii")
graph_mychem.add_node("inchikey")
graph_mychem.add_node("pharmgkb")

graph_mychem.add_edge(
"inchi",
Expand Down Expand Up @@ -48,7 +48,8 @@
graph_mychem.add_edge(
"chembl",
"inchikey",
object=MongoDBEdge("chembl", "chembl.molecule_chembl_id", "chembl.inchi_key"),
object=MongoDBEdge("chembl", "chembl.molecule_chembl_id",
"chembl.inchi_key"),
weight=1.0,
)

Expand Down Expand Up @@ -86,7 +87,6 @@
###############################################################################
# ndc -> drugbank -> inchikey
# shortcut edge, one lookup for ndc to inchikey by way of drugbank
graph_mychem.add_node("ndc")

graph_mychem.add_edge(
"ndc",
Expand All @@ -102,7 +102,6 @@
###############################################################################
# chebi -> drugbank -> inchikey
# chebi -> chembl -> inchikey
graph_mychem.add_node("chebi")
graph_mychem.add_edge(
"chebi",
"inchikey",
Expand All @@ -118,7 +117,8 @@
graph_mychem.add_edge(
"chebi",
"chembl",
object=MongoDBEdge("chembl", "chembl.chebi_par_id", "chembl.molecule_chembl_id"),
object=MongoDBEdge("chembl", "chembl.chebi_par_id",
"chembl.molecule_chembl_id"),
weight=1.0,
)

Expand Down Expand Up @@ -156,45 +156,33 @@
)

###############################################################################
# GSRS Nodes and Edges
###############################################################################
# Adding gsrs -> inchikey
graph_mychem.add_node("gsrs")
graph_mychem.add_edge(
"gsrs",
"inchikey",
object=MongoDBEdge("gsrs", "gsrs.smiles", "gsrs.inchikey"),
weight=1.3,
)

###############################################################################
# Adding edges for other SMILES sources
# Edges for SMILES sources
###############################################################################
# chebi.smiles -> inchikey
graph_mychem.add_edge(
"chebi",
"smiles",
"inchikey",
object=MongoDBEdge("chebi", "chebi.smiles", "chebi.inchikey"),
)

# chembl.smiles -> inchikey
graph_mychem.add_edge(
"chembl",
"smiles",
"inchikey",
object=MongoDBEdge("chembl", "chembl.smiles", "chembl.inchikey"),
)

# drugcentral.structures.smiles -> inchikey
graph_mychem.add_edge(
"drugcentral",
"smiles",
"inchikey",
object=MongoDBEdge("drugcentral", "drugcentral.structures.smiles",
"drugcentral.structures.inchikey"),
)

# unii.smiles -> inchikey
graph_mychem.add_edge(
"unii",
"smiles",
"inchikey",
object=MongoDBEdge("unii", "unii.smiles", "unii.inchikey"),
)
Expand All @@ -213,6 +201,7 @@ def __init__(self, input_types, *args, **kwargs):
"chebi",
"chembl",
"pubchem",
'cas',
"drugname",
],
id_priority_list=[
Expand All @@ -223,6 +212,7 @@ def __init__(self, input_types, *args, **kwargs):
"chebi",
"chembl",
"pubchem",
'cas',
"drugname",
],
# skip keylookup for InchiKeys
Expand Down

0 comments on commit 94f7f15

Please sign in to comment.