-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added basic plugin for new source gtopdb
- Loading branch information
1 parent
e6fd7c3
commit 39e8284
Showing
2 changed files
with
139 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# BioThings data-plugin manifest for the guidetopharmacology.org source.
version: '0.2'
# Pinned Python packages listed as requirements of this plugin.
requires:
  - "numpy==1.26.4"
  - "pandas==2.2.2"
  - "lxml==5.2.2"
__metadata__:
  license_url: https://www.guidetopharmacology.org/download.jsp
  licence: CC BY-SA 4.0
  url: https://www.guidetopharmacology.org
dumper:
  # Files to download; the parser looks these up by file name in the data folder.
  data_url:
    - https://www.guidetopharmacology.org/DATA/approved_drug_detailed_interactions.csv
    - https://www.guidetopharmacology.org/DATA/ligands.csv
  uncompress: false
uploader:
  # NOTE(review): presumably "<module>:<function>", i.e. load_ligands in parser.py -- confirm.
  parser: parser:load_ligands
  on_duplicates: ignore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import os | ||
|
||
import lxml.html | ||
import numpy as np | ||
import pandas as pd | ||
from biothings import config | ||
from biothings.utils.dataload import dict_convert, dict_sweep | ||
|
||
# Module-level logger taken from the BioThings ``config`` object.
# NOTE(review): this name matches the stdlib ``logging`` module, which is not
# imported in the visible part of this file -- confirm the reuse is intended.
logging = config.logger | ||
|
||
# Literal strings that ``process_val`` converts to booleans.
VAL_MAP = {"yes": True, "no": False}


def process_key(key):
    """Normalise a field name: spaces become underscores, then lower-case.

    Example: ``"PubChem CID"`` -> ``"pubchem_cid"``.
    """
    return key.replace(" ", "_").lower()


def process_val(val):
    """Map the strings in ``VAL_MAP`` to booleans; return anything else as-is.

    The ``isinstance`` guard keeps unhashable values (e.g. lists) away from
    the dictionary lookup.
    """
    if isinstance(val, str) and val in VAL_MAP:
        return VAL_MAP[val]
    return val


def remove_tags(val):
    """Return the text content of a marked-up string, dropping its tags.

    Strings are parsed with ``lxml.html``; non-string values are returned
    unchanged.
    """
    if isinstance(val, str):
        return lxml.html.document_fromstring(val).text_content()
    return val


# Column renames applied to the interactions table right after it is read.
intrs_rename_dict = {
    "Target Ensembl Gene ID": "Ensembl Gene",
    "Target Entrez Gene ID": "Entrez Gene",
    "Target Gene Name": "Symbol",
    "Target Species": "Species",
}
|
||
|
||
def preprocess_ligands(d: dict):
    """Convert key names, remove empty vals and XML tags, and determine _id.

    The ``_id`` already present in ``d`` (if any) is kept unless a better
    identifier is available. Preference order: InChIKey, PubChem CID,
    PubChem SID. Each is used only when its ``*_dup`` flag is falsy.

    Args:
        d (dict): ligand properties; may carry the helper flags
            ``inchikey_dup``, ``cid_dup`` and ``sid_dup``. A missing flag is
            treated as "not a duplicate".
    Returns:
        dict: processed ligand properties with the helper flags removed
    """
    # "Synonyms" is a pipe-separated string when present.
    synonyms = d.get("Synonyms")
    if isinstance(synonyms, str):
        d["Synonyms"] = synonyms.split("|")

    d = dict_sweep(d, vals=["", np.nan], remove_invalid_list=True)
    d = dict_convert(d, keyfn=process_key)
    d = dict_convert(d, valuefn=process_val)
    d = dict_convert(d, valuefn=remove_tags)

    # Pop the bookkeeping flags up front so they never reach the output and
    # so a missing flag cannot raise KeyError.
    inchikey_dup = d.pop("inchikey_dup", False)
    cid_dup = d.pop("cid_dup", False)
    sid_dup = d.pop("sid_dup", False)

    if "inchikey" in d and not inchikey_dup:
        d["_id"] = d["inchikey"]
    elif "pubchem_cid" in d and not cid_dup:
        d["_id"] = f"pubchem.compound:{d['pubchem_cid']}"
    elif "pubchem_sid" in d and not sid_dup:
        d["_id"] = f"pubchem.substance:{d['pubchem_sid']}"

    return d
|
||
|
||
def preprocess_intrs(d: dict):
    """Convert key names and remove empty vals, XML tags, and repeated columns.

    Works on a shallow copy, so the caller's dictionary is left untouched.
    Columns that are absent from ``d`` are skipped instead of raising
    ``KeyError``.

    Args:
        d (dict): interaction properties
    Returns:
        dict: processed interaction properties
    """
    d = dict(d)

    # The target's name is kept under "Name"; the "Target" column itself is
    # dropped below.
    if "Target" in d:
        d["Name"] = d["Target"]

    species = d.get("Species")
    if isinstance(species, str):
        d["Species"] = species.lower()

    # redundant since present in ligands
    cols_to_drop = (
        "Ligand ID",
        "CAS Number",
        "Clinical Use Comment",
        "Bioactivity Comment",
        "Ligand Synonyms",
        "Target",
        "Ligand",
        "Type",
        "SMILES",
    )
    for col in cols_to_drop:
        d.pop(col, None)

    d = dict_sweep(d, vals=["", np.nan], remove_invalid_list=True)
    d = dict_convert(d, keyfn=process_key)
    d = dict_convert(d, valuefn=remove_tags)
    return d
|
||
|
||
def load_ligands(data_folder: str):
    """Yield one processed document per ligand found in ``ligands.csv``.

    Rows of ``approved_drug_detailed_interactions.csv`` are attached to the
    ligand with the same ``Ligand ID`` under ``interaction_targets``. The
    first interaction row seen for a ligand also supplies its
    ``CAS Number``, ``Clinical Use Comment`` and ``Bioactivity Comment``.

    Because this is a generator, nothing is read (and no error is raised)
    until the first item is requested.

    Args:
        data_folder (str): directory containing both CSV files
    Yields:
        dict: the result of ``preprocess_ligands`` for each ligand
    Raises:
        FileNotFoundError: if either CSV file is missing
        TypeError: if any ligand ID in ``ligands.csv`` is not a string
        KeyError: if an interaction row references an unknown ligand ID
    """
    # NOTE (original author): pk: Ligand ID,Target ID,Target Ligand ID,Target Species
    # inner join of primary_targets_csv[pk] and detailed_csv[pk] is primary_targets_csv[pk]
    # (no primary-targets file is read by this function.)
    interactions_file = os.path.join(data_folder, "approved_drug_detailed_interactions.csv")
    ligands_file = os.path.join(data_folder, "ligands.csv")
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    for path in (interactions_file, ligands_file):
        if not os.path.exists(path):
            raise FileNotFoundError(f"required data file not found: {path}")

    # skiprows=1 skips the first line of each file, so the second line
    # provides the column headers.
    ligands = pd.read_csv(ligands_file, skiprows=1, dtype=object).set_index("Ligand ID")
    interactions = (
        pd.read_csv(interactions_file, skiprows=1, dtype=object)
        .rename(intrs_rename_dict, axis=1)
        .to_dict(orient="records")
    )

    # Interaction rows are matched by ``str(row["Ligand ID"])``, so the
    # ligand index must hold strings. ``DataFrame.keys()`` returns the column
    # labels, hence the index is checked explicitly here.
    if not all(isinstance(ligand_id, str) for ligand_id in ligands.index):
        raise TypeError("every 'Ligand ID' in ligands.csv must be a string")

    # keep track of duplicates for determining _id
    ligands["inchikey_dup"] = ligands["InChIKey"].duplicated()
    ligands["cid_dup"] = ligands["PubChem CID"].duplicated()
    ligands["sid_dup"] = ligands["PubChem SID"].duplicated()
    ligands = ligands.to_dict(orient="index")

    for row in interactions:
        ligand_id = str(row["Ligand ID"])

        # NOTE: we assume ligand IDs in interactions will be found in ligands
        try:
            ligand = ligands[ligand_id]
        except KeyError:
            raise KeyError(
                f"Ligand ID {ligand_id!r} from the interactions file is not in ligands.csv"
            ) from None

        if "interaction_targets" not in ligand:
            ligand["interaction_targets"] = []
            ligand["CAS Number"] = row["CAS Number"]
            ligand["Clinical Use Comment"] = row["Clinical Use Comment"]
            ligand["Bioactivity Comment"] = row["Bioactivity Comment"]

        ligand["interaction_targets"].append(preprocess_intrs(row))

    for k, ligand in ligands.items():
        ligand["_id"] = f"gtopdb:{k}"  # default _id if others are NaN or duplicated
        yield preprocess_ligands(ligand)