Skip to content

Commit

Permalink
Merge pull request #10 from TranslatorSRI/novelty
Browse files Browse the repository at this point in the history
Novelty
  • Loading branch information
maximusunc authored Aug 7, 2023
2 parents 05fa392 + 44e9599 commit eaf5c84
Show file tree
Hide file tree
Showing 9 changed files with 661 additions and 10 deletions.
Empty file added app/novelty/__init__.py
Empty file.
481 changes: 481 additions & 0 deletions app/novelty/compute_novelty.py

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions app/novelty/extr_smile_molpro_by_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import requests


def mol_to_smile_molpro(molecules):
    """Resolve compound identifiers to SMILES strings via the MolePro API.

    Args:
        molecules: iterable of compound identifiers (e.g. CURIEs);
            duplicates are removed before querying.

    Returns:
        Dict mapping each identifier to its SMILES string, or to a
        placeholder ("No SMILES could be found" /
        "No identifiers could be found") when resolution fails. Identifiers
        left unresolved after the service errors out are omitted entirely.
    """
    url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id"
    headers = {"accept": "application/json", "Content-Type": "application/json"}

    smiles = {}

    data_mol = list(set(molecules))
    while data_mol:
        data_mol_before = len(data_mol)
        # Bounded timeout so a stalled MolePro endpoint cannot hang the caller.
        response = requests.post(url, headers=headers, json=data_mol, timeout=60)

        if response.status_code == 200:
            json_response = response.json()
            collec_url = json_response["url"]
            temp_collec_response = requests.get(collec_url, timeout=60)
            if temp_collec_response.status_code == 200:
                collec_response = temp_collec_response.json()

                # NOTE(review): elements[i] is paired with data_mol[i] by
                # position — assumes the collection preserves request order;
                # verify against the MolePro API contract.
                for i in range(json_response["size"]):
                    element = collec_response["elements"][i]
                    if "identifiers" in element:
                        identifiers = element["identifiers"]
                        smiles[data_mol[i]] = identifiers.get(
                            "smiles", "No SMILES could be found"
                        )
                    else:
                        smiles[data_mol[i]] = "No identifiers could be found"

                # Retry only molecules that got no entry this round; bail out
                # if no progress was made so we never loop forever.
                data_mol = [mol for mol in data_mol if mol not in smiles]
                if len(data_mol) == data_mol_before:
                    break
            else:
                print(
                    f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}"
                )
                break
        else:
            print(f"Error: {response.status_code} - {response.text}")
            break

    return smiles
27 changes: 27 additions & 0 deletions app/novelty/known.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
def find_known_results(message):
    """Partition result indices by whether they are already "known".

    A result counts as known each time one of its bound edges has a
    primary knowledge source that is NOT an inferring/creative-mode
    reasoner; otherwise that edge contributes the index to the unknown
    list. Indices may therefore appear multiple times and in both lists
    (one entry per qualifying edge binding).

    Args:
        message: TRAPI message dict with "results" and "knowledge_graph".

    Returns:
        Tuple of (known_result_ids, unknown_result_ids), each a list of
        integer indices into message["results"].
    """
    inferring_sources = {
        "infores:aragorn",
        "infores:arax",
        "infores:biothings-explorer",
        "infores:improving-agent",
        "infores:robokop",
        "infores:unsecret-agent",
    }
    known, unknown = [], []
    kg_edges = message["knowledge_graph"]["edges"]
    for index, result in enumerate(message["results"]):
        for analysis in result.get("analyses") or []:
            for bindings in analysis["edge_bindings"].values():
                for binding in bindings:
                    edge = kg_edges[binding["id"]]
                    # for/else: the edge lands in `unknown` only when no
                    # non-inferring primary knowledge source is found.
                    for source in edge["sources"]:
                        if (
                            source["resource_role"] == "primary_knowledge_source"
                            and source["resource_id"] not in inferring_sources
                        ):
                            known.append(index)
                            break
                    else:
                        unknown.append(index)
    return known, unknown
68 changes: 68 additions & 0 deletions app/novelty/mol_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem


def find_nearest_neighbors(
    unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors
):
    """For each unknown molecule, find its most similar known molecules.

    Similarity is Tanimoto similarity over Morgan (radius-2, ECFP-like)
    fingerprints.

    Args:
        unknown_smiles_dict: Dict of identifier -> SMILES for query molecules.
        known_smiles_dict: Dict of identifier -> SMILES for reference molecules.
        similarity_cutoff: float (e.g. 0); neighbors below this are dropped.
        num_neighbors: int (e.g. 1); maximum neighbors returned per query.

    Returns:
        Dict mapping each unknown identifier to a list of
        (known identifier, similarity) tuples sorted by descending similarity.

    Raises:
        ValueError: if any SMILES string fails to parse.
    """
    # Entries whose SMILES lookup failed upstream carry this sentinel; skip them.
    unknown_smiles = {
        key: value
        for key, value in unknown_smiles_dict.items()
        if value != "No SMILES could be found"
    }
    known_smiles = {
        key: value
        for key, value in known_smiles_dict.items()
        if value != "No SMILES could be found"
    }

    # Parse and fingerprint the reference set ONCE, instead of recomputing
    # every known fingerprint inside the query loop (was O(unknown * known)
    # fingerprint builds).
    known_fps = {}
    for key, value in known_smiles.items():
        known_mol = Chem.MolFromSmiles(value)
        if known_mol is None:
            raise ValueError(f"Invalid SMILES string for {key}")
        known_fps[key] = AllChem.GetMorganFingerprint(known_mol, 2)

    nearest_neighbor_mapping = {}
    for unknown_key, value in unknown_smiles.items():
        query_mol = Chem.MolFromSmiles(value)
        if query_mol is None:
            raise ValueError(f"Invalid SMILES string for {unknown_key}")

        # Fingerprint for the query molecule.
        query_fp = AllChem.GetMorganFingerprint(query_mol, 2)

        # Similarity of the query against every reference molecule,
        # highest first.
        similarities = [
            (key, DataStructs.TanimotoSimilarity(query_fp, fp))
            for key, fp in known_fps.items()
        ]
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Keep at most num_neighbors hits that clear the cutoff.
        neighbors = [
            (key, similarity)
            for key, similarity in similarities[:num_neighbors]
            if similarity >= similarity_cutoff
        ]
        nearest_neighbor_mapping[unknown_key] = neighbors
    return nearest_neighbor_mapping
25 changes: 16 additions & 9 deletions app/ordering_components.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""Compute scores for each result in the given message."""
import os
import redis
from tqdm import tqdm

from .config import settings
from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence
from .novelty.compute_novelty import compute_novelty


redis_pool = redis.ConnectionPool(
host=settings.redis_host,
Expand Down Expand Up @@ -42,15 +43,18 @@ def get_clinical_evidence(result, message, logger, db_conn):
return compute_clinical_evidence(result, message, logger, db_conn)


def get_novelty(result, message, logger):
# TODO get novelty from novelty package
return 0
def get_novelty(message, logger):
return compute_novelty(message, logger)


def get_ordering_components(message, logger):
logger.debug(f"Computing scores for {len(message['results'])} results")
db_conn = redis.Redis(connection_pool=redis_pool)
for result_index, result in enumerate(tqdm(message.get("results") or [])):
novelty_scores_dict = get_novelty(message, logger).to_dict(orient="index")
novelty_scores = {
node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()
}
for result in tqdm(message.get("results") or []):
clinical_evidence_score = get_clinical_evidence(
result,
message,
Expand All @@ -62,8 +66,11 @@ def get_ordering_components(message, logger):
"clinical_evidence": clinical_evidence_score,
"novelty": 0,
}
if result["ordering_components"]["clinical_evidence"] == 0:
if clinical_evidence_score == 0:
# Only compute novelty if there is no clinical evidence
result["ordering_components"]["novelty"] = get_novelty(
result, message, logger
)
for node_bindings in result.get("node_bindings", {}).values():
for node_binding in node_bindings:
if node_binding["id"] in novelty_scores:
result["ordering_components"]["novelty"] = novelty_scores[
node_binding["id"]
]
2 changes: 1 addition & 1 deletion app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

openapi_args = dict(
title="Answer Appraiser",
version="0.2.3",
version="0.3.0",
terms_of_service="",
translator_component="Utility",
translator_teams=["Standards Reference Implementation Team"],
Expand Down
7 changes: 7 additions & 0 deletions requirements-lock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@ httptools==0.2.0
httpx==0.24.1
idna==3.4
numpy==1.25.1
pandas==2.0.3
Pillow==10.0.0
pydantic==1.10.9
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
PyYAML==6.0
rdkit==2023.3.2
reasoner-pydantic==4.0.8
redis==4.6.0
six==1.16.0
sniffio==1.3.0
starlette==0.17.1
tqdm==4.65.0
typing_extensions==4.6.3
tzdata==2023.3
uvicorn==0.13.3
uvloop==0.17.0
watchgod==0.8.2
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ fastapi==0.75.0
gunicorn==20.1.0
httpx==0.24.1
numpy==1.25.1
pandas==2.0.3
rdkit==2023.3.2
reasoner-pydantic==4.0.8
redis==4.6.0
tqdm==4.65.0
Expand Down

0 comments on commit eaf5c84

Please sign in to comment.