Skip to content

Commit

Permalink
Add updated novelty code
Browse files Browse the repository at this point in the history
  • Loading branch information
maximusunc committed Aug 4, 2023
1 parent aeda561 commit 44e9599
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 48 deletions.
42 changes: 22 additions & 20 deletions app/novelty/compute_novelty.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import date
import requests
import numpy as np
import traceback

from .known import find_known_results
from .extr_smile_molpro_by_id import mol_to_smile_molpro
Expand Down Expand Up @@ -78,7 +79,7 @@ def get_publication_info(pub_id):
response.raise_for_status()
response = response.json()
except Exception:
return {
response = {
"_meta": {
"n_results": 0,
},
Expand Down Expand Up @@ -169,7 +170,6 @@ def extracting_drug_fda_publ_date(message, unknown):
today = date.today()

res_chk = 1
# for edge in message['knowledge_graph']['edges'].keys():
query_known, query_unknown, query_chk = query_id(message)
idi = -1
for tmp in unknown:
Expand Down Expand Up @@ -406,12 +406,14 @@ def compute_novelty(message, logger):
known, unknown = find_known_results(message)
#
# # Step 2
similarity_map = molecular_sim(known, unknown, message)

# start = time.time()
df, query_chk = extracting_drug_fda_publ_date(message, unknown)
# print(f"Time to extract fda status and Publication data:{time.time()-start}")
# # print(df.head())
# # print(query_chk)
#
# df.to_excel(f'DATAFRAME.xlsx', header=False, index=False)
# df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
# query_chk = 1

Expand All @@ -427,6 +429,20 @@ def compute_novelty(message, logger):
# print(df.head())
# print(similarity_map)
if query_chk == 1:
# start = time.time()
try:
similarity_map = molecular_sim(known, unknown, message)
df["similarity"] = df.apply(
lambda row: similarity_map[row["drug"]][0][1]
if row["drug"] in similarity_map.keys()
else np.nan,
axis=1,
)
except Exception as e:
logger.error(traceback.format_exc())
df = df.assign(similarity=np.nan)

# print(f"Time to compute Molecular Similarity:{time.time() - start}")
# Step 3:
# calculating the recency
df["recency"] = df.apply(
Expand All @@ -439,15 +455,9 @@ def compute_novelty(message, logger):
)
#
# # Step 4:
# # This section will be added later. Currently just putting 'NaN':
# # Calculating the Similarity:
# nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)

df["similarity"] = df.apply(
lambda row: similarity_map[row["drug"]][0][1]
if row["drug"] in similarity_map.keys()
else np.nan,
axis=1,
)
# df = df.assign(similarity=np.nan)

# # Step 5:
Expand All @@ -458,6 +468,7 @@ def compute_novelty(message, logger):
),
axis=1,
)
# df.to_excel(f'DATAFRAME_result.xlsx', header=False, index=False)

# # # Step 6
# # # Just sort them:
Expand All @@ -466,14 +477,5 @@ def compute_novelty(message, logger):
)
else:
df = df.assign(novelty_score=0)
# df.to_excel(f'DATAFRAME_NOVELTY.xlsx', header=False, index=False)
return df


# for i in list(range(1, 5)):
# start = time.time()
# temp = compute_novelty('mergedAnnotatedOutput.json')
# if temp.empty:
# print(f"No Results in mergedAnnotatedOutput.json")
# else:
# temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
# print(time.time()-start)
57 changes: 31 additions & 26 deletions app/novelty/extr_smile_molpro_by_id.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
#!/usr/bin/env python
import requests


def mol_to_smile_molpro(molecules):
    """Resolve SMILES strings for compound identifiers via the MolePro API.

    Args:
        molecules: list of compound identifiers (duplicates are removed
            before querying; see the attached sample_mols.json for the shape).

    Returns:
        Dict mapping each resolved input identifier to its SMILES string,
        or to a placeholder message ("No SMILES could be found" /
        "No identifiers could be found") when the service returned an
        element without the requested data.

    NOTE(review): reconstructed from a diff rendered without +/- markers;
    the exact nesting of the retry loop is inferred — confirm against the
    repository before relying on edge-case behavior.
    """
    url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id"
    headers = {"accept": "application/json", "Content-Type": "application/json"}

    smiles = {}
    data_mol = list(set(molecules))

    # Retry loop: re-query the identifiers that were not resolved on the
    # previous pass, stopping as soon as a pass makes no progress (this
    # replaces the earlier unbounded recursion and avoids infinite loops).
    while data_mol:
        data_mol_before = len(data_mol)

        response = requests.post(url, headers=headers, json=data_mol)
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        json_response = response.json()
        if json_response["size"] > 0:
            collec_url = json_response["url"]
            temp_collec_response = requests.get(collec_url)
            # Check the collection response itself (the old code re-checked
            # the POST response here, which masked GET failures).
            if temp_collec_response.status_code != 200:
                print(
                    f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}"
                )
                break
            collec_response = temp_collec_response.json()

            for i in range(json_response["size"]):
                element = collec_response["elements"][i]
                if "identifiers" in element:
                    smiles[data_mol[i]] = element["identifiers"].get(
                        "smiles", "No SMILES could be found"
                    )
                else:
                    smiles[data_mol[i]] = "No identifiers could be found"

        # Drop identifiers that now have an entry and retry the remainder.
        data_mol = [mol for mol in data_mol if mol not in smiles]
        if len(data_mol) == data_mol_before:
            # No progress this pass — bail out rather than loop forever.
            break

    return smiles
3 changes: 1 addition & 2 deletions app/ordering_components.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
"""Compute scores for each result in the given message."""
import os
import redis
from tqdm import tqdm

from .config import settings
from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence

from .novelty.compute_novelty import compute_novelty


redis_pool = redis.ConnectionPool(
host=settings.redis_host,
port=settings.redis_port,
Expand Down

0 comments on commit 44e9599

Please sign in to comment.