diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py
index 258a759..806919b 100644
--- a/app/novelty/compute_novelty.py
+++ b/app/novelty/compute_novelty.py
@@ -2,6 +2,7 @@
 from datetime import date
 import requests
 import numpy as np
+import traceback
 
 from .known import find_known_results
 from .extr_smile_molpro_by_id import mol_to_smile_molpro
@@ -78,7 +79,7 @@ def get_publication_info(pub_id):
         response.raise_for_status()
         response = response.json()
     except Exception:
-        return {
+        response = {
            "_meta": {
                "n_results": 0,
            },
@@ -169,7 +170,6 @@ def extracting_drug_fda_publ_date(message, unknown):
     today = date.today()
     res_chk = 1
 
-    # for edge in message['knowledge_graph']['edges'].keys():
     query_known, query_unknown, query_chk = query_id(message)
     idi = -1
     for tmp in unknown:
@@ -406,12 +406,14 @@ def compute_novelty(message, logger):
     known, unknown = find_known_results(message)
 
     # # # Step 2
-    similarity_map = molecular_sim(known, unknown, message)
 
+    # start = time.time()
     df, query_chk = extracting_drug_fda_publ_date(message, unknown)
+    # print(f"Time to extract fda status and Publication data:{time.time()-start}")
     # # print(df.head())
     # # print(query_chk)
     #
+    # df.to_excel(f'DATAFRAME.xlsx', header=False, index=False)
     # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
     # query_chk = 1
 
@@ -427,6 +429,20 @@ def compute_novelty(message, logger):
     # print(df.head())
     # print(similarity_map)
     if query_chk == 1:
+        # start = time.time()
+        try:
+            similarity_map = molecular_sim(known, unknown, message)
+            df["similarity"] = df.apply(
+                lambda row: similarity_map[row["drug"]][0][1]
+                if row["drug"] in similarity_map.keys()
+                else np.nan,
+                axis=1,
+            )
+        except Exception as e:
+            logger.error(traceback.format_exc())
+            df = df.assign(similarity=np.nan)
+
+        # print(f"Time to compute Molecular Similarity:{time.time() - start}")
         # Step 3:
         # calculating the recency
         df["recency"] = df.apply(
@@ -439,15 +455,9 @@ def compute_novelty(message, logger):
             ),
             axis=1,
         )
         # # # Step 4:
-        # # This section will be added later. Currently just putting 'NaN':
+        # # Calculating the Similarity:
         # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)
-        df["similarity"] = df.apply(
-            lambda row: similarity_map[row["drug"]][0][1]
-            if row["drug"] in similarity_map.keys()
-            else np.nan,
-            axis=1,
-        )
         # df = df.assign(similarity=np.nan)
 
         # # Step 5:
@@ -458,6 +468,7 @@ def compute_novelty(message, logger):
             ),
             axis=1,
         )
+        # df.to_excel(f'DATAFRAME_result.xlsx', header=False, index=False)
 
         # # # Step 6
         # # # Just sort them:
@@ -466,14 +477,5 @@
         )
     else:
         df = df.assign(novelty_score=0)
+    # df.to_excel(f'DATAFRAME_NOVELTY.xlsx', header=False, index=False)
     return df
-
-
-# for i in list(range(1, 5)):
-#     start = time.time()
-#     temp = compute_novelty('mergedAnnotatedOutput.json')
-#     if temp.empty:
-#         print(f"No Results in mergedAnnotatedOutput.json")
-#     else:
-#         temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
-#     print(time.time()-start)
diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py
index 504e30e..e15a707 100644
--- a/app/novelty/extr_smile_molpro_by_id.py
+++ b/app/novelty/extr_smile_molpro_by_id.py
@@ -1,54 +1,59 @@
-#!/usr/bin/env python
 import requests
 
 
 def mol_to_smile_molpro(molecules):
     """
+    Args:
+        List
     Returns:
         Dict
-    Args:
-        List
-
     Example: Attached sample_mols.json file
     """
-    url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id"
+    url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id"
     headers = {"accept": "application/json", "Content-Type": "application/json"}
-    data_mol = list(set(molecules))
     smiles = {}
-    response = requests.post(url, headers=headers, json=data_mol)
+    data_mol = list(set(molecules))
+    # print(f'init data: {len(data_mol)}')
+    while data_mol:
+        # print(f'before: {len(data_mol)}')
+        data_mol_before = len(data_mol)
+        response = requests.post(url, headers=headers, json=data_mol)
 
-    if response.status_code == 200:
-        json_response = response.json()
-        # print(json_response)
-        collec_url = json_response["url"]
-        temp_collec_response = requests.get(collec_url)
         if response.status_code == 200:
-            collec_response = temp_collec_response.json()
-            # print('\n')
-            # print(collec_response['elements'][0]['identifiers'])
-            if json_response["size"] > 0:
+            json_response = response.json()
+            collec_url = json_response["url"]
+            temp_collec_response = requests.get(collec_url)
+            if temp_collec_response.status_code == 200:
+                collec_response = temp_collec_response.json()
+
                 for i in range(json_response["size"]):
                     key_list = ["identifiers"]
                     if set(key_list).issubset(collec_response["elements"][i].keys()):
-                        # if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
                        identifiers = collec_response["elements"][i]["identifiers"]
                        smile = identifiers.get("smiles", "No SMILES could be found")
                        smiles[data_mol[i]] = smile
                    else:
                        smiles[data_mol[i]] = "No identifiers could be found"
 
-        # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
-        # for which the retrieval was not successful in the first attempt
-        if len(list(smiles.keys())) < len(data_mol):
-            diff = [mol for mol in data_mol if mol not in list(smiles.keys())]
-            diff_smiles = mol_to_smile_molpro(diff)
-            if len(diff_smiles) > 0:
-                smiles.update(diff_smiles)
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
+                # Remove molecules with successfully retrieved smiles from data_mol
+                data_mol = [mol for mol in data_mol if mol not in smiles]
+                data_mol_after = len(data_mol)
+                # print(f'after: {len(data_mol)}')
+
+                if data_mol_after == data_mol_before:
+                    break
+
+            else:
+                print(
+                    f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}"
+                )
+                break
+        else:
+            print(f"Error: {response.status_code} - {response.text}")
+            break
 
     return smiles
diff --git a/app/ordering_components.py b/app/ordering_components.py
index 32a8e4d..979f234 100644
--- a/app/ordering_components.py
+++ b/app/ordering_components.py
@@ -1,13 +1,12 @@
 """Compute scores for each result in the given message."""
 
-import os
 import redis
 from tqdm import tqdm
 
 from .config import settings
 from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence
-
 from .novelty.compute_novelty import compute_novelty
+
 redis_pool = redis.ConnectionPool(
     host=settings.redis_host,
     port=settings.redis_port,