From 46342aef130a8ea2f8682e339237816e692c5b5c Mon Sep 17 00:00:00 2001 From: Max Wang Date: Thu, 3 Aug 2023 14:12:12 -0400 Subject: [PATCH 1/5] Add novelty score code --- app/novelty/__init__.py | 0 app/novelty/compute_novelty.py | 375 +++++++++++++++++++++++++ app/novelty/extr_smile_molpro_by_id.py | 59 ++++ app/novelty/known.py | 27 ++ app/novelty/mol_similarity.py | 59 ++++ requirements-lock.txt | 7 + requirements.txt | 2 + 7 files changed, 529 insertions(+) create mode 100644 app/novelty/__init__.py create mode 100644 app/novelty/compute_novelty.py create mode 100644 app/novelty/extr_smile_molpro_by_id.py create mode 100644 app/novelty/known.py create mode 100644 app/novelty/mol_similarity.py diff --git a/app/novelty/__init__.py b/app/novelty/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py new file mode 100644 index 0000000..2b53729 --- /dev/null +++ b/app/novelty/compute_novelty.py @@ -0,0 +1,375 @@ +import pandas as pd +from datetime import date +import requests +import numpy as np + +from .known import find_known_results +from .extr_smile_molpro_by_id import mol_to_smile_molpro +from .mol_similarity import find_nearest_neighbors + +""" +This script computes the novelty score for a list of results obtained for a 1-H response. +The steps for the ideal workflow are as follows: +1. Distinguish whether the result is a known result or an unknown result. +2. Compute FDA approval status to check if currently under approval or already approved. +3. Compute recency by noting the associated publications of the result. +4. Compute molecular similarity to identify if similar to existing drugs. + +The end result of this script displays a table with values from different columns and accordingly lists the novelty score as well. +""" + +def molecular_sim(known, unknown, message): + unknown_ids = [] + known_ids = [] + if len(unknown) > 0: + for drug in unknown: + s = list(message['results'][drug]['node_bindings'].keys()) + edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id'] + if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn: + unknown_ids.append(edge_attribute_sn) + else: + unknown_ids.append( + message['results'][drug]['node_bindings'][s[1]][0]['id']) + + if len(known) > 0: + for drug in known: + s = list(message['results'][drug]['node_bindings'].keys()) + edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id'] + if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn: + known_ids.append(edge_attribute_sn) + else: + known_ids.append(message['results'][drug]['node_bindings'][s[1]][0]['id']) + + smile_unkown = mol_to_smile_molpro(unknown_ids) + smile_known = mol_to_smile_molpro(known_ids) + + similarity_map = find_nearest_neighbors(smile_unkown, smile_known, 0, 1) + return similarity_map + + +def get_publication_info(pub_id): + """ + Args: PMID + Returns: The publication info + """ + base_url = "https://3md2qwxrrk.us-east-1.awsapprunner.com/publications?pubids=" + request_id = '1df88223-c0f8-47f5-a1f3-661b944c7849' + full_url = f"{base_url}{pub_id}&request_id={request_id}" + try: + response = requests.get(full_url) + response.raise_for_status() + response = response.json() + except Exception: + return { + "_meta": { + "n_results": 0, + }, + } + return response + + +def sigmoid(x): + return 1 / (1 + np.exp(x)) + +def query_id(message): + for i in message['query_graph']['nodes']: + if 'ids' in message['query_graph']['nodes'][i].keys(): + if message['query_graph']['nodes'][i]['ids']: + known_node = message['query_graph']['nodes'][i]['categories'][0] + else: + unknown_node = message['query_graph']['nodes'][i]['categories'][0] + else: + unknown_node = message['query_graph']['nodes'][i]['categories'][0] + if unknown_node in ['biolink:ChemicalEntity', 'biolink:SmallMolecule', 'biolink:Drug']: + chk=1 + else: + chk=0 + return known_node, unknown_node, chk + + +def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age): + """ + Calculates the recency based on number of publications accociated to each drug + and age of the oldest publication + + Args: + number_of_publ (float): The current number of publications. + age_of_oldest_publ (float): The current age of the oldest publication. + max_number (float): The maximum number of publication: e.g. consider 100 for all drugs. + max_age (float): The publication with the recent 50 years have been considered. + + Returns: + float: The recency value of z. + """ + coef_number = 10 + coef_age = 4 + alter_number_of_publ = sigmoid(coef_number * (number_of_publ / max_number - 0.5)) + alter_age_of_oldest_publ = sigmoid(coef_age * (age_of_oldest_publ / max_age - 0.5)) + + if np.isnan(number_of_publ) and not np.isnan(age_of_oldest_publ): + recency = alter_age_of_oldest_publ + elif np.isnan(age_of_oldest_publ) and not np.isnan(number_of_publ): + recency = alter_number_of_publ + else: + recency = alter_number_of_publ * alter_age_of_oldest_publ + + return recency + + +def extracting_drug_fda_publ_date(message, unknown): + """ + Upon querying, the response is returned as a list containing 10 dictionaries, + with each dictionary representing the response from an ARA. The function 'extracting_drug_fda_publ_date' + is designed to extract the drug entity name of each edge. It then checks the EPC attributes of each edge + to determine the FDA approval status of the drug and the accociated pulications PMID/PMC ID ... . + And finally extract the publishing date of the publications to get the oldest (Year) date among them. + + Args: + Dictionary: response for a ARA to a query + + Returns: + "An DataFrame constructed where each row represents an edge and contains information such as the drug entity + name, FDA status of the drug, a list of associated publications, the number of associated publications, + and the oldest publication date (year) linked to each drug." + + """ + attribute_type_id_list_fda = ['biolink:FDA_approval_status', 'biolink:FDA_APPROVAL_STATUS'] + attribute_type_id_list_pub = ['biolink:publications', 'biolink:Publication', 'biolink:publication'] + drug_idx_fda_status = [] + today = date.today() + + res_chk = 1 + # for edge in message['knowledge_graph']['edges'].keys(): + query_known, query_unknown, query_chk = query_id(message) + idi=-1 + for tmp in unknown: + tmp_res = message['results'][tmp]['analyses'][0]['edge_bindings'] + for tmp_1 in tmp_res: + idi+=1 + edge = message['results'][tmp]['analyses'][0]['edge_bindings'][tmp_1][0]['id'] + # edge_list = list(message['knowledge_graph']['edges'].keys()) + # for idx, idi in enumerate(edge_list): + # if idx % 20 == 0: + # print(f'progressing {idx}') + # edge = edge_list[idx] + edge_attribute = message['knowledge_graph']['edges'][edge] + # if set(['subject', 'object']).issubset(edge_attribute.keys()): + if query_chk==1: + if 'PUBCHEM' in edge_attribute['subject'] or 'CHEMBL' in edge_attribute['subject'] or 'UNII' in edge_attribute['subject'] or 'RXNORM' in edge_attribute['subject'] or 'UMLS' in edge_attribute['subject'] or not 'MONDO' in edge_attribute['subject']: + drug_idx = edge_attribute['subject'] + else: + drug_idx = edge_attribute['object'] + if set(['attributes']).issubset(edge_attribute.keys()): + if len(edge_attribute['attributes']) > 0: + att_type_id = {} + fda = [] + pub = [] + for i in range(len(edge_attribute['attributes'])): + att_type_id[i] = edge_attribute['attributes'][i]['attribute_type_id'] + + for key in att_type_id.keys(): + if att_type_id[key] in attribute_type_id_list_fda: + fda.append(key) + elif att_type_id[key] in attribute_type_id_list_pub: + pub.append(key) + + if len(fda) > 0: + if edge_attribute['attributes'][fda[0]]['value'] == 'FDA Approval': + fda_status = 0.0 + else: + fda_status = 1.0 + else: + fda_status = None + + # Publication + if len(pub) > 0: + publications = edge_attribute['attributes'][pub[0]]['value'] + if '|' in publications: + publications = publications.split('|') + if type(publications) == 'str': + publications = [publications] + + # Removal of all publication entries that are links + publications = [x for x in publications if "http" not in x] + # Removal of all publication entries that are Clinical Trials + publications = [x for x in publications if "clinicaltrials" not in x] + number_of_publ = len(publications) + + if len(publications)>0: + # print(publications) + publications_1 = ','.join(publications) + try: + response_pub = get_publication_info(publications_1) + if response_pub['_meta']['n_results']==0: + age_oldest = np.nan + else: + publ_year = [] + for key in response_pub['results'].keys(): + if 'not_found' not in key: + publ_year.extend([int(response_pub['results'][key]['pub_year'])]) + age_oldest = today.year - min(publ_year) + except ConnectionError as e: + age_oldest = np.nan + else: + publications = None + number_of_publ = 0.0 + age_oldest = np.nan + drug_idx_fda_status.append((idi, drug_idx, fda_status, publications, number_of_publ, age_oldest)) + else: + if query_unknown in ['biolink:Gene', 'biolink:Protein']: + if 'NCBI' in edge_attribute['subject'] or 'GO' in edge_attribute['subject']: + gene_idx = edge_attribute['subject'] + else: + gene_idx = edge_attribute['object'] + drug_idx_fda_status.append((idi, gene_idx)) + elif query_unknown in ['biolink:Disease', 'biolink:Phenotype']: + if 'MONDO' in edge_attribute['subject']: + dis_idx = edge_attribute['subject'] + else: + dis_idx = edge_attribute['object'] + drug_idx_fda_status.append((idi, dis_idx)) + if query_chk==1 and res_chk==1: + DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub']) + elif query_chk!=1 and res_chk==1: + DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'result']) + else: + DF = pd.DataFrame() + return DF, query_chk + +def extract_results(message, unknown, known): + results = [] + results_known = [] + kid, ukid = 0, 0 + for idi, i in enumerate(message['results']): + if idi in unknown: + results.append([]) + for idj, j in enumerate(i['analyses']): + for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]): + results[ukid].append(k['id']) + ukid+=1 + + elif idi in known: + results_known.append([]) + for idj, j in enumerate(i['analyses']): + for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]): + results_known[kid].append(k['id']) + kid+=1 + return results, results_known + +def result_edge_correlation(results, results_known, df): + df_res = pd.DataFrame() + res_known = set() + res_unknown = set() + for idi, i in enumerate(results): + for j in i: + df_res = pd.concat([df_res, df[df['edge']==j]]) + res_unknown.add(df.loc[df['edge']==j, 'drug'].iloc[0]) + + for idi, i in enumerate(results_known): + for j in i: + res_known.add(df.loc[df['edge']==j, 'drug'].iloc[0]) + return df_res, res_unknown, res_known + +def novelty_score(fda_status, recency, similarity): + """ + Calculate the novelty score for each drug entity based on FDA status, recency and similarity of the drug. + + FDA status 0 | 1 + --> 0 to be FDA approved + 0 < recency < 1 + --> 1 to have a high recency where the publications are so new and number of publications is too few. + 0 < similarity < 1 + --> 0 to have a very low molecular structure similarity, so it is novel. + + Args: + float: fda_status + float: recency + float: similarity + + Returns: + float: novelty_score + + """ + if not np.isnan(recency): + score = recency + if not np.isnan(similarity): + similarity = 1 - similarity + if similarity > 0.5: + score = score*(0.73+similarity) + if score>1: + score=1 + if fda_status == 0: + score=score*0.85 + else: + if np.isnan(similarity): + score=0 + else: + score=(1-similarity) + return score + +def compute_novelty(message, logger): + """ INPUT: JSON Response with merged annotated results for a 1-H query + + 1. load the json file + 2. Give the json to extracting_drug_fda_publ_date(response) function to extract the EPC + 3. Apply the recency function of df, to add a new column as recency to the dataframe + 4. Add a new column to the df as similarity which has random number between 0-1 + 5. Now the dataframe df is ready for applying the novelty score on it + + OUTPUT: Pandas DataFrame with FDA Status, Recency, Similarity and Novelty score per result + """ + # Step 1 + known, unknown = find_known_results(message) + # + # # Step 2 + similarity_map = molecular_sim(known, unknown, message) + + df, query_chk = extracting_drug_fda_publ_date(message, unknown) +# # print(df.head()) +# # print(query_chk) +# + # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub']) + # query_chk = 1 + + # # + # res, res_known = extract_results(mergedAnnotatedOutput, unknown, known) +# # print(len(res_known)) +# # print(len(res)) +# # # # +# df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df) + # print(len(res_unknown)) + # print(len(res_known)) + # df = df_res + # print(df.head()) + # print(similarity_map) + if query_chk==1: + # Step 3: + # calculating the recency + df['recency'] = df.apply(lambda row: recency_function_exp(row['number_of_publ'], row['age_oldest_pub'], 100, 50) if not (np.isnan(row['number_of_publ']) or np.isnan(row['age_oldest_pub'])) else np.nan, axis=1) + # + # # Step 4: + # # This section will be added later. Currently just putting 'NaN': + # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1) + + df['similarity'] = df.apply(lambda row: similarity_map[row['drug']][0][1] if row['drug'] in similarity_map.keys() else np.nan, axis=1) + # df = df.assign(similarity=np.nan) + + # # Step 5: + # # Calculating the novelty score: + df['novelty_score'] = df.apply(lambda row: novelty_score(row['fda status'], row['recency'], row['similarity']), axis=1) + + # # # Step 6 + # # # Just sort them: + df = df[['drug', 'novelty_score']].sort_values(by= 'novelty_score', ascending= False) + else: + df = df.assign(novelty_score=0) + return df + +#for i in list(range(1, 5)): +# start = time.time() +# temp = compute_novelty('mergedAnnotatedOutput.json') +# if temp.empty: +# print(f"No Results in mergedAnnotatedOutput.json") +# else: +# temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values') +# print(time.time()-start) \ No newline at end of file diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py new file mode 100644 index 0000000..98df59d --- /dev/null +++ b/app/novelty/extr_smile_molpro_by_id.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +import requests + + +def mol_to_smile_molpro(molecules): + """ + + Returns: + Dict + + Args: + List + Example: Attached sample_mols.json file + """ + + url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id" + headers = { + "accept": "application/json", + "Content-Type": "application/json" + } + + data_mol = list(set(molecules)) + smiles = {} + + response = requests.post(url, headers=headers, json=data_mol) + + if response.status_code == 200: + json_response = response.json() + #print(json_response) + collec_url = json_response['url'] + temp_collec_response = requests.get(collec_url) + if response.status_code == 200: + collec_response = temp_collec_response.json() + #print('\n') + #print(collec_response['elements'][0]['identifiers']) + if json_response['size'] > 0: + + for i in range(json_response['size']): + key_list = ['identifiers'] + if set(key_list).issubset(collec_response['elements'][i].keys()): + #if 'smiles' in collec_response['elements'][i]['identifiers'].keys(): + identifiers = collec_response['elements'][i]['identifiers'] + smile = identifiers.get('smiles', 'No SMILES could be found') + smiles[data_mol[i]] = smile + else: + smiles[data_mol[i]] = 'No identifiers could be found' + + #Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol) + #for which the retrieval was not successful in the first attempt + if len(list(smiles.keys())) < len(data_mol): + diff = [mol for mol in data_mol if mol not in list(smiles.keys())] + diff_smiles = mol_to_smile_molpro(diff) + if len(diff_smiles) > 0: + smiles.update(diff_smiles) + else: + print(f"Error: {response.status_code} - {response.text}") + + + return smiles \ No newline at end of file diff --git a/app/novelty/known.py b/app/novelty/known.py new file mode 100644 index 0000000..1e2c3a6 --- /dev/null +++ b/app/novelty/known.py @@ -0,0 +1,27 @@ +def find_known_results(message): + inferring_sources = [ + "infores:aragorn", + "infores:arax", + "infores:biothings-explorer", + "infores:improving-agent", + "infores:robokop", + "infores:unsecret-agent", + ] + known_result_ids = [] + unknown_result_ids = [] + results = message["results"] + knowledge_graph = message["knowledge_graph"] + for idres, result in enumerate(results): + for analysis in result.get("analyses") or []: + for eb in analysis["edge_bindings"].values(): + for element in eb: + edge_id = element["id"] + edge = knowledge_graph["edges"][edge_id] + for source in edge["sources"]: + if source["resource_role"] == "primary_knowledge_source": + if source["resource_id"] not in inferring_sources: + known_result_ids.append(idres) + break + else: + unknown_result_ids.append(idres) + return known_result_ids, unknown_result_ids diff --git a/app/novelty/mol_similarity.py b/app/novelty/mol_similarity.py new file mode 100644 index 0000000..70c2f60 --- /dev/null +++ b/app/novelty/mol_similarity.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +from rdkit import Chem +from rdkit import DataStructs +from rdkit.Chem import AllChem + + + +def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors): + """ + + Returns: + Dict + + Args: + unknown_smiles_dict: Dict + known_smiles_dict: Dict + similarity_cutoff: float: 0 + num_neighbors: int: 1 + + """ + unknown_smiles = {key:value for key,value in unknown_smiles_dict.items() if value != "No SMILES could be found"} + known_smiles = {key:value for key,value in known_smiles_dict.items() if value != "No SMILES could be found"} + + # Convert input SMILES to a molecule + known_mols = {} + for key,value in known_smiles.items(): + known_mol = Chem.MolFromSmiles(value) + if known_mol is None: + raise ValueError("Invalid SMILES string for",key) + else: + known_mols.update({key:known_mol}) + nearest_neighbor_mapping = {} + for unknownkey,value in unknown_smiles.items(): + query_mol = Chem.MolFromSmiles(value) + if query_mol is None: + raise ValueError("Invalid SMILES string") + + # Calculate fingerprints for the query molecule + query_fp = AllChem.GetMorganFingerprint(query_mol, 2) + + # Calculate similarity scores between the query molecule and all molecules in the dataset + similarities = [] + for key, mol in known_mols.items(): + fp = AllChem.GetMorganFingerprint(mol, 2) + similarity = DataStructs.TanimotoSimilarity(query_fp, fp) + similarities.append((key, similarity)) + + # Sort the similarities in descending order + similarities.sort(key=lambda x: x[1], reverse=True) + + # Retrieve the nearest neighbors + neighbors = [] + for i in range(min(num_neighbors, len(similarities))): + index, similarity = similarities[i] + if similarity >= similarity_cutoff: + neighbors.append((index, similarity)) + nearest_neighbor_mapping.update({unknownkey:neighbors}) + return nearest_neighbor_mapping \ No newline at end of file diff --git a/requirements-lock.txt b/requirements-lock.txt index e5f9c09..cf79215 100644 --- a/requirements-lock.txt +++ b/requirements-lock.txt @@ -12,15 +12,22 @@ httptools==0.2.0 httpx==0.24.1 idna==3.4 numpy==1.25.1 +pandas==2.0.3 +Pillow==10.0.0 pydantic==1.10.9 +python-dateutil==2.8.2 python-dotenv==1.0.0 +pytz==2023.3 PyYAML==6.0 +rdkit==2023.3.2 reasoner-pydantic==4.0.8 redis==4.6.0 +six==1.16.0 sniffio==1.3.0 starlette==0.17.1 tqdm==4.65.0 typing_extensions==4.6.3 +tzdata==2023.3 uvicorn==0.13.3 uvloop==0.17.0 watchgod==0.8.2 diff --git a/requirements.txt b/requirements.txt index dbceaa9..03e7735 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,8 @@ fastapi==0.75.0 gunicorn==20.1.0 httpx==0.24.1 numpy==1.25.1 +pandas==2.0.3 +rdkit==2023.3.2 reasoner-pydantic==4.0.8 redis==4.6.0 tqdm==4.65.0 From fbffb73c7409f66b81126372a000dfe8d491ee61 Mon Sep 17 00:00:00 2001 From: Max Wang Date: Thu, 3 Aug 2023 14:12:52 -0400 Subject: [PATCH 2/5] Actually call novelty code --- app/ordering_components.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/app/ordering_components.py b/app/ordering_components.py index 5fc209a..9022df5 100644 --- a/app/ordering_components.py +++ b/app/ordering_components.py @@ -6,6 +6,8 @@ from .config import settings from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence +from .novelty.compute_novelty import compute_novelty + redis_pool = redis.ConnectionPool( host=settings.redis_host, port=settings.redis_port, @@ -42,15 +44,16 @@ def get_clinical_evidence(result, message, logger, db_conn): return compute_clinical_evidence(result, message, logger, db_conn) -def get_novelty(result, message, logger): - # TODO get novelty from novelty package - return 0 +def get_novelty(message, logger): + return compute_novelty(message, logger) def get_ordering_components(message, logger): logger.debug(f"Computing scores for {len(message['results'])} results") db_conn = redis.Redis(connection_pool=redis_pool) - for result_index, result in enumerate(tqdm(message.get("results") or [])): + novelty_scores_dict = get_novelty(message, logger).to_dict(orient="index") + novelty_scores = {node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()} + for result in tqdm(message.get("results") or []): clinical_evidence_score = get_clinical_evidence( result, message, @@ -62,8 +65,9 @@ def get_ordering_components(message, logger): "clinical_evidence": clinical_evidence_score, "novelty": 0, } - if result["ordering_components"]["clinical_evidence"] == 0: + if clinical_evidence_score == 0: # Only compute novelty if there is no clinical evidence - result["ordering_components"]["novelty"] = get_novelty( - result, message, logger - ) + for node_bindings in result.get("node_bindings", {}).values(): + for node_binding in node_bindings: + if node_binding["id"] in novelty_scores: + result["ordering_components"]["novelty"] = novelty_scores[node_binding["id"]] From 14c4414d364e4591982b8a4c293f5885cfe698bf Mon Sep 17 00:00:00 2001 From: Max Wang Date: Thu, 3 Aug 2023 14:13:05 -0400 Subject: [PATCH 3/5] Bump minor version --- app/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/server.py b/app/server.py index def83bf..b94790f 100644 --- a/app/server.py +++ b/app/server.py @@ -21,7 +21,7 @@ openapi_args = dict( title="Answer Appraiser", - version="0.2.3", + version="0.3.0", terms_of_service="", translator_component="Utility", translator_teams=["Standards Reference Implementation Team"], From aeda5611e9b183c0234cd82f843108edf5e770e4 Mon Sep 17 00:00:00 2001 From: Max Wang Date: Thu, 3 Aug 2023 14:25:30 -0400 Subject: [PATCH 4/5] Black --- app/novelty/compute_novelty.py | 320 ++++++++++++++++--------- app/novelty/extr_smile_molpro_by_id.py | 55 ++--- app/novelty/mol_similarity.py | 31 ++- app/ordering_components.py | 8 +- 4 files changed, 263 insertions(+), 151 deletions(-) diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py index 2b53729..258a759 100644 --- a/app/novelty/compute_novelty.py +++ b/app/novelty/compute_novelty.py @@ -4,7 +4,7 @@ import numpy as np from .known import find_known_results -from .extr_smile_molpro_by_id import mol_to_smile_molpro +from .extr_smile_molpro_by_id import mol_to_smile_molpro from .mol_similarity import find_nearest_neighbors """ @@ -18,27 +18,45 @@ The end result of this script displays a table with values from different columns and accordingly lists the novelty score as well. """ + def molecular_sim(known, unknown, message): unknown_ids = [] known_ids = [] if len(unknown) > 0: for drug in unknown: - s = list(message['results'][drug]['node_bindings'].keys()) - edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id'] - if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn: + s = list(message["results"][drug]["node_bindings"].keys()) + edge_attribute_sn = message["results"][drug]["node_bindings"][s[0]][0]["id"] + if ( + "PUBCHEM" in edge_attribute_sn + or "CHEMBL" in edge_attribute_sn + or "UNII" in edge_attribute_sn + or "RXNORM" in edge_attribute_sn + or "UMLS" in edge_attribute_sn + or not "MONDO" in edge_attribute_sn + ): unknown_ids.append(edge_attribute_sn) else: unknown_ids.append( - message['results'][drug]['node_bindings'][s[1]][0]['id']) + message["results"][drug]["node_bindings"][s[1]][0]["id"] + ) if len(known) > 0: for drug in known: - s = list(message['results'][drug]['node_bindings'].keys()) - edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id'] - if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn: + s = list(message["results"][drug]["node_bindings"].keys()) + edge_attribute_sn = message["results"][drug]["node_bindings"][s[0]][0]["id"] + if ( + "PUBCHEM" in edge_attribute_sn + or "CHEMBL" in edge_attribute_sn + or "UMLS" in edge_attribute_sn + or "UNII" in edge_attribute_sn + or "RXNORM" in edge_attribute_sn + or not "MONDO" in edge_attribute_sn + ): known_ids.append(edge_attribute_sn) else: - known_ids.append(message['results'][drug]['node_bindings'][s[1]][0]['id']) + known_ids.append( + message["results"][drug]["node_bindings"][s[1]][0]["id"] + ) smile_unkown = mol_to_smile_molpro(unknown_ids) smile_known = mol_to_smile_molpro(known_ids) @@ -53,7 +71,7 @@ def get_publication_info(pub_id): Returns: The publication info """ base_url = "https://3md2qwxrrk.us-east-1.awsapprunner.com/publications?pubids=" - request_id = '1df88223-c0f8-47f5-a1f3-661b944c7849' + request_id = "1df88223-c0f8-47f5-a1f3-661b944c7849" full_url = f"{base_url}{pub_id}&request_id={request_id}" try: response = requests.get(full_url) @@ -71,19 +89,24 @@ def get_publication_info(pub_id): def sigmoid(x): return 1 / (1 + np.exp(x)) + def query_id(message): - for i in message['query_graph']['nodes']: - if 'ids' in message['query_graph']['nodes'][i].keys(): - if message['query_graph']['nodes'][i]['ids']: - known_node = message['query_graph']['nodes'][i]['categories'][0] + for i in message["query_graph"]["nodes"]: + if "ids" in message["query_graph"]["nodes"][i].keys(): + if message["query_graph"]["nodes"][i]["ids"]: + known_node = message["query_graph"]["nodes"][i]["categories"][0] else: - unknown_node = message['query_graph']['nodes'][i]['categories'][0] + unknown_node = message["query_graph"]["nodes"][i]["categories"][0] else: - unknown_node = message['query_graph']['nodes'][i]['categories'][0] - if unknown_node in ['biolink:ChemicalEntity', 'biolink:SmallMolecule', 'biolink:Drug']: - chk=1 + unknown_node = message["query_graph"]["nodes"][i]["categories"][0] + if unknown_node in [ + "biolink:ChemicalEntity", + "biolink:SmallMolecule", + "biolink:Drug", + ]: + chk = 1 else: - chk=0 + chk = 0 return known_node, unknown_node, chk @@ -91,13 +114,13 @@ def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age """ Calculates the recency based on number of publications accociated to each drug and age of the oldest publication - + Args: number_of_publ (float): The current number of publications. age_of_oldest_publ (float): The current age of the oldest publication. max_number (float): The maximum number of publication: e.g. consider 100 for all drugs. max_age (float): The publication with the recent 50 years have been considered. - + Returns: float: The recency value of z. """ @@ -112,7 +135,7 @@ def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age recency = alter_number_of_publ else: recency = alter_number_of_publ * alter_age_of_oldest_publ - + return recency @@ -133,39 +156,57 @@ def extracting_drug_fda_publ_date(message, unknown): and the oldest publication date (year) linked to each drug." """ - attribute_type_id_list_fda = ['biolink:FDA_approval_status', 'biolink:FDA_APPROVAL_STATUS'] - attribute_type_id_list_pub = ['biolink:publications', 'biolink:Publication', 'biolink:publication'] + attribute_type_id_list_fda = [ + "biolink:FDA_approval_status", + "biolink:FDA_APPROVAL_STATUS", + ] + attribute_type_id_list_pub = [ + "biolink:publications", + "biolink:Publication", + "biolink:publication", + ] drug_idx_fda_status = [] today = date.today() res_chk = 1 # for edge in message['knowledge_graph']['edges'].keys(): query_known, query_unknown, query_chk = query_id(message) - idi=-1 + idi = -1 for tmp in unknown: - tmp_res = message['results'][tmp]['analyses'][0]['edge_bindings'] + tmp_res = message["results"][tmp]["analyses"][0]["edge_bindings"] for tmp_1 in tmp_res: - idi+=1 - edge = message['results'][tmp]['analyses'][0]['edge_bindings'][tmp_1][0]['id'] - # edge_list = list(message['knowledge_graph']['edges'].keys()) - # for idx, idi in enumerate(edge_list): - # if idx % 20 == 0: - # print(f'progressing {idx}') - # edge = edge_list[idx] - edge_attribute = message['knowledge_graph']['edges'][edge] + idi += 1 + edge = message["results"][tmp]["analyses"][0]["edge_bindings"][tmp_1][0][ + "id" + ] + # edge_list = list(message['knowledge_graph']['edges'].keys()) + # for idx, idi in enumerate(edge_list): + # if idx % 20 == 0: + # print(f'progressing {idx}') + # edge = edge_list[idx] + edge_attribute = message["knowledge_graph"]["edges"][edge] # if set(['subject', 'object']).issubset(edge_attribute.keys()): - if query_chk==1: - if 'PUBCHEM' in edge_attribute['subject'] or 'CHEMBL' in edge_attribute['subject'] or 'UNII' in edge_attribute['subject'] or 'RXNORM' in edge_attribute['subject'] or 'UMLS' in edge_attribute['subject'] or not 'MONDO' in edge_attribute['subject']: - drug_idx = edge_attribute['subject'] + if query_chk == 1: + if ( + "PUBCHEM" in edge_attribute["subject"] + or "CHEMBL" in edge_attribute["subject"] + or "UNII" in edge_attribute["subject"] + or "RXNORM" in edge_attribute["subject"] + or "UMLS" in edge_attribute["subject"] + or not "MONDO" in edge_attribute["subject"] + ): + drug_idx = edge_attribute["subject"] else: - drug_idx = edge_attribute['object'] - if set(['attributes']).issubset(edge_attribute.keys()): - if len(edge_attribute['attributes']) > 0: + drug_idx = edge_attribute["object"] + if set(["attributes"]).issubset(edge_attribute.keys()): + if len(edge_attribute["attributes"]) > 0: att_type_id = {} fda = [] pub = [] - for i in range(len(edge_attribute['attributes'])): - att_type_id[i] = edge_attribute['attributes'][i]['attribute_type_id'] + for i in range(len(edge_attribute["attributes"])): + att_type_id[i] = edge_attribute["attributes"][i][ + "attribute_type_id" + ] for key in att_type_id.keys(): if att_type_id[key] in attribute_type_id_list_fda: @@ -174,7 +215,10 @@ def extracting_drug_fda_publ_date(message, unknown): pub.append(key) if len(fda) > 0: - if edge_attribute['attributes'][fda[0]]['value'] == 'FDA Approval': + if ( + edge_attribute["attributes"][fda[0]]["value"] + == "FDA Approval" + ): fda_status = 0.0 else: fda_status = 1.0 @@ -183,30 +227,40 @@ def extracting_drug_fda_publ_date(message, unknown): # Publication if len(pub) > 0: - publications = edge_attribute['attributes'][pub[0]]['value'] - if '|' in publications: - publications = publications.split('|') - if type(publications) == 'str': + publications = edge_attribute["attributes"][pub[0]]["value"] + if "|" in publications: + publications = publications.split("|") + if type(publications) == "str": publications = [publications] # Removal of all publication entries that are links publications = [x for x in publications if "http" not in x] # Removal of all publication entries that are Clinical Trials - publications = [x for x in publications if "clinicaltrials" not in x] + publications = [ + x for x in publications if "clinicaltrials" not in x + ] number_of_publ = len(publications) - if len(publications)>0: + if len(publications) > 0: # print(publications) - publications_1 = ','.join(publications) + publications_1 = ",".join(publications) try: response_pub = get_publication_info(publications_1) - if response_pub['_meta']['n_results']==0: + if response_pub["_meta"]["n_results"] == 0: age_oldest = np.nan else: publ_year = [] - for key in response_pub['results'].keys(): - if 'not_found' not in key: - publ_year.extend([int(response_pub['results'][key]['pub_year'])]) + for key in response_pub["results"].keys(): + if "not_found" not in key: + publ_year.extend( + [ + int( + response_pub["results"][ + key + ]["pub_year"] + ) + ] + ) age_oldest = today.year - min(publ_year) except ConnectionError as e: age_oldest = np.nan @@ -214,101 +268,131 @@ def extracting_drug_fda_publ_date(message, unknown): publications = None number_of_publ = 0.0 age_oldest = np.nan - drug_idx_fda_status.append((idi, drug_idx, fda_status, publications, number_of_publ, age_oldest)) + drug_idx_fda_status.append( + ( + idi, + drug_idx, + fda_status, + publications, + number_of_publ, + age_oldest, + ) + ) else: - if query_unknown in ['biolink:Gene', 'biolink:Protein']: - if 'NCBI' in edge_attribute['subject'] or 'GO' in edge_attribute['subject']: - gene_idx = edge_attribute['subject'] + if query_unknown in ["biolink:Gene", "biolink:Protein"]: + if ( + "NCBI" in edge_attribute["subject"] + or "GO" in edge_attribute["subject"] + ): + gene_idx = edge_attribute["subject"] else: - gene_idx = edge_attribute['object'] + gene_idx = edge_attribute["object"] drug_idx_fda_status.append((idi, gene_idx)) - elif query_unknown in ['biolink:Disease', 'biolink:Phenotype']: - if 'MONDO' in edge_attribute['subject']: - dis_idx = edge_attribute['subject'] + elif query_unknown in ["biolink:Disease", "biolink:Phenotype"]: + if "MONDO" in edge_attribute["subject"]: + dis_idx = edge_attribute["subject"] else: - dis_idx = edge_attribute['object'] + dis_idx = edge_attribute["object"] drug_idx_fda_status.append((idi, dis_idx)) - if query_chk==1 and res_chk==1: - DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub']) - elif query_chk!=1 and res_chk==1: - DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'result']) + if query_chk == 1 and res_chk == 1: + DF = pd.DataFrame( + drug_idx_fda_status, + columns=[ + "edge", + "drug", + "fda status", + "publications", + "number_of_publ", + "age_oldest_pub", + ], + ) + elif query_chk != 1 and res_chk == 1: + DF = pd.DataFrame(drug_idx_fda_status, columns=["edge", "result"]) else: DF = pd.DataFrame() return DF, query_chk + def extract_results(message, unknown, known): results = [] results_known = [] kid, ukid = 0, 0 - for idi, i in enumerate(message['results']): + for idi, i in enumerate(message["results"]): if idi in unknown: results.append([]) - for idj, j in enumerate(i['analyses']): - for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]): - results[ukid].append(k['id']) - ukid+=1 + for idj, j in enumerate(i["analyses"]): + for idk, k in enumerate( + j["edge_bindings"][list(j["edge_bindings"].keys())[0]] + ): + results[ukid].append(k["id"]) + ukid += 1 elif idi in known: results_known.append([]) - for idj, j in enumerate(i['analyses']): - for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]): - results_known[kid].append(k['id']) - kid+=1 + for idj, j in enumerate(i["analyses"]): + for idk, k in enumerate( + j["edge_bindings"][list(j["edge_bindings"].keys())[0]] + ): + results_known[kid].append(k["id"]) + kid += 1 return results, results_known + def result_edge_correlation(results, results_known, df): df_res = pd.DataFrame() res_known = set() res_unknown = set() for idi, i in enumerate(results): for j in i: - df_res = pd.concat([df_res, df[df['edge']==j]]) - res_unknown.add(df.loc[df['edge']==j, 'drug'].iloc[0]) + df_res = pd.concat([df_res, df[df["edge"] == j]]) + res_unknown.add(df.loc[df["edge"] == j, "drug"].iloc[0]) for idi, i in enumerate(results_known): for j in i: - res_known.add(df.loc[df['edge']==j, 'drug'].iloc[0]) + res_known.add(df.loc[df["edge"] == j, "drug"].iloc[0]) return df_res, res_unknown, res_known + def novelty_score(fda_status, recency, similarity): """ Calculate the novelty score for each drug entity based on FDA status, recency and similarity of the drug. - - FDA status 0 | 1 + + FDA status 0 | 1 --> 0 to be FDA approved - 0 < recency < 1 - --> 1 to have a high recency where the publications are so new and number of publications is too few. + 0 < recency < 1 + --> 1 to have a high recency where the publications are so new and number of publications is too few. 0 < similarity < 1 --> 0 to have a very low molecular structure similarity, so it is novel. - + Args: float: fda_status float: recency float: similarity - + Returns: float: novelty_score - + """ if not np.isnan(recency): score = recency if not np.isnan(similarity): similarity = 1 - similarity if similarity > 0.5: - score = score*(0.73+similarity) - if score>1: - score=1 + score = score * (0.73 + similarity) + if score > 1: + score = 1 if fda_status == 0: - score=score*0.85 + score = score * 0.85 else: - if np.isnan(similarity): - score=0 + if np.isnan(similarity): + score = 0 else: - score=(1-similarity) + score = 1 - similarity return score + def compute_novelty(message, logger): - """ INPUT: JSON Response with merged annotated results for a 1-H query + """INPUT: JSON Response with merged annotated results for a 1-H query 1. load the json file 2. Give the json to extracting_drug_fda_publ_date(response) function to extract the EPC @@ -325,51 +409,71 @@ def compute_novelty(message, logger): similarity_map = molecular_sim(known, unknown, message) df, query_chk = extracting_drug_fda_publ_date(message, unknown) -# # print(df.head()) -# # print(query_chk) -# + # # print(df.head()) + # # print(query_chk) + # # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub']) # query_chk = 1 # # # res, res_known = extract_results(mergedAnnotatedOutput, unknown, known) -# # print(len(res_known)) -# # print(len(res)) -# # # # -# df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df) + # # print(len(res_known)) + # # print(len(res)) + # # # # + # df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df) # print(len(res_unknown)) # print(len(res_known)) # df = df_res # print(df.head()) # print(similarity_map) - if query_chk==1: + if query_chk == 1: # Step 3: # calculating the recency - df['recency'] = df.apply(lambda row: recency_function_exp(row['number_of_publ'], row['age_oldest_pub'], 100, 50) if not (np.isnan(row['number_of_publ']) or np.isnan(row['age_oldest_pub'])) else np.nan, axis=1) + df["recency"] = df.apply( + lambda row: recency_function_exp( + row["number_of_publ"], row["age_oldest_pub"], 100, 50 + ) + if not (np.isnan(row["number_of_publ"]) or np.isnan(row["age_oldest_pub"])) + else np.nan, + axis=1, + ) # # # Step 4: # # This section will be added later. Currently just putting 'NaN': # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1) - df['similarity'] = df.apply(lambda row: similarity_map[row['drug']][0][1] if row['drug'] in similarity_map.keys() else np.nan, axis=1) + df["similarity"] = df.apply( + lambda row: similarity_map[row["drug"]][0][1] + if row["drug"] in similarity_map.keys() + else np.nan, + axis=1, + ) # df = df.assign(similarity=np.nan) # # Step 5: # # Calculating the novelty score: - df['novelty_score'] = df.apply(lambda row: novelty_score(row['fda status'], row['recency'], row['similarity']), axis=1) + df["novelty_score"] = df.apply( + lambda row: novelty_score( + row["fda status"], row["recency"], row["similarity"] + ), + axis=1, + ) # # # Step 6 # # # Just sort them: - df = df[['drug', 'novelty_score']].sort_values(by= 'novelty_score', ascending= False) + df = df[["drug", "novelty_score"]].sort_values( + by="novelty_score", ascending=False + ) else: df = df.assign(novelty_score=0) return df -#for i in list(range(1, 5)): + +# for i in list(range(1, 5)): # start = time.time() # temp = compute_novelty('mergedAnnotatedOutput.json') # if temp.empty: # print(f"No Results in mergedAnnotatedOutput.json") # else: # temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values') -# print(time.time()-start) \ No newline at end of file +# print(time.time()-start) diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py index 98df59d..504e30e 100644 --- a/app/novelty/extr_smile_molpro_by_id.py +++ b/app/novelty/extr_smile_molpro_by_id.py @@ -4,21 +4,18 @@ def mol_to_smile_molpro(molecules): """ - + Returns: Dict - + Args: - List - Example: Attached sample_mols.json file + List + Example: Attached sample_mols.json file """ - + url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id" - headers = { - "accept": "application/json", - "Content-Type": "application/json" - } - + headers = {"accept": "application/json", "Content-Type": "application/json"} + data_mol = list(set(molecules)) smiles = {} @@ -26,34 +23,32 @@ def mol_to_smile_molpro(molecules): if response.status_code == 200: json_response = response.json() - #print(json_response) - collec_url = json_response['url'] + # print(json_response) + collec_url = json_response["url"] temp_collec_response = requests.get(collec_url) if response.status_code == 200: collec_response = temp_collec_response.json() - #print('\n') - #print(collec_response['elements'][0]['identifiers']) - if json_response['size'] > 0: - - for i in range(json_response['size']): - key_list = ['identifiers'] - if set(key_list).issubset(collec_response['elements'][i].keys()): - #if 'smiles' in collec_response['elements'][i]['identifiers'].keys(): - identifiers = collec_response['elements'][i]['identifiers'] - smile = identifiers.get('smiles', 'No SMILES could be found') + # print('\n') + # print(collec_response['elements'][0]['identifiers']) + if json_response["size"] > 0: + for i in range(json_response["size"]): + key_list = ["identifiers"] + if set(key_list).issubset(collec_response["elements"][i].keys()): + # if 'smiles' in collec_response['elements'][i]['identifiers'].keys(): + identifiers = collec_response["elements"][i]["identifiers"] + smile = identifiers.get("smiles", "No SMILES could be found") smiles[data_mol[i]] = smile else: - smiles[data_mol[i]] = 'No identifiers could be found' - - #Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol) - #for which the retrieval was not successful in the first attempt + smiles[data_mol[i]] = "No identifiers could be found" + + # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol) + # for which the retrieval was not successful in the first attempt if len(list(smiles.keys())) < len(data_mol): diff = [mol for mol in data_mol if mol not in list(smiles.keys())] diff_smiles = mol_to_smile_molpro(diff) if len(diff_smiles) > 0: - smiles.update(diff_smiles) + smiles.update(diff_smiles) else: print(f"Error: {response.status_code} - {response.text}") - - - return smiles \ No newline at end of file + + return smiles diff --git a/app/novelty/mol_similarity.py b/app/novelty/mol_similarity.py index 70c2f60..d452b10 100644 --- a/app/novelty/mol_similarity.py +++ b/app/novelty/mol_similarity.py @@ -5,10 +5,11 @@ from rdkit.Chem import AllChem - -def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors): +def find_nearest_neighbors( + unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors +): """ - + Returns: Dict @@ -19,19 +20,27 @@ def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cu num_neighbors: int: 1 """ - unknown_smiles = {key:value for key,value in unknown_smiles_dict.items() if value != "No SMILES could be found"} - known_smiles = {key:value for key,value in known_smiles_dict.items() if value != "No SMILES could be found"} + unknown_smiles = { + key: value + for key, value in unknown_smiles_dict.items() + if value != "No SMILES could be found" + } + known_smiles = { + key: value + for key, value in known_smiles_dict.items() + if value != "No SMILES could be found" + } # Convert input SMILES to a molecule known_mols = {} - for key,value in known_smiles.items(): + for key, value in known_smiles.items(): known_mol = Chem.MolFromSmiles(value) if known_mol is None: - raise ValueError("Invalid SMILES string for",key) + raise ValueError("Invalid SMILES string for", key) else: - known_mols.update({key:known_mol}) + known_mols.update({key: known_mol}) nearest_neighbor_mapping = {} - for unknownkey,value in unknown_smiles.items(): + for unknownkey, value in unknown_smiles.items(): query_mol = Chem.MolFromSmiles(value) if query_mol is None: raise ValueError("Invalid SMILES string") @@ -55,5 +64,5 @@ def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cu index, similarity = similarities[i] if similarity >= similarity_cutoff: neighbors.append((index, similarity)) - nearest_neighbor_mapping.update({unknownkey:neighbors}) - return nearest_neighbor_mapping \ No newline at end of file + nearest_neighbor_mapping.update({unknownkey: neighbors}) + return nearest_neighbor_mapping diff --git a/app/ordering_components.py b/app/ordering_components.py index 9022df5..32a8e4d 100644 --- a/app/ordering_components.py +++ b/app/ordering_components.py @@ -52,7 +52,9 @@ def get_ordering_components(message, logger): logger.debug(f"Computing scores for {len(message['results'])} results") db_conn = redis.Redis(connection_pool=redis_pool) novelty_scores_dict = get_novelty(message, logger).to_dict(orient="index") - novelty_scores = {node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()} + novelty_scores = { + node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values() + } for result in tqdm(message.get("results") or []): clinical_evidence_score = get_clinical_evidence( result, @@ -70,4 +72,6 @@ def get_ordering_components(message, logger): for node_bindings in result.get("node_bindings", {}).values(): for node_binding in node_bindings: if node_binding["id"] in novelty_scores: - result["ordering_components"]["novelty"] = novelty_scores[node_binding["id"]] + result["ordering_components"]["novelty"] = novelty_scores[ + node_binding["id"] + ] From 44e9599cb714f028dc4e1eccbe77e39bc8601b46 Mon Sep 17 00:00:00 2001 From: Max Wang Date: Fri, 4 Aug 2023 15:33:04 -0400 Subject: [PATCH 5/5] Add updated novelty code --- app/novelty/compute_novelty.py | 42 ++++++++++--------- app/novelty/extr_smile_molpro_by_id.py | 57 ++++++++++++++------------ app/ordering_components.py | 3 +- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py index 258a759..806919b 100644 --- a/app/novelty/compute_novelty.py +++ b/app/novelty/compute_novelty.py @@ -2,6 +2,7 @@ from datetime import date import requests import numpy as np +import traceback from .known import find_known_results from .extr_smile_molpro_by_id import mol_to_smile_molpro @@ -78,7 +79,7 @@ def get_publication_info(pub_id): response.raise_for_status() response = response.json() except Exception: - return { + response = { "_meta": { "n_results": 0, }, @@ -169,7 +170,6 @@ def extracting_drug_fda_publ_date(message, unknown): today = date.today() res_chk = 1 - # for edge in message['knowledge_graph']['edges'].keys(): query_known, query_unknown, query_chk = query_id(message) idi = -1 for tmp in unknown: @@ -406,12 +406,14 @@ def compute_novelty(message, logger): known, unknown = find_known_results(message) # # # Step 2 - similarity_map = molecular_sim(known, unknown, message) + # start = time.time() df, query_chk = extracting_drug_fda_publ_date(message, unknown) + # print(f"Time to extract fda status and Publication data:{time.time()-start}") # # print(df.head()) # # print(query_chk) # + # df.to_excel(f'DATAFRAME.xlsx', header=False, index=False) # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub']) # query_chk = 1 @@ -427,6 +429,20 @@ def compute_novelty(message, logger): # print(df.head()) # print(similarity_map) if query_chk == 1: + # start = time.time() + try: + similarity_map = molecular_sim(known, unknown, message) + df["similarity"] = df.apply( + lambda row: similarity_map[row["drug"]][0][1] + if row["drug"] in similarity_map.keys() + else np.nan, + axis=1, + ) + except Exception as e: + logger.error(traceback.format_exc()) + df = df.assign(similarity=np.nan) + + # print(f"Time to compute Molecular Similarity:{time.time() - start}") # Step 3: # calculating the recency df["recency"] = df.apply( @@ -439,15 +455,9 @@ def compute_novelty(message, logger): ) # # # Step 4: - # # This section will be added later. Currently just putting 'NaN': + # # Calculating the Similarity: # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1) - df["similarity"] = df.apply( - lambda row: similarity_map[row["drug"]][0][1] - if row["drug"] in similarity_map.keys() - else np.nan, - axis=1, - ) # df = df.assign(similarity=np.nan) # # Step 5: @@ -458,6 +468,7 @@ def compute_novelty(message, logger): ), axis=1, ) + # df.to_excel(f'DATAFRAME_result.xlsx', header=False, index=False) # # # Step 6 # # # Just sort them: @@ -466,14 +477,5 @@ def compute_novelty(message, logger): ) else: df = df.assign(novelty_score=0) + # df.to_excel(f'DATAFRAME_NOVELTY.xlsx', header=False, index=False) return df - - -# for i in list(range(1, 5)): -# start = time.time() -# temp = compute_novelty('mergedAnnotatedOutput.json') -# if temp.empty: -# print(f"No Results in mergedAnnotatedOutput.json") -# else: -# temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values') -# print(time.time()-start) diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py index 504e30e..e15a707 100644 --- a/app/novelty/extr_smile_molpro_by_id.py +++ b/app/novelty/extr_smile_molpro_by_id.py @@ -1,54 +1,59 @@ -#!/usr/bin/env python import requests def mol_to_smile_molpro(molecules): """ + Args: + List Returns: Dict - Args: - List - Example: Attached sample_mols.json file """ - url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id" + url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id" headers = {"accept": "application/json", "Content-Type": "application/json"} - data_mol = list(set(molecules)) smiles = {} - response = requests.post(url, headers=headers, json=data_mol) + data_mol = list(set(molecules)) + # print(f'init data: {len(data_mol)}') + while data_mol: + # print(f'before: {len(data_mol)}') + data_mol_before = len(data_mol) + response = requests.post(url, headers=headers, json=data_mol) - if response.status_code == 200: - json_response = response.json() - # print(json_response) - collec_url = json_response["url"] - temp_collec_response = requests.get(collec_url) if response.status_code == 200: - collec_response = temp_collec_response.json() - # print('\n') - # print(collec_response['elements'][0]['identifiers']) - if json_response["size"] > 0: + json_response = response.json() + collec_url = json_response["url"] + temp_collec_response = requests.get(collec_url) + if temp_collec_response.status_code == 200: + collec_response = temp_collec_response.json() + for i in range(json_response["size"]): key_list = ["identifiers"] if set(key_list).issubset(collec_response["elements"][i].keys()): - # if 'smiles' in collec_response['elements'][i]['identifiers'].keys(): identifiers = collec_response["elements"][i]["identifiers"] smile = identifiers.get("smiles", "No SMILES could be found") smiles[data_mol[i]] = smile else: smiles[data_mol[i]] = "No identifiers could be found" - # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol) - # for which the retrieval was not successful in the first attempt - if len(list(smiles.keys())) < len(data_mol): - diff = [mol for mol in data_mol if mol not in list(smiles.keys())] - diff_smiles = mol_to_smile_molpro(diff) - if len(diff_smiles) > 0: - smiles.update(diff_smiles) - else: - print(f"Error: {response.status_code} - {response.text}") + # Remove molecules with successfully retrieved smiles from data_mol + data_mol = [mol for mol in data_mol if mol not in smiles] + data_mol_after = len(data_mol) + # print(f'after: {len(data_mol)}') + + if data_mol_after == data_mol_before: + break + + else: + print( + f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}" + ) + break + else: + print(f"Error: {response.status_code} - {response.text}") + break return smiles diff --git a/app/ordering_components.py b/app/ordering_components.py index 32a8e4d..979f234 100644 --- a/app/ordering_components.py +++ b/app/ordering_components.py @@ -1,13 +1,12 @@ """Compute scores for each result in the given message.""" -import os import redis from tqdm import tqdm from .config import settings from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence - from .novelty.compute_novelty import compute_novelty + redis_pool = redis.ConnectionPool( host=settings.redis_host, port=settings.redis_port,