Add updated novelty code

TranslatorSRI · Aug 4, 2023 · 44e9599 · 44e9599
1 parent aeda561
commit 44e9599
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 48 deletions.
diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py
@@ -2,6 +2,7 @@
 from datetime import date
 import requests
 import numpy as np
+import traceback
 
 from .known import find_known_results
 from .extr_smile_molpro_by_id import mol_to_smile_molpro
@@ -78,7 +79,7 @@ def get_publication_info(pub_id):
         response.raise_for_status()
         response = response.json()
     except Exception:
-        return {
+        response = {
             "_meta": {
                 "n_results": 0,
             },
@@ -169,7 +170,6 @@ def extracting_drug_fda_publ_date(message, unknown):
     today = date.today()
 
     res_chk = 1
-    # for edge in message['knowledge_graph']['edges'].keys():
     query_known, query_unknown, query_chk = query_id(message)
     idi = -1
     for tmp in unknown:
@@ -406,12 +406,14 @@ def compute_novelty(message, logger):
     known, unknown = find_known_results(message)
     #
     # # Step 2
-    similarity_map = molecular_sim(known, unknown, message)
 
+    # start = time.time()
     df, query_chk = extracting_drug_fda_publ_date(message, unknown)
+    # print(f"Time to extract fda status and Publication data:{time.time()-start}")
     #         # print(df.head())
     #         # print(query_chk)
     #
+    # df.to_excel(f'DATAFRAME.xlsx', header=False, index=False)
     # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
     # query_chk = 1
 
@@ -427,6 +429,20 @@ def compute_novelty(message, logger):
     # print(df.head())
     # print(similarity_map)
     if query_chk == 1:
+        # start = time.time()
+        try:
+            similarity_map = molecular_sim(known, unknown, message)
+            df["similarity"] = df.apply(
+                lambda row: similarity_map[row["drug"]][0][1]
+                if row["drug"] in similarity_map.keys()
+                else np.nan,
+                axis=1,
+            )
+        except Exception as e:
+            logger.error(traceback.format_exc())
+            df = df.assign(similarity=np.nan)
+
+        # print(f"Time to compute Molecular Similarity:{time.time() - start}")
         # Step 3:
         # calculating the recency
         df["recency"] = df.apply(
@@ -439,15 +455,9 @@ def compute_novelty(message, logger):
         )
         #
         # # Step 4:
-        # # This section will be added later. Currently just putting 'NaN':
+        # # Similarity is already computed above (before Step 3); nothing is calculated here:
         # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)
 
-        df["similarity"] = df.apply(
-            lambda row: similarity_map[row["drug"]][0][1]
-            if row["drug"] in similarity_map.keys()
-            else np.nan,
-            axis=1,
-        )
         # df = df.assign(similarity=np.nan)
 
         # # Step 5:
@@ -458,6 +468,7 @@ def compute_novelty(message, logger):
             ),
             axis=1,
         )
+        # df.to_excel(f'DATAFRAME_result.xlsx', header=False, index=False)
 
         # # # Step 6
         # # # Just sort them:
@@ -466,14 +477,5 @@ def compute_novelty(message, logger):
         )
     else:
         df = df.assign(novelty_score=0)
+    # df.to_excel(f'DATAFRAME_NOVELTY.xlsx', header=False, index=False)
     return df
-
-
-# for i in list(range(1, 5)):
-# start = time.time()
-# temp = compute_novelty('mergedAnnotatedOutput.json')
-# if temp.empty:
-#     print(f"No Results in mergedAnnotatedOutput.json")
-# else:
-#     temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
-# print(time.time()-start)
diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py
@@ -1,54 +1,59 @@
-#!/usr/bin/env python
 import requests
 
 
 def mol_to_smile_molpro(molecules):
     """
+    Args:
+        molecules (list): compound identifiers; de-duplicated and POSTed to the MolePro by_id endpoint
 
     Returns:
         Dict
 
-    Args:
-        List
-        Example: Attached sample_mols.json file
     """
 
-    url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id"
+    url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id"
     headers = {"accept": "application/json", "Content-Type": "application/json"}
 
-    data_mol = list(set(molecules))
     smiles = {}
 
-    response = requests.post(url, headers=headers, json=data_mol)
+    data_mol = list(set(molecules))
+    # print(f'init data: {len(data_mol)}')
+    while data_mol:
+        # print(f'before: {len(data_mol)}')
+        data_mol_before = len(data_mol)
+        response = requests.post(url, headers=headers, json=data_mol)
 
-    if response.status_code == 200:
-        json_response = response.json()
-        # print(json_response)
-        collec_url = json_response["url"]
-        temp_collec_response = requests.get(collec_url)
         if response.status_code == 200:
-            collec_response = temp_collec_response.json()
-            # print('\n')
-            # print(collec_response['elements'][0]['identifiers'])
-            if json_response["size"] > 0:
+            json_response = response.json()
+            collec_url = json_response["url"]
+            temp_collec_response = requests.get(collec_url)
+            if temp_collec_response.status_code == 200:
+                collec_response = temp_collec_response.json()
+
                 for i in range(json_response["size"]):
                     key_list = ["identifiers"]
                     if set(key_list).issubset(collec_response["elements"][i].keys()):
-                        # if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
                         identifiers = collec_response["elements"][i]["identifiers"]
                         smile = identifiers.get("smiles", "No SMILES could be found")
                         smiles[data_mol[i]] = smile
                     else:
                         smiles[data_mol[i]] = "No identifiers could be found"
 
-            # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
-            # for which the retrieval was not successful in the first attempt
-            if len(list(smiles.keys())) < len(data_mol):
-                diff = [mol for mol in data_mol if mol not in list(smiles.keys())]
-                diff_smiles = mol_to_smile_molpro(diff)
-                if len(diff_smiles) > 0:
-                    smiles.update(diff_smiles)
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
+                # Drop every molecule that now has an entry in smiles (placeholder strings included) so only unanswered ids are retried
+                data_mol = [mol for mol in data_mol if mol not in smiles]
+                data_mol_after = len(data_mol)
+                # print(f'after: {len(data_mol)}')
+
+                if data_mol_after == data_mol_before:
+                    break
+
+            else:
+                print(
+                    f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}"
+                )
+                break
+        else:
+            print(f"Error: {response.status_code} - {response.text}")
+            break
 
     return smiles
diff --git a/app/ordering_components.py b/app/ordering_components.py
@@ -1,13 +1,12 @@
 """Compute scores for each result in the given message."""
-import os
 import redis
 from tqdm import tqdm
 
 from .config import settings
 from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence
-
 from .novelty.compute_novelty import compute_novelty
 
+
 redis_pool = redis.ConnectionPool(
     host=settings.redis_host,
     port=settings.redis_port,