From 46342aef130a8ea2f8682e339237816e692c5b5c Mon Sep 17 00:00:00 2001
From: Max Wang <max@covar.com>
Date: Thu, 3 Aug 2023 14:12:12 -0400
Subject: [PATCH 1/5] Add novelty score code

---
 app/novelty/__init__.py                |   0
 app/novelty/compute_novelty.py         | 375 +++++++++++++++++++++++++
 app/novelty/extr_smile_molpro_by_id.py |  59 ++++
 app/novelty/known.py                   |  27 ++
 app/novelty/mol_similarity.py          |  59 ++++
 requirements-lock.txt                  |   7 +
 requirements.txt                       |   2 +
 7 files changed, 529 insertions(+)
 create mode 100644 app/novelty/__init__.py
 create mode 100644 app/novelty/compute_novelty.py
 create mode 100644 app/novelty/extr_smile_molpro_by_id.py
 create mode 100644 app/novelty/known.py
 create mode 100644 app/novelty/mol_similarity.py

diff --git a/app/novelty/__init__.py b/app/novelty/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py
new file mode 100644
index 0000000..2b53729
--- /dev/null
+++ b/app/novelty/compute_novelty.py
@@ -0,0 +1,375 @@
+import pandas as pd
+from datetime import date
+import requests
+import numpy as np
+
+from .known import find_known_results
+from .extr_smile_molpro_by_id import  mol_to_smile_molpro
+from .mol_similarity import find_nearest_neighbors
+
+"""
+This script computes the novelty score for a list of results obtained for a 1-H response.
+The steps for the ideal workflow are as follows:
+1. Distinguish whether the result is a known result or an unknown result.
+2. Compute FDA approval status to check if currently under approval or already approved.
+3. Compute recency by noting the associated publications of the result.
+4. Compute molecular similarity to identify if similar to existing drugs.
+ 
+The end result of this script displays a table with values from different columns and accordingly lists the novelty score as well.
+"""
+
+def molecular_sim(known, unknown, message):
+    unknown_ids = []
+    known_ids = []
+    if len(unknown) > 0:
+        for drug in unknown:
+            s = list(message['results'][drug]['node_bindings'].keys())
+            edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id']
+            if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn:
+                unknown_ids.append(edge_attribute_sn)
+            else:
+                unknown_ids.append(
+                    message['results'][drug]['node_bindings'][s[1]][0]['id'])
+
+    if len(known) > 0:
+        for drug in known:
+            s = list(message['results'][drug]['node_bindings'].keys())
+            edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id']
+            if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn:
+                known_ids.append(edge_attribute_sn)
+            else:
+                known_ids.append(message['results'][drug]['node_bindings'][s[1]][0]['id'])
+
+    smile_unkown = mol_to_smile_molpro(unknown_ids)
+    smile_known = mol_to_smile_molpro(known_ids)
+
+    similarity_map = find_nearest_neighbors(smile_unkown, smile_known, 0, 1)
+    return similarity_map
+
+
+def get_publication_info(pub_id):
+    """
+    Args: PMID
+    Returns: The publication info
+    """
+    base_url = "https://3md2qwxrrk.us-east-1.awsapprunner.com/publications?pubids="
+    request_id = '1df88223-c0f8-47f5-a1f3-661b944c7849'
+    full_url = f"{base_url}{pub_id}&request_id={request_id}"
+    try:
+        response = requests.get(full_url)
+        response.raise_for_status()
+        response = response.json()
+    except Exception:
+        return {
+            "_meta": {
+                "n_results": 0,
+            },
+        }
+    return response
+
+
+def sigmoid(x):
+    return 1 / (1 + np.exp(x))
+
+def query_id(message):
+    for i in message['query_graph']['nodes']:
+        if 'ids' in message['query_graph']['nodes'][i].keys():
+            if message['query_graph']['nodes'][i]['ids']:
+                known_node = message['query_graph']['nodes'][i]['categories'][0]
+            else:
+                unknown_node = message['query_graph']['nodes'][i]['categories'][0]
+        else:
+            unknown_node = message['query_graph']['nodes'][i]['categories'][0]
+    if unknown_node in ['biolink:ChemicalEntity', 'biolink:SmallMolecule', 'biolink:Drug']:
+        chk=1
+    else:
+        chk=0
+    return known_node, unknown_node, chk
+
+
+def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age):
+    """
+    Calculates the recency based on number of publications accociated to each drug
+    and age of the oldest publication
+    
+    Args:
+        number_of_publ (float): The current number of publications.
+        age_of_oldest_publ (float): The current age of the oldest publication.
+        max_number (float): The maximum number of publication: e.g. consider 100 for all drugs.
+        max_age (float): The publication with the recent 50 years have been considered.
+    
+    Returns:
+        float: The recency value of z.
+    """
+    coef_number = 10
+    coef_age = 4
+    alter_number_of_publ = sigmoid(coef_number * (number_of_publ / max_number - 0.5))
+    alter_age_of_oldest_publ = sigmoid(coef_age * (age_of_oldest_publ / max_age - 0.5))
+
+    if np.isnan(number_of_publ) and not np.isnan(age_of_oldest_publ):
+        recency = alter_age_of_oldest_publ
+    elif np.isnan(age_of_oldest_publ) and not np.isnan(number_of_publ):
+        recency = alter_number_of_publ
+    else:
+        recency = alter_number_of_publ * alter_age_of_oldest_publ
+    
+    return recency
+
+
+def extracting_drug_fda_publ_date(message, unknown):
+    """
+    Upon querying, the response is returned as a list containing 10 dictionaries,
+    with each dictionary representing the response from an ARA. The function 'extracting_drug_fda_publ_date'
+    is designed to extract the drug entity name of each edge. It then checks the EPC attributes of each edge
+    to determine the FDA approval status of the drug and the accociated pulications PMID/PMC ID ... .
+    And finally extract the publishing date of the publications to get the oldest (Year) date among them.
+
+    Args:
+        Dictionary: response for a ARA to a query
+
+    Returns:
+        "An DataFrame constructed where each row represents an edge and contains information such as the drug entity
+        name, FDA status of the drug, a list of associated publications, the number of associated publications,
+        and the oldest publication date (year) linked to each drug."
+
+    """
+    attribute_type_id_list_fda = ['biolink:FDA_approval_status', 'biolink:FDA_APPROVAL_STATUS']
+    attribute_type_id_list_pub = ['biolink:publications', 'biolink:Publication', 'biolink:publication']
+    drug_idx_fda_status = []
+    today = date.today()
+
+    res_chk = 1
+    # for edge in message['knowledge_graph']['edges'].keys():
+    query_known, query_unknown, query_chk = query_id(message)
+    idi=-1
+    for tmp in unknown:
+        tmp_res = message['results'][tmp]['analyses'][0]['edge_bindings']
+        for tmp_1 in tmp_res:
+            idi+=1
+            edge = message['results'][tmp]['analyses'][0]['edge_bindings'][tmp_1][0]['id']
+    # edge_list = list(message['knowledge_graph']['edges'].keys())
+    # for idx, idi in enumerate(edge_list):
+    #     if idx % 20 == 0:
+    #         print(f'progressing {idx}')
+        # edge = edge_list[idx]
+            edge_attribute = message['knowledge_graph']['edges'][edge]
+            # if set(['subject', 'object']).issubset(edge_attribute.keys()):
+            if query_chk==1:
+                if 'PUBCHEM' in edge_attribute['subject'] or 'CHEMBL' in edge_attribute['subject'] or 'UNII' in edge_attribute['subject'] or 'RXNORM' in edge_attribute['subject'] or 'UMLS' in edge_attribute['subject'] or not 'MONDO' in edge_attribute['subject']:
+                    drug_idx = edge_attribute['subject']
+                else:
+                    drug_idx = edge_attribute['object']
+                if set(['attributes']).issubset(edge_attribute.keys()):
+                    if len(edge_attribute['attributes']) > 0:
+                        att_type_id = {}
+                        fda = []
+                        pub = []
+                        for i in range(len(edge_attribute['attributes'])):
+                            att_type_id[i] = edge_attribute['attributes'][i]['attribute_type_id']
+
+                        for key in att_type_id.keys():
+                            if att_type_id[key] in attribute_type_id_list_fda:
+                                fda.append(key)
+                            elif att_type_id[key] in attribute_type_id_list_pub:
+                                pub.append(key)
+
+                        if len(fda) > 0:
+                            if edge_attribute['attributes'][fda[0]]['value'] == 'FDA Approval':
+                                fda_status = 0.0
+                            else:
+                                fda_status = 1.0
+                        else:
+                            fda_status = None
+
+                        # Publication
+                        if len(pub) > 0:
+                            publications = edge_attribute['attributes'][pub[0]]['value']
+                            if '|' in publications:
+                                publications = publications.split('|')
+                            if type(publications) == 'str':
+                                publications = [publications]
+
+                            # Removal of all publication entries that are links
+                            publications = [x for x in publications if "http" not in x]
+                            # Removal of all publication entries that are Clinical Trials
+                            publications = [x for x in publications if "clinicaltrials" not in x]
+                            number_of_publ = len(publications)
+
+                            if len(publications)>0:
+                                # print(publications)
+                                publications_1 = ','.join(publications)
+                                try:
+                                    response_pub = get_publication_info(publications_1)
+                                    if response_pub['_meta']['n_results']==0:
+                                        age_oldest = np.nan
+                                    else:
+                                        publ_year = []
+                                        for key in response_pub['results'].keys():
+                                            if 'not_found' not in key:
+                                                publ_year.extend([int(response_pub['results'][key]['pub_year'])])
+                                        age_oldest = today.year - min(publ_year)
+                                except ConnectionError as e:
+                                    age_oldest = np.nan
+                        else:
+                            publications = None
+                            number_of_publ = 0.0
+                            age_oldest = np.nan
+                    drug_idx_fda_status.append((idi, drug_idx, fda_status, publications, number_of_publ, age_oldest))
+            else:
+                if query_unknown in ['biolink:Gene', 'biolink:Protein']:
+                    if 'NCBI' in edge_attribute['subject'] or 'GO' in edge_attribute['subject']:
+                        gene_idx = edge_attribute['subject']
+                    else:
+                        gene_idx = edge_attribute['object']
+                    drug_idx_fda_status.append((idi, gene_idx))
+                elif query_unknown in ['biolink:Disease', 'biolink:Phenotype']:
+                    if 'MONDO' in edge_attribute['subject']:
+                        dis_idx = edge_attribute['subject']
+                    else:
+                        dis_idx = edge_attribute['object']
+                    drug_idx_fda_status.append((idi, dis_idx))
+    if query_chk==1 and res_chk==1:
+        DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
+    elif query_chk!=1 and res_chk==1:
+        DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'result'])
+    else:
+        DF = pd.DataFrame()
+    return DF, query_chk
+
+def extract_results(message, unknown, known):
+    results = []
+    results_known = []
+    kid, ukid = 0, 0
+    for idi, i in enumerate(message['results']):
+        if idi in unknown:
+            results.append([])
+            for idj, j in enumerate(i['analyses']):
+                for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]):
+                    results[ukid].append(k['id'])
+            ukid+=1
+
+        elif idi in known:
+            results_known.append([])
+            for idj, j in enumerate(i['analyses']):
+                for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]):
+                    results_known[kid].append(k['id'])
+            kid+=1
+    return results, results_known
+
+def result_edge_correlation(results, results_known, df):
+    df_res = pd.DataFrame()
+    res_known = set()
+    res_unknown = set()
+    for idi, i in enumerate(results):
+        for j in i:
+            df_res = pd.concat([df_res, df[df['edge']==j]])
+            res_unknown.add(df.loc[df['edge']==j, 'drug'].iloc[0])
+
+    for idi, i in enumerate(results_known):
+        for j in i:
+            res_known.add(df.loc[df['edge']==j, 'drug'].iloc[0])
+    return df_res, res_unknown, res_known
+
+def novelty_score(fda_status, recency, similarity):
+    """
+    Calculate the novelty score for each drug entity based on FDA status, recency and similarity of the drug.
+    
+    FDA status 0 | 1 
+        --> 0 to be FDA approved
+    0 < recency < 1 
+        --> 1 to have a high recency where the publications are so new and number of publications is too few. 
+    0 < similarity < 1
+        --> 0 to have a very low molecular structure similarity, so it is novel.
+        
+    Args:
+        float: fda_status
+        float: recency
+        float: similarity
+    
+    Returns:
+        float: novelty_score
+    
+    """
+    if not np.isnan(recency):
+        score = recency
+        if not np.isnan(similarity):
+            similarity = 1 - similarity
+            if similarity > 0.5:
+                score = score*(0.73+similarity)
+            if score>1:
+                score=1        
+            if fda_status == 0:
+                score=score*0.85 
+    else:
+        if np.isnan(similarity): 
+            score=0 
+        else:
+            score=(1-similarity)
+    return score
+
+def compute_novelty(message, logger):
+    """ INPUT: JSON Response with merged annotated results for a 1-H query
+
+    1. load the json file
+    2. Give the json to extracting_drug_fda_publ_date(response) function to extract the EPC
+    3. Apply the recency function of df, to add a new column as recency to the dataframe
+    4. Add a new column to the df as similarity which has random number between 0-1
+    5. Now the dataframe df is ready for applying the novelty score on it
+
+    OUTPUT: Pandas DataFrame  with FDA Status, Recency, Similarity and Novelty score per result
+    """
+    # Step 1
+    known, unknown = find_known_results(message)
+    #
+    # # Step 2
+    similarity_map = molecular_sim(known, unknown, message)
+
+    df, query_chk = extracting_drug_fda_publ_date(message, unknown)
+#         # print(df.head())
+#         # print(query_chk)
+#
+    # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
+    # query_chk = 1
+
+    # #
+    # res, res_known = extract_results(mergedAnnotatedOutput, unknown, known)
+#         # print(len(res_known))
+#         # print(len(res))
+#         # # #
+#         df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df)
+    # print(len(res_unknown))
+    # print(len(res_known))
+    # df = df_res
+    # print(df.head())
+    # print(similarity_map)
+    if query_chk==1:
+        # Step 3:
+        # calculating the recency
+        df['recency'] = df.apply(lambda row: recency_function_exp(row['number_of_publ'], row['age_oldest_pub'], 100, 50) if not (np.isnan(row['number_of_publ']) or np.isnan(row['age_oldest_pub'])) else np.nan, axis=1)
+        #
+        # # Step 4:
+        # # This section will be added later. Currently just putting 'NaN':
+        # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)
+
+        df['similarity'] = df.apply(lambda row: similarity_map[row['drug']][0][1] if row['drug'] in similarity_map.keys() else np.nan, axis=1)
+        # df = df.assign(similarity=np.nan)
+
+        # # Step 5:
+        # # Calculating the novelty score:
+        df['novelty_score'] = df.apply(lambda row: novelty_score(row['fda status'], row['recency'], row['similarity']), axis=1)
+
+        # # # Step 6
+        # # # Just sort them:
+        df = df[['drug', 'novelty_score']].sort_values(by= 'novelty_score', ascending= False)
+    else:
+        df = df.assign(novelty_score=0)
+    return df
+
+#for i in list(range(1, 5)):
+# start = time.time()
+# temp = compute_novelty('mergedAnnotatedOutput.json')
+# if temp.empty:
+#     print(f"No Results in mergedAnnotatedOutput.json")
+# else:
+#     temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
+# print(time.time()-start)
\ No newline at end of file
diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py
new file mode 100644
index 0000000..98df59d
--- /dev/null
+++ b/app/novelty/extr_smile_molpro_by_id.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+import requests
+
+
+def mol_to_smile_molpro(molecules):
+    """
+    
+    Returns:
+        Dict
+    
+    Args:
+        List    
+        Example: Attached sample_mols.json file 
+    """
+    
+    url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id"
+    headers = {
+        "accept": "application/json",
+        "Content-Type": "application/json"
+    }
+    
+    data_mol = list(set(molecules))
+    smiles = {}
+
+    response = requests.post(url, headers=headers, json=data_mol)
+
+    if response.status_code == 200:
+        json_response = response.json()
+        #print(json_response)
+        collec_url = json_response['url']
+        temp_collec_response = requests.get(collec_url)
+        if response.status_code == 200:
+            collec_response = temp_collec_response.json()
+            #print('\n')
+            #print(collec_response['elements'][0]['identifiers'])
+            if json_response['size'] > 0:
+                
+                for i in range(json_response['size']):
+                    key_list = ['identifiers']
+                    if set(key_list).issubset(collec_response['elements'][i].keys()):
+                        #if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
+                        identifiers = collec_response['elements'][i]['identifiers']
+                        smile = identifiers.get('smiles', 'No SMILES could be found')
+                        smiles[data_mol[i]] = smile
+                    else:
+                        smiles[data_mol[i]] = 'No identifiers could be found'
+                        
+            #Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
+            #for which the retrieval was not successful in the first attempt
+            if len(list(smiles.keys())) < len(data_mol):
+                diff = [mol for mol in data_mol if mol not in list(smiles.keys())]
+                diff_smiles = mol_to_smile_molpro(diff)
+                if len(diff_smiles) > 0:
+                    smiles.update(diff_smiles)           
+    else:
+        print(f"Error: {response.status_code} - {response.text}")
+        
+    
+    return smiles
\ No newline at end of file
diff --git a/app/novelty/known.py b/app/novelty/known.py
new file mode 100644
index 0000000..1e2c3a6
--- /dev/null
+++ b/app/novelty/known.py
@@ -0,0 +1,27 @@
+def find_known_results(message):
+    inferring_sources = [
+        "infores:aragorn",
+        "infores:arax",
+        "infores:biothings-explorer",
+        "infores:improving-agent",
+        "infores:robokop",
+        "infores:unsecret-agent",
+    ]
+    known_result_ids = []
+    unknown_result_ids = []
+    results = message["results"]
+    knowledge_graph = message["knowledge_graph"]
+    for idres, result in enumerate(results):
+        for analysis in result.get("analyses") or []:
+            for eb in analysis["edge_bindings"].values():
+                for element in eb:
+                    edge_id = element["id"]
+                    edge = knowledge_graph["edges"][edge_id]
+                    for source in edge["sources"]:
+                        if source["resource_role"] == "primary_knowledge_source":
+                            if source["resource_id"] not in inferring_sources:
+                                known_result_ids.append(idres)
+                                break
+                            else:
+                                unknown_result_ids.append(idres)
+    return known_result_ids, unknown_result_ids
diff --git a/app/novelty/mol_similarity.py b/app/novelty/mol_similarity.py
new file mode 100644
index 0000000..70c2f60
--- /dev/null
+++ b/app/novelty/mol_similarity.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+from rdkit import Chem
+from rdkit import DataStructs
+from rdkit.Chem import AllChem
+
+
+
+def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors):
+    """
+    
+    Returns:
+        Dict
+
+    Args:
+        unknown_smiles_dict: Dict
+        known_smiles_dict: Dict
+        similarity_cutoff: float: 0
+        num_neighbors: int: 1
+
+    """
+    unknown_smiles = {key:value for key,value in unknown_smiles_dict.items() if value != "No SMILES could be found"}
+    known_smiles = {key:value for key,value in known_smiles_dict.items() if value != "No SMILES could be found"}
+
+    # Convert input SMILES to a molecule
+    known_mols = {}
+    for key,value in known_smiles.items():
+        known_mol = Chem.MolFromSmiles(value)
+        if known_mol is None:
+            raise ValueError("Invalid SMILES string for",key)
+        else:
+            known_mols.update({key:known_mol})
+    nearest_neighbor_mapping = {}
+    for unknownkey,value in unknown_smiles.items():
+        query_mol = Chem.MolFromSmiles(value)
+        if query_mol is None:
+            raise ValueError("Invalid SMILES string")
+
+        # Calculate fingerprints for the query molecule
+        query_fp = AllChem.GetMorganFingerprint(query_mol, 2)
+
+        # Calculate similarity scores between the query molecule and all molecules in the dataset
+        similarities = []
+        for key, mol in known_mols.items():
+            fp = AllChem.GetMorganFingerprint(mol, 2)
+            similarity = DataStructs.TanimotoSimilarity(query_fp, fp)
+            similarities.append((key, similarity))
+
+        # Sort the similarities in descending order
+        similarities.sort(key=lambda x: x[1], reverse=True)
+
+        # Retrieve the nearest neighbors
+        neighbors = []
+        for i in range(min(num_neighbors, len(similarities))):
+            index, similarity = similarities[i]
+            if similarity >= similarity_cutoff:
+                neighbors.append((index, similarity))
+        nearest_neighbor_mapping.update({unknownkey:neighbors})
+    return nearest_neighbor_mapping
\ No newline at end of file
diff --git a/requirements-lock.txt b/requirements-lock.txt
index e5f9c09..cf79215 100644
--- a/requirements-lock.txt
+++ b/requirements-lock.txt
@@ -12,15 +12,22 @@ httptools==0.2.0
 httpx==0.24.1
 idna==3.4
 numpy==1.25.1
+pandas==2.0.3
+Pillow==10.0.0
 pydantic==1.10.9
+python-dateutil==2.8.2
 python-dotenv==1.0.0
+pytz==2023.3
 PyYAML==6.0
+rdkit==2023.3.2
 reasoner-pydantic==4.0.8
 redis==4.6.0
+six==1.16.0
 sniffio==1.3.0
 starlette==0.17.1
 tqdm==4.65.0
 typing_extensions==4.6.3
+tzdata==2023.3
 uvicorn==0.13.3
 uvloop==0.17.0
 watchgod==0.8.2
diff --git a/requirements.txt b/requirements.txt
index dbceaa9..03e7735 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,8 @@ fastapi==0.75.0
 gunicorn==20.1.0
 httpx==0.24.1
 numpy==1.25.1
+pandas==2.0.3
+rdkit==2023.3.2
 reasoner-pydantic==4.0.8
 redis==4.6.0
 tqdm==4.65.0

From fbffb73c7409f66b81126372a000dfe8d491ee61 Mon Sep 17 00:00:00 2001
From: Max Wang <max@covar.com>
Date: Thu, 3 Aug 2023 14:12:52 -0400
Subject: [PATCH 2/5] Actually call novelty code

---
 app/ordering_components.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/app/ordering_components.py b/app/ordering_components.py
index 5fc209a..9022df5 100644
--- a/app/ordering_components.py
+++ b/app/ordering_components.py
@@ -6,6 +6,8 @@
 from .config import settings
 from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence
 
+from .novelty.compute_novelty import compute_novelty
+
 redis_pool = redis.ConnectionPool(
     host=settings.redis_host,
     port=settings.redis_port,
@@ -42,15 +44,16 @@ def get_clinical_evidence(result, message, logger, db_conn):
     return compute_clinical_evidence(result, message, logger, db_conn)
 
 
-def get_novelty(result, message, logger):
-    # TODO get novelty from novelty package
-    return 0
+def get_novelty(message, logger):
+    return compute_novelty(message, logger)
 
 
 def get_ordering_components(message, logger):
     logger.debug(f"Computing scores for {len(message['results'])} results")
     db_conn = redis.Redis(connection_pool=redis_pool)
-    for result_index, result in enumerate(tqdm(message.get("results") or [])):
+    novelty_scores_dict = get_novelty(message, logger).to_dict(orient="index")
+    novelty_scores = {node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()}
+    for result in tqdm(message.get("results") or []):
         clinical_evidence_score = get_clinical_evidence(
             result,
             message,
@@ -62,8 +65,9 @@ def get_ordering_components(message, logger):
             "clinical_evidence": clinical_evidence_score,
             "novelty": 0,
         }
-        if result["ordering_components"]["clinical_evidence"] == 0:
+        if clinical_evidence_score == 0:
             # Only compute novelty if there is no clinical evidence
-            result["ordering_components"]["novelty"] = get_novelty(
-                result, message, logger
-            )
+            for node_bindings in result.get("node_bindings", {}).values():
+                for node_binding in node_bindings:
+                    if node_binding["id"] in novelty_scores:
+                        result["ordering_components"]["novelty"] = novelty_scores[node_binding["id"]]

From 14c4414d364e4591982b8a4c293f5885cfe698bf Mon Sep 17 00:00:00 2001
From: Max Wang <max@covar.com>
Date: Thu, 3 Aug 2023 14:13:05 -0400
Subject: [PATCH 3/5] Bump minor version

---
 app/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/server.py b/app/server.py
index def83bf..b94790f 100644
--- a/app/server.py
+++ b/app/server.py
@@ -21,7 +21,7 @@
 
 openapi_args = dict(
     title="Answer Appraiser",
-    version="0.2.3",
+    version="0.3.0",
     terms_of_service="",
     translator_component="Utility",
     translator_teams=["Standards Reference Implementation Team"],

From aeda5611e9b183c0234cd82f843108edf5e770e4 Mon Sep 17 00:00:00 2001
From: Max Wang <max@covar.com>
Date: Thu, 3 Aug 2023 14:25:30 -0400
Subject: [PATCH 4/5] Black

---
 app/novelty/compute_novelty.py         | 320 ++++++++++++++++---------
 app/novelty/extr_smile_molpro_by_id.py |  55 ++---
 app/novelty/mol_similarity.py          |  31 ++-
 app/ordering_components.py             |   8 +-
 4 files changed, 263 insertions(+), 151 deletions(-)

diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py
index 2b53729..258a759 100644
--- a/app/novelty/compute_novelty.py
+++ b/app/novelty/compute_novelty.py
@@ -4,7 +4,7 @@
 import numpy as np
 
 from .known import find_known_results
-from .extr_smile_molpro_by_id import  mol_to_smile_molpro
+from .extr_smile_molpro_by_id import mol_to_smile_molpro
 from .mol_similarity import find_nearest_neighbors
 
 """
@@ -18,27 +18,45 @@
 The end result of this script displays a table with values from different columns and accordingly lists the novelty score as well.
 """
 
+
 def molecular_sim(known, unknown, message):
     unknown_ids = []
     known_ids = []
     if len(unknown) > 0:
         for drug in unknown:
-            s = list(message['results'][drug]['node_bindings'].keys())
-            edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id']
-            if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn:
+            s = list(message["results"][drug]["node_bindings"].keys())
+            edge_attribute_sn = message["results"][drug]["node_bindings"][s[0]][0]["id"]
+            if (
+                "PUBCHEM" in edge_attribute_sn
+                or "CHEMBL" in edge_attribute_sn
+                or "UNII" in edge_attribute_sn
+                or "RXNORM" in edge_attribute_sn
+                or "UMLS" in edge_attribute_sn
+                or not "MONDO" in edge_attribute_sn
+            ):
                 unknown_ids.append(edge_attribute_sn)
             else:
                 unknown_ids.append(
-                    message['results'][drug]['node_bindings'][s[1]][0]['id'])
+                    message["results"][drug]["node_bindings"][s[1]][0]["id"]
+                )
 
     if len(known) > 0:
         for drug in known:
-            s = list(message['results'][drug]['node_bindings'].keys())
-            edge_attribute_sn = message['results'][drug]['node_bindings'][s[0]][0]['id']
-            if 'PUBCHEM' in edge_attribute_sn or 'CHEMBL' in edge_attribute_sn or 'UMLS' in edge_attribute_sn or 'UNII' in edge_attribute_sn or 'RXNORM' in edge_attribute_sn or not 'MONDO' in edge_attribute_sn:
+            s = list(message["results"][drug]["node_bindings"].keys())
+            edge_attribute_sn = message["results"][drug]["node_bindings"][s[0]][0]["id"]
+            if (
+                "PUBCHEM" in edge_attribute_sn
+                or "CHEMBL" in edge_attribute_sn
+                or "UMLS" in edge_attribute_sn
+                or "UNII" in edge_attribute_sn
+                or "RXNORM" in edge_attribute_sn
+                or not "MONDO" in edge_attribute_sn
+            ):
                 known_ids.append(edge_attribute_sn)
             else:
-                known_ids.append(message['results'][drug]['node_bindings'][s[1]][0]['id'])
+                known_ids.append(
+                    message["results"][drug]["node_bindings"][s[1]][0]["id"]
+                )
 
     smile_unkown = mol_to_smile_molpro(unknown_ids)
     smile_known = mol_to_smile_molpro(known_ids)
@@ -53,7 +71,7 @@ def get_publication_info(pub_id):
     Returns: The publication info
     """
     base_url = "https://3md2qwxrrk.us-east-1.awsapprunner.com/publications?pubids="
-    request_id = '1df88223-c0f8-47f5-a1f3-661b944c7849'
+    request_id = "1df88223-c0f8-47f5-a1f3-661b944c7849"
     full_url = f"{base_url}{pub_id}&request_id={request_id}"
     try:
         response = requests.get(full_url)
@@ -71,19 +89,24 @@ def get_publication_info(pub_id):
 def sigmoid(x):
     return 1 / (1 + np.exp(x))
 
+
 def query_id(message):
-    for i in message['query_graph']['nodes']:
-        if 'ids' in message['query_graph']['nodes'][i].keys():
-            if message['query_graph']['nodes'][i]['ids']:
-                known_node = message['query_graph']['nodes'][i]['categories'][0]
+    for i in message["query_graph"]["nodes"]:
+        if "ids" in message["query_graph"]["nodes"][i].keys():
+            if message["query_graph"]["nodes"][i]["ids"]:
+                known_node = message["query_graph"]["nodes"][i]["categories"][0]
             else:
-                unknown_node = message['query_graph']['nodes'][i]['categories'][0]
+                unknown_node = message["query_graph"]["nodes"][i]["categories"][0]
         else:
-            unknown_node = message['query_graph']['nodes'][i]['categories'][0]
-    if unknown_node in ['biolink:ChemicalEntity', 'biolink:SmallMolecule', 'biolink:Drug']:
-        chk=1
+            unknown_node = message["query_graph"]["nodes"][i]["categories"][0]
+    if unknown_node in [
+        "biolink:ChemicalEntity",
+        "biolink:SmallMolecule",
+        "biolink:Drug",
+    ]:
+        chk = 1
     else:
-        chk=0
+        chk = 0
     return known_node, unknown_node, chk
 
 
@@ -91,13 +114,13 @@ def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age
     """
     Calculates the recency based on number of publications accociated to each drug
     and age of the oldest publication
-    
+
     Args:
         number_of_publ (float): The current number of publications.
         age_of_oldest_publ (float): The current age of the oldest publication.
         max_number (float): The maximum number of publication: e.g. consider 100 for all drugs.
         max_age (float): The publication with the recent 50 years have been considered.
-    
+
     Returns:
         float: The recency value of z.
     """
@@ -112,7 +135,7 @@ def recency_function_exp(number_of_publ, age_of_oldest_publ, max_number, max_age
         recency = alter_number_of_publ
     else:
         recency = alter_number_of_publ * alter_age_of_oldest_publ
-    
+
     return recency
 
 
@@ -133,39 +156,57 @@ def extracting_drug_fda_publ_date(message, unknown):
         and the oldest publication date (year) linked to each drug."
 
     """
-    attribute_type_id_list_fda = ['biolink:FDA_approval_status', 'biolink:FDA_APPROVAL_STATUS']
-    attribute_type_id_list_pub = ['biolink:publications', 'biolink:Publication', 'biolink:publication']
+    attribute_type_id_list_fda = [
+        "biolink:FDA_approval_status",
+        "biolink:FDA_APPROVAL_STATUS",
+    ]
+    attribute_type_id_list_pub = [
+        "biolink:publications",
+        "biolink:Publication",
+        "biolink:publication",
+    ]
     drug_idx_fda_status = []
     today = date.today()
 
     res_chk = 1
     # for edge in message['knowledge_graph']['edges'].keys():
     query_known, query_unknown, query_chk = query_id(message)
-    idi=-1
+    idi = -1
     for tmp in unknown:
-        tmp_res = message['results'][tmp]['analyses'][0]['edge_bindings']
+        tmp_res = message["results"][tmp]["analyses"][0]["edge_bindings"]
         for tmp_1 in tmp_res:
-            idi+=1
-            edge = message['results'][tmp]['analyses'][0]['edge_bindings'][tmp_1][0]['id']
-    # edge_list = list(message['knowledge_graph']['edges'].keys())
-    # for idx, idi in enumerate(edge_list):
-    #     if idx % 20 == 0:
-    #         print(f'progressing {idx}')
-        # edge = edge_list[idx]
-            edge_attribute = message['knowledge_graph']['edges'][edge]
+            idi += 1
+            edge = message["results"][tmp]["analyses"][0]["edge_bindings"][tmp_1][0][
+                "id"
+            ]
+            # edge_list = list(message['knowledge_graph']['edges'].keys())
+            # for idx, idi in enumerate(edge_list):
+            #     if idx % 20 == 0:
+            #         print(f'progressing {idx}')
+            # edge = edge_list[idx]
+            edge_attribute = message["knowledge_graph"]["edges"][edge]
             # if set(['subject', 'object']).issubset(edge_attribute.keys()):
-            if query_chk==1:
-                if 'PUBCHEM' in edge_attribute['subject'] or 'CHEMBL' in edge_attribute['subject'] or 'UNII' in edge_attribute['subject'] or 'RXNORM' in edge_attribute['subject'] or 'UMLS' in edge_attribute['subject'] or not 'MONDO' in edge_attribute['subject']:
-                    drug_idx = edge_attribute['subject']
+            if query_chk == 1:
+                if (
+                    "PUBCHEM" in edge_attribute["subject"]
+                    or "CHEMBL" in edge_attribute["subject"]
+                    or "UNII" in edge_attribute["subject"]
+                    or "RXNORM" in edge_attribute["subject"]
+                    or "UMLS" in edge_attribute["subject"]
+                    or not "MONDO" in edge_attribute["subject"]
+                ):
+                    drug_idx = edge_attribute["subject"]
                 else:
-                    drug_idx = edge_attribute['object']
-                if set(['attributes']).issubset(edge_attribute.keys()):
-                    if len(edge_attribute['attributes']) > 0:
+                    drug_idx = edge_attribute["object"]
+                if set(["attributes"]).issubset(edge_attribute.keys()):
+                    if len(edge_attribute["attributes"]) > 0:
                         att_type_id = {}
                         fda = []
                         pub = []
-                        for i in range(len(edge_attribute['attributes'])):
-                            att_type_id[i] = edge_attribute['attributes'][i]['attribute_type_id']
+                        for i in range(len(edge_attribute["attributes"])):
+                            att_type_id[i] = edge_attribute["attributes"][i][
+                                "attribute_type_id"
+                            ]
 
                         for key in att_type_id.keys():
                             if att_type_id[key] in attribute_type_id_list_fda:
@@ -174,7 +215,10 @@ def extracting_drug_fda_publ_date(message, unknown):
                                 pub.append(key)
 
                         if len(fda) > 0:
-                            if edge_attribute['attributes'][fda[0]]['value'] == 'FDA Approval':
+                            if (
+                                edge_attribute["attributes"][fda[0]]["value"]
+                                == "FDA Approval"
+                            ):
                                 fda_status = 0.0
                             else:
                                 fda_status = 1.0
@@ -183,30 +227,40 @@ def extracting_drug_fda_publ_date(message, unknown):
 
                         # Publication
                         if len(pub) > 0:
-                            publications = edge_attribute['attributes'][pub[0]]['value']
-                            if '|' in publications:
-                                publications = publications.split('|')
-                            if type(publications) == 'str':
+                            publications = edge_attribute["attributes"][pub[0]]["value"]
+                            if "|" in publications:
+                                publications = publications.split("|")
+                            if type(publications) == "str":
                                 publications = [publications]
 
                             # Removal of all publication entries that are links
                             publications = [x for x in publications if "http" not in x]
                             # Removal of all publication entries that are Clinical Trials
-                            publications = [x for x in publications if "clinicaltrials" not in x]
+                            publications = [
+                                x for x in publications if "clinicaltrials" not in x
+                            ]
                             number_of_publ = len(publications)
 
-                            if len(publications)>0:
+                            if len(publications) > 0:
                                 # print(publications)
-                                publications_1 = ','.join(publications)
+                                publications_1 = ",".join(publications)
                                 try:
                                     response_pub = get_publication_info(publications_1)
-                                    if response_pub['_meta']['n_results']==0:
+                                    if response_pub["_meta"]["n_results"] == 0:
                                         age_oldest = np.nan
                                     else:
                                         publ_year = []
-                                        for key in response_pub['results'].keys():
-                                            if 'not_found' not in key:
-                                                publ_year.extend([int(response_pub['results'][key]['pub_year'])])
+                                        for key in response_pub["results"].keys():
+                                            if "not_found" not in key:
+                                                publ_year.extend(
+                                                    [
+                                                        int(
+                                                            response_pub["results"][
+                                                                key
+                                                            ]["pub_year"]
+                                                        )
+                                                    ]
+                                                )
                                         age_oldest = today.year - min(publ_year)
                                 except ConnectionError as e:
                                     age_oldest = np.nan
@@ -214,101 +268,131 @@ def extracting_drug_fda_publ_date(message, unknown):
                             publications = None
                             number_of_publ = 0.0
                             age_oldest = np.nan
-                    drug_idx_fda_status.append((idi, drug_idx, fda_status, publications, number_of_publ, age_oldest))
+                    drug_idx_fda_status.append(
+                        (
+                            idi,
+                            drug_idx,
+                            fda_status,
+                            publications,
+                            number_of_publ,
+                            age_oldest,
+                        )
+                    )
             else:
-                if query_unknown in ['biolink:Gene', 'biolink:Protein']:
-                    if 'NCBI' in edge_attribute['subject'] or 'GO' in edge_attribute['subject']:
-                        gene_idx = edge_attribute['subject']
+                if query_unknown in ["biolink:Gene", "biolink:Protein"]:
+                    if (
+                        "NCBI" in edge_attribute["subject"]
+                        or "GO" in edge_attribute["subject"]
+                    ):
+                        gene_idx = edge_attribute["subject"]
                     else:
-                        gene_idx = edge_attribute['object']
+                        gene_idx = edge_attribute["object"]
                     drug_idx_fda_status.append((idi, gene_idx))
-                elif query_unknown in ['biolink:Disease', 'biolink:Phenotype']:
-                    if 'MONDO' in edge_attribute['subject']:
-                        dis_idx = edge_attribute['subject']
+                elif query_unknown in ["biolink:Disease", "biolink:Phenotype"]:
+                    if "MONDO" in edge_attribute["subject"]:
+                        dis_idx = edge_attribute["subject"]
                     else:
-                        dis_idx = edge_attribute['object']
+                        dis_idx = edge_attribute["object"]
                     drug_idx_fda_status.append((idi, dis_idx))
-    if query_chk==1 and res_chk==1:
-        DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
-    elif query_chk!=1 and res_chk==1:
-        DF = pd.DataFrame(drug_idx_fda_status, columns=['edge', 'result'])
+    if query_chk == 1 and res_chk == 1:
+        DF = pd.DataFrame(
+            drug_idx_fda_status,
+            columns=[
+                "edge",
+                "drug",
+                "fda status",
+                "publications",
+                "number_of_publ",
+                "age_oldest_pub",
+            ],
+        )
+    elif query_chk != 1 and res_chk == 1:
+        DF = pd.DataFrame(drug_idx_fda_status, columns=["edge", "result"])
     else:
         DF = pd.DataFrame()
     return DF, query_chk
 
+
 def extract_results(message, unknown, known):
     results = []
     results_known = []
     kid, ukid = 0, 0
-    for idi, i in enumerate(message['results']):
+    for idi, i in enumerate(message["results"]):
         if idi in unknown:
             results.append([])
-            for idj, j in enumerate(i['analyses']):
-                for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]):
-                    results[ukid].append(k['id'])
-            ukid+=1
+            for idj, j in enumerate(i["analyses"]):
+                for idk, k in enumerate(
+                    j["edge_bindings"][list(j["edge_bindings"].keys())[0]]
+                ):
+                    results[ukid].append(k["id"])
+            ukid += 1
 
         elif idi in known:
             results_known.append([])
-            for idj, j in enumerate(i['analyses']):
-                for idk, k in enumerate(j['edge_bindings'][list(j['edge_bindings'].keys())[0]]):
-                    results_known[kid].append(k['id'])
-            kid+=1
+            for idj, j in enumerate(i["analyses"]):
+                for idk, k in enumerate(
+                    j["edge_bindings"][list(j["edge_bindings"].keys())[0]]
+                ):
+                    results_known[kid].append(k["id"])
+            kid += 1
     return results, results_known
 
+
 def result_edge_correlation(results, results_known, df):
     df_res = pd.DataFrame()
     res_known = set()
     res_unknown = set()
     for idi, i in enumerate(results):
         for j in i:
-            df_res = pd.concat([df_res, df[df['edge']==j]])
-            res_unknown.add(df.loc[df['edge']==j, 'drug'].iloc[0])
+            df_res = pd.concat([df_res, df[df["edge"] == j]])
+            res_unknown.add(df.loc[df["edge"] == j, "drug"].iloc[0])
 
     for idi, i in enumerate(results_known):
         for j in i:
-            res_known.add(df.loc[df['edge']==j, 'drug'].iloc[0])
+            res_known.add(df.loc[df["edge"] == j, "drug"].iloc[0])
     return df_res, res_unknown, res_known
 
+
 def novelty_score(fda_status, recency, similarity):
     """
     Calculate the novelty score for each drug entity based on FDA status, recency and similarity of the drug.
-    
-    FDA status 0 | 1 
+
+    FDA status 0 | 1
         --> 0 to be FDA approved
-    0 < recency < 1 
-        --> 1 to have a high recency where the publications are so new and number of publications is too few. 
+    0 < recency < 1
+        --> 1 to have a high recency where the publications are so new and number of publications is too few.
     0 < similarity < 1
         --> 0 to have a very low molecular structure similarity, so it is novel.
-        
+
     Args:
         float: fda_status
         float: recency
         float: similarity
-    
+
     Returns:
         float: novelty_score
-    
+
     """
     if not np.isnan(recency):
         score = recency
         if not np.isnan(similarity):
             similarity = 1 - similarity
             if similarity > 0.5:
-                score = score*(0.73+similarity)
-            if score>1:
-                score=1        
+                score = score * (0.73 + similarity)
+            if score > 1:
+                score = 1
             if fda_status == 0:
-                score=score*0.85 
+                score = score * 0.85
     else:
-        if np.isnan(similarity): 
-            score=0 
+        if np.isnan(similarity):
+            score = 0
         else:
-            score=(1-similarity)
+            score = 1 - similarity
     return score
 
+
 def compute_novelty(message, logger):
-    """ INPUT: JSON Response with merged annotated results for a 1-H query
+    """INPUT: JSON Response with merged annotated results for a 1-H query
 
     1. load the json file
     2. Give the json to extracting_drug_fda_publ_date(response) function to extract the EPC
@@ -325,51 +409,71 @@ def compute_novelty(message, logger):
     similarity_map = molecular_sim(known, unknown, message)
 
     df, query_chk = extracting_drug_fda_publ_date(message, unknown)
-#         # print(df.head())
-#         # print(query_chk)
-#
+    #         # print(df.head())
+    #         # print(query_chk)
+    #
     # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
     # query_chk = 1
 
     # #
     # res, res_known = extract_results(mergedAnnotatedOutput, unknown, known)
-#         # print(len(res_known))
-#         # print(len(res))
-#         # # #
-#         df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df)
+    #         # print(len(res_known))
+    #         # print(len(res))
+    #         # # #
+    #         df_res, res_unknown, res_known = result_edge_correlation(res, res_known, df)
     # print(len(res_unknown))
     # print(len(res_known))
     # df = df_res
     # print(df.head())
     # print(similarity_map)
-    if query_chk==1:
+    if query_chk == 1:
         # Step 3:
         # calculating the recency
-        df['recency'] = df.apply(lambda row: recency_function_exp(row['number_of_publ'], row['age_oldest_pub'], 100, 50) if not (np.isnan(row['number_of_publ']) or np.isnan(row['age_oldest_pub'])) else np.nan, axis=1)
+        df["recency"] = df.apply(
+            lambda row: recency_function_exp(
+                row["number_of_publ"], row["age_oldest_pub"], 100, 50
+            )
+            if not (np.isnan(row["number_of_publ"]) or np.isnan(row["age_oldest_pub"]))
+            else np.nan,
+            axis=1,
+        )
         #
         # # Step 4:
         # # This section will be added later. Currently just putting 'NaN':
         # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)
 
-        df['similarity'] = df.apply(lambda row: similarity_map[row['drug']][0][1] if row['drug'] in similarity_map.keys() else np.nan, axis=1)
+        df["similarity"] = df.apply(
+            lambda row: similarity_map[row["drug"]][0][1]
+            if row["drug"] in similarity_map.keys()
+            else np.nan,
+            axis=1,
+        )
         # df = df.assign(similarity=np.nan)
 
         # # Step 5:
         # # Calculating the novelty score:
-        df['novelty_score'] = df.apply(lambda row: novelty_score(row['fda status'], row['recency'], row['similarity']), axis=1)
+        df["novelty_score"] = df.apply(
+            lambda row: novelty_score(
+                row["fda status"], row["recency"], row["similarity"]
+            ),
+            axis=1,
+        )
 
         # # # Step 6
         # # # Just sort them:
-        df = df[['drug', 'novelty_score']].sort_values(by= 'novelty_score', ascending= False)
+        df = df[["drug", "novelty_score"]].sort_values(
+            by="novelty_score", ascending=False
+        )
     else:
         df = df.assign(novelty_score=0)
     return df
 
-#for i in list(range(1, 5)):
+
+# for i in list(range(1, 5)):
 # start = time.time()
 # temp = compute_novelty('mergedAnnotatedOutput.json')
 # if temp.empty:
 #     print(f"No Results in mergedAnnotatedOutput.json")
 # else:
 #     temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
-# print(time.time()-start)
\ No newline at end of file
+# print(time.time()-start)
diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py
index 98df59d..504e30e 100644
--- a/app/novelty/extr_smile_molpro_by_id.py
+++ b/app/novelty/extr_smile_molpro_by_id.py
@@ -4,21 +4,18 @@
 
 def mol_to_smile_molpro(molecules):
     """
-    
+
     Returns:
         Dict
-    
+
     Args:
-        List    
-        Example: Attached sample_mols.json file 
+        List
+        Example: Attached sample_mols.json file
     """
-    
+
     url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id"
-    headers = {
-        "accept": "application/json",
-        "Content-Type": "application/json"
-    }
-    
+    headers = {"accept": "application/json", "Content-Type": "application/json"}
+
     data_mol = list(set(molecules))
     smiles = {}
 
@@ -26,34 +23,32 @@ def mol_to_smile_molpro(molecules):
 
     if response.status_code == 200:
         json_response = response.json()
-        #print(json_response)
-        collec_url = json_response['url']
+        # print(json_response)
+        collec_url = json_response["url"]
         temp_collec_response = requests.get(collec_url)
         if response.status_code == 200:
             collec_response = temp_collec_response.json()
-            #print('\n')
-            #print(collec_response['elements'][0]['identifiers'])
-            if json_response['size'] > 0:
-                
-                for i in range(json_response['size']):
-                    key_list = ['identifiers']
-                    if set(key_list).issubset(collec_response['elements'][i].keys()):
-                        #if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
-                        identifiers = collec_response['elements'][i]['identifiers']
-                        smile = identifiers.get('smiles', 'No SMILES could be found')
+            # print('\n')
+            # print(collec_response['elements'][0]['identifiers'])
+            if json_response["size"] > 0:
+                for i in range(json_response["size"]):
+                    key_list = ["identifiers"]
+                    if set(key_list).issubset(collec_response["elements"][i].keys()):
+                        # if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
+                        identifiers = collec_response["elements"][i]["identifiers"]
+                        smile = identifiers.get("smiles", "No SMILES could be found")
                         smiles[data_mol[i]] = smile
                     else:
-                        smiles[data_mol[i]] = 'No identifiers could be found'
-                        
-            #Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
-            #for which the retrieval was not successful in the first attempt
+                        smiles[data_mol[i]] = "No identifiers could be found"
+
+            # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
+            # for which the retrieval was not successful in the first attempt
             if len(list(smiles.keys())) < len(data_mol):
                 diff = [mol for mol in data_mol if mol not in list(smiles.keys())]
                 diff_smiles = mol_to_smile_molpro(diff)
                 if len(diff_smiles) > 0:
-                    smiles.update(diff_smiles)           
+                    smiles.update(diff_smiles)
     else:
         print(f"Error: {response.status_code} - {response.text}")
-        
-    
-    return smiles
\ No newline at end of file
+
+    return smiles
diff --git a/app/novelty/mol_similarity.py b/app/novelty/mol_similarity.py
index 70c2f60..d452b10 100644
--- a/app/novelty/mol_similarity.py
+++ b/app/novelty/mol_similarity.py
@@ -5,10 +5,11 @@
 from rdkit.Chem import AllChem
 
 
-
-def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors):
+def find_nearest_neighbors(
+    unknown_smiles_dict, known_smiles_dict, similarity_cutoff, num_neighbors
+):
     """
-    
+
     Returns:
         Dict
 
@@ -19,19 +20,27 @@ def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cu
         num_neighbors: int: 1
 
     """
-    unknown_smiles = {key:value for key,value in unknown_smiles_dict.items() if value != "No SMILES could be found"}
-    known_smiles = {key:value for key,value in known_smiles_dict.items() if value != "No SMILES could be found"}
+    unknown_smiles = {
+        key: value
+        for key, value in unknown_smiles_dict.items()
+        if value != "No SMILES could be found"
+    }
+    known_smiles = {
+        key: value
+        for key, value in known_smiles_dict.items()
+        if value != "No SMILES could be found"
+    }
 
     # Convert input SMILES to a molecule
     known_mols = {}
-    for key,value in known_smiles.items():
+    for key, value in known_smiles.items():
         known_mol = Chem.MolFromSmiles(value)
         if known_mol is None:
-            raise ValueError("Invalid SMILES string for",key)
+            raise ValueError("Invalid SMILES string for", key)
         else:
-            known_mols.update({key:known_mol})
+            known_mols.update({key: known_mol})
     nearest_neighbor_mapping = {}
-    for unknownkey,value in unknown_smiles.items():
+    for unknownkey, value in unknown_smiles.items():
         query_mol = Chem.MolFromSmiles(value)
         if query_mol is None:
             raise ValueError("Invalid SMILES string")
@@ -55,5 +64,5 @@ def find_nearest_neighbors(unknown_smiles_dict, known_smiles_dict, similarity_cu
             index, similarity = similarities[i]
             if similarity >= similarity_cutoff:
                 neighbors.append((index, similarity))
-        nearest_neighbor_mapping.update({unknownkey:neighbors})
-    return nearest_neighbor_mapping
\ No newline at end of file
+        nearest_neighbor_mapping.update({unknownkey: neighbors})
+    return nearest_neighbor_mapping
diff --git a/app/ordering_components.py b/app/ordering_components.py
index 9022df5..32a8e4d 100644
--- a/app/ordering_components.py
+++ b/app/ordering_components.py
@@ -52,7 +52,9 @@ def get_ordering_components(message, logger):
     logger.debug(f"Computing scores for {len(message['results'])} results")
     db_conn = redis.Redis(connection_pool=redis_pool)
     novelty_scores_dict = get_novelty(message, logger).to_dict(orient="index")
-    novelty_scores = {node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()}
+    novelty_scores = {
+        node["drug"]: node["novelty_score"] for node in novelty_scores_dict.values()
+    }
     for result in tqdm(message.get("results") or []):
         clinical_evidence_score = get_clinical_evidence(
             result,
@@ -70,4 +72,6 @@ def get_ordering_components(message, logger):
             for node_bindings in result.get("node_bindings", {}).values():
                 for node_binding in node_bindings:
                     if node_binding["id"] in novelty_scores:
-                        result["ordering_components"]["novelty"] = novelty_scores[node_binding["id"]]
+                        result["ordering_components"]["novelty"] = novelty_scores[
+                            node_binding["id"]
+                        ]

From 44e9599cb714f028dc4e1eccbe77e39bc8601b46 Mon Sep 17 00:00:00 2001
From: Max Wang <max@covar.com>
Date: Fri, 4 Aug 2023 15:33:04 -0400
Subject: [PATCH 5/5] Add updated novelty code

---
 app/novelty/compute_novelty.py         | 42 ++++++++++---------
 app/novelty/extr_smile_molpro_by_id.py | 57 ++++++++++++++------------
 app/ordering_components.py             |  3 +-
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/app/novelty/compute_novelty.py b/app/novelty/compute_novelty.py
index 258a759..806919b 100644
--- a/app/novelty/compute_novelty.py
+++ b/app/novelty/compute_novelty.py
@@ -2,6 +2,7 @@
 from datetime import date
 import requests
 import numpy as np
+import traceback
 
 from .known import find_known_results
 from .extr_smile_molpro_by_id import mol_to_smile_molpro
@@ -78,7 +79,7 @@ def get_publication_info(pub_id):
         response.raise_for_status()
         response = response.json()
     except Exception:
-        return {
+        response = {
             "_meta": {
                 "n_results": 0,
             },
@@ -169,7 +170,6 @@ def extracting_drug_fda_publ_date(message, unknown):
     today = date.today()
 
     res_chk = 1
-    # for edge in message['knowledge_graph']['edges'].keys():
     query_known, query_unknown, query_chk = query_id(message)
     idi = -1
     for tmp in unknown:
@@ -406,12 +406,14 @@ def compute_novelty(message, logger):
     known, unknown = find_known_results(message)
     #
     # # Step 2
-    similarity_map = molecular_sim(known, unknown, message)
 
+    # start = time.time()
     df, query_chk = extracting_drug_fda_publ_date(message, unknown)
+    # print(f"Time to extract fda status and Publication data:{time.time()-start}")
     #         # print(df.head())
     #         # print(query_chk)
     #
+    # df.to_excel(f'DATAFRAME.xlsx', header=False, index=False)
     # df = pd.read_excel('DATAFRAME.xlsx', names=['edge', 'drug', 'fda status', 'publications', 'number_of_publ', 'age_oldest_pub'])
     # query_chk = 1
 
@@ -427,6 +429,20 @@ def compute_novelty(message, logger):
     # print(df.head())
     # print(similarity_map)
     if query_chk == 1:
+        # start = time.time()
+        try:
+            similarity_map = molecular_sim(known, unknown, message)
+            df["similarity"] = df.apply(
+                lambda row: similarity_map[row["drug"]][0][1]
+                if row["drug"] in similarity_map.keys()
+                else np.nan,
+                axis=1,
+            )
+        except Exception as e:
+            logger.error(traceback.format_exc())
+            df = df.assign(similarity=np.nan)
+
+        # print(f"Time to compute Molecular Similarity:{time.time() - start}")
         # Step 3:
         # calculating the recency
         df["recency"] = df.apply(
@@ -439,15 +455,9 @@ def compute_novelty(message, logger):
         )
         #
         # # Step 4:
-        # # This section will be added later. Currently just putting 'NaN':
+        # # Calculating the Similarity:
         # nearest_neighbours = calculate_nn_distance(res_known, res_unknown, 0, 1)
 
-        df["similarity"] = df.apply(
-            lambda row: similarity_map[row["drug"]][0][1]
-            if row["drug"] in similarity_map.keys()
-            else np.nan,
-            axis=1,
-        )
         # df = df.assign(similarity=np.nan)
 
         # # Step 5:
@@ -458,6 +468,7 @@ def compute_novelty(message, logger):
             ),
             axis=1,
         )
+        # df.to_excel(f'DATAFRAME_result.xlsx', header=False, index=False)
 
         # # # Step 6
         # # # Just sort them:
@@ -466,14 +477,5 @@ def compute_novelty(message, logger):
         )
     else:
         df = df.assign(novelty_score=0)
+    # df.to_excel(f'DATAFRAME_NOVELTY.xlsx', header=False, index=False)
     return df
-
-
-# for i in list(range(1, 5)):
-# start = time.time()
-# temp = compute_novelty('mergedAnnotatedOutput.json')
-# if temp.empty:
-#     print(f"No Results in mergedAnnotatedOutput.json")
-# else:
-#     temp_json = temp.to_json(f'mergedAnnotatedOutput_scores.json', orient='values')
-# print(time.time()-start)
diff --git a/app/novelty/extr_smile_molpro_by_id.py b/app/novelty/extr_smile_molpro_by_id.py
index 504e30e..e15a707 100644
--- a/app/novelty/extr_smile_molpro_by_id.py
+++ b/app/novelty/extr_smile_molpro_by_id.py
@@ -1,54 +1,59 @@
-#!/usr/bin/env python
 import requests
 
 
 def mol_to_smile_molpro(molecules):
     """
+    Args:
+        List
 
     Returns:
         Dict
 
-    Args:
-        List
-        Example: Attached sample_mols.json file
     """
 
-    url = "https://translator.broadinstitute.org/molecular_data_provider/compound/by_id"
+    url = "https://molepro.transltr.io/molecular_data_provider/compound/by_id"
     headers = {"accept": "application/json", "Content-Type": "application/json"}
 
-    data_mol = list(set(molecules))
     smiles = {}
 
-    response = requests.post(url, headers=headers, json=data_mol)
+    data_mol = list(set(molecules))
+    # print(f'init data: {len(data_mol)}')
+    while data_mol:
+        # print(f'before: {len(data_mol)}')
+        data_mol_before = len(data_mol)
+        response = requests.post(url, headers=headers, json=data_mol)
 
-    if response.status_code == 200:
-        json_response = response.json()
-        # print(json_response)
-        collec_url = json_response["url"]
-        temp_collec_response = requests.get(collec_url)
         if response.status_code == 200:
-            collec_response = temp_collec_response.json()
-            # print('\n')
-            # print(collec_response['elements'][0]['identifiers'])
-            if json_response["size"] > 0:
+            json_response = response.json()
+            collec_url = json_response["url"]
+            temp_collec_response = requests.get(collec_url)
+            if temp_collec_response.status_code == 200:
+                collec_response = temp_collec_response.json()
+
                 for i in range(json_response["size"]):
                     key_list = ["identifiers"]
                     if set(key_list).issubset(collec_response["elements"][i].keys()):
-                        # if 'smiles' in collec_response['elements'][i]['identifiers'].keys():
                         identifiers = collec_response["elements"][i]["identifiers"]
                         smile = identifiers.get("smiles", "No SMILES could be found")
                         smiles[data_mol[i]] = smile
                     else:
                         smiles[data_mol[i]] = "No identifiers could be found"
 
-            # Recursion: re-attempt to retrieve the SMILES for those (from the initial data_mol)
-            # for which the retrieval was not successful in the first attempt
-            if len(list(smiles.keys())) < len(data_mol):
-                diff = [mol for mol in data_mol if mol not in list(smiles.keys())]
-                diff_smiles = mol_to_smile_molpro(diff)
-                if len(diff_smiles) > 0:
-                    smiles.update(diff_smiles)
-    else:
-        print(f"Error: {response.status_code} - {response.text}")
+                # Remove molecules with successfully retrieved smiles from data_mol
+                data_mol = [mol for mol in data_mol if mol not in smiles]
+                data_mol_after = len(data_mol)
+                # print(f'after: {len(data_mol)}')
+
+                if data_mol_after == data_mol_before:
+                    break
+
+            else:
+                print(
+                    f"Error: {temp_collec_response.status_code} - {temp_collec_response.text}"
+                )
+                break
+        else:
+            print(f"Error: {response.status_code} - {response.text}")
+            break
 
     return smiles
diff --git a/app/ordering_components.py b/app/ordering_components.py
index 32a8e4d..979f234 100644
--- a/app/ordering_components.py
+++ b/app/ordering_components.py
@@ -1,13 +1,12 @@
 """Compute scores for each result in the given message."""
-import os
 import redis
 from tqdm import tqdm
 
 from .config import settings
 from .clinical_evidence.compute_clinical_evidence import compute_clinical_evidence
-
 from .novelty.compute_novelty import compute_novelty
 
+
 redis_pool = redis.ConnectionPool(
     host=settings.redis_host,
     port=settings.redis_port,