From c7269b66678648e272c4d6f397d69d46006a0afd Mon Sep 17 00:00:00 2001
From: Vincent Emonet
Date: Fri, 12 Jan 2024 11:59:16 +0000
Subject: [PATCH] improve training to remove grid

---
 src/train.py         | 131 ++++++++++++++++++++++++++-----------------
 src/train_compare.py | 119 ++++++++++++++++++++++-----------------
 2 files changed, 146 insertions(+), 104 deletions(-)

diff --git a/src/train.py b/src/train.py
index 22500fe..e0a57fb 100644
--- a/src/train.py
+++ b/src/train.py
@@ -14,6 +14,7 @@
 import torch
 from sklearn import ensemble, metrics
 from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
+from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, average_precision_score
 import xgboost as xgb
 from xgboost import XGBClassifier, DMatrix

@@ -302,16 +303,16 @@ def compute(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):

 ################### Train with a grid of hyperparameters to find the best

-def get_params_combinations(params):
-    keys, values = zip(*params.items())
-    combinations = [dict(zip(keys, v)) for v in product(*values)]
-    return combinations
+# def get_params_combinations(params):
+#     keys, values = zip(*params.items())
+#     combinations = [dict(zip(keys, v)) for v in product(*values)]
+#     return combinations

-def train_grid(
+def train_gpu(
     df_known_interactions: pd.DataFrame,
     df_drugs_embeddings: pd.DataFrame,
     df_targets_embeddings: pd.DataFrame,
-    params_grid: dict[str, int | float],
+    params: dict[str, int | float],
     save_model: str = "models/drug_target.pkl",
 ):
     """Train and compare a grid of hyperparameters
@@ -327,6 +328,7 @@
         "target": df_targets_embeddings,
     }

+    print("Generate DT pairs")
     # Get pairs and their labels: All given known drug-target pairs are 1
     # we add pairs for missing drug/targets combinations as 0 (not known as interacting)
     pairs, labels = generate_dt_pairs(df_known_interactions)

@@ -334,23 +336,26 @@
     # TODO: Split dataset for train/test?
     # X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=123)

+    print("Merging drug/target pairs and their labels in a DF")
     # Merge drug/target pairs and their labels in a DF
     train_df = pd.DataFrame(
         list(zip(pairs[:, 0], pairs[:, 1], labels)), columns=["drug", "target", "Class"]
     )

+    print("Merging embeddings in the DF")
     # Add the embeddings to the DF
     train_df = train_df.merge(embeddings["drug"], left_on="drug", right_on="drug").merge(
         embeddings["target"], left_on="target", right_on="target"
     )

+    print("Getting X and y")
     # X is the array of embeddings (drug+target), without other columns
     # y is the array of classes/labels (0 or 1)
     embedding_cols = train_df.columns.difference(["drug", "target", "Class"])
     X = train_df[embedding_cols].values
     y = train_df["Class"].values.ravel()
     print(f"Features count: {len(embedding_cols)}")
-    print(X)
-    print(y)
+    # print(X)
+    # print(y)

     ndrugs = len(embeddings["drug"])
     ntargets = len(embeddings["target"])
@@ -368,53 +373,73 @@
     # https://github.com/compomics/ms2pip/blob/a8c61b41044f3f756b4551d7866d8030e68b1570/train_scripts/train_xgboost_c.py#L143

     results = []
-    count = 1
-    combinations = get_params_combinations(params_grid)
+    # combinations = get_params_combinations(params_grid)

     # NOTE: Trying to use train to run CV on GPU
-    for param_combin in combinations:
-        param_combin["device"] = "cuda:2"
-        param_combin["tree_method"] = "hist"
-        # param_combin["eval_metric"] = "rmse" # Evaluation metric
-
-        print("Working on combination {}/{}".format(count, len(combinations)))
-        combination_time = time.time()
-        fold_results = []
-
-        # for fold, (train_index, test_index) in enumerate(kf.split(X)):
-        # Train model for each fold
-        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
-            X_train, X_test = X[train_index], X[test_index]
-            y_train, y_test = y[train_index], y[test_index]
-
-            # Send data to GPU for xgboost
-            send_time = time.time()
-            dtrain = xgb.DMatrix(X_train, label=y_train)
-            dtest = xgb.DMatrix(X_test, label=y_test)
-            print(f"Sending data to GPU took {time.time() - send_time}s")
-
-            # Train xgboost model
-            model = xgb.train(param_combin, dtrain, num_boost_round=100)
-
-            # Evaluate model
-            predictions = model.predict(dtest)
-            rmse = np.sqrt(((predictions - y_test) ** 2).mean())
-            fold_results.append(rmse)
-            del dtrain, dtest, model
-            gc.collect()  # Force garbage collection
-            print(f"Completed fold {fold + 1}/{n_splits} for combination {count}/{len(combinations)} in {time.time() - send_time}s")
-
-        print(f"Combination {count} took {time.time() - combination_time}s")
-
-        # Store the average RMSE for this parameter combination
-        avg_rmse = np.mean(fold_results)
-        results.append({**param_combin, 'rmse': avg_rmse})
-        count += 1
-
-    df = pd.DataFrame(results)
+    # for param_combin in combinations:
+    params["device"] = "cuda:0"
+    params["tree_method"] = "hist"
+    # param_combin["eval_metric"] = "rmse" # Evaluation metric
+
+    # print("Working on combination {}/{}".format(count, len(combinations)))
+    training_time = time.time()
+    fold_results = []
+
+    # for fold, (train_index, test_index) in enumerate(kf.split(X)):
+    # Train model for each fold
+    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
+        X_train, X_test = X[train_index], X[test_index]
+        y_train, y_test = y[train_index], y[test_index]
+
+        # Send data to GPU for xgboost
+        send_time = time.time()
+        dtrain = xgb.DMatrix(X_train, label=y_train)
+        dtest = xgb.DMatrix(X_test, label=y_test)
+        print(f"Sending data to GPU took {time.time() - send_time}s")
+
+        # Train xgboost model
+        model = xgb.train(params, dtrain, num_boost_round=100)
+
+        # Evaluate model
+        predictions = model.predict(dtest)
+        predictions_binary = np.round(predictions)  # Convert probabilities to binary outputs
+
+        # Calculate metrics
+        rmse = np.sqrt(((predictions - y_test) ** 2).mean())
+        precision = precision_score(y_test, predictions_binary)
+        recall = recall_score(y_test, predictions_binary)
+        accuracy = accuracy_score(y_test, predictions_binary)
+        roc_auc = roc_auc_score(y_test, predictions)
+        f1 = f1_score(y_test, predictions_binary)
+        average_precision = average_precision_score(y_test, predictions)
+
+        fold_results.append({
+            'rmse': rmse,
+            'precision': precision,
+            'recall': recall,
+            'accuracy': accuracy,
+            'roc_auc': roc_auc,
+            'f1': f1,
+            'average_precision': average_precision,
+        })
+        # rmse = np.sqrt(((predictions - y_test) ** 2).mean())
+        # fold_results.append(rmse)
+        del dtrain, dtest, model
+        gc.collect()  # Force garbage collection
+        print(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")
+
+    print(f"Training took {time.time() - training_time}s")
+
+    # # Store the average RMSE for this parameter combination
+    # avg_rmse = np.mean(fold_results)
+    # results.append({'rmse': avg_rmse})
+    # # count += 1
+    # df = pd.DataFrame(results)
+
+    # Average each metric over the CV folds
+    df_avg_metrics = pd.DataFrame(fold_results).mean().to_dict()

     print("TRAINING RESULTS")
-    print(df)
-    return df
+    print(df_avg_metrics)
+    return df_avg_metrics


 #######################
@@ -424,7 +449,7 @@
 #     # param_combin["n_jobs"] = n_jobs
 #     # param_combin["random_state"] = random_state
 #     # param_combin["tree_method"] = "gpu_hist"
-#     param_combin["device"] = "cuda:2"
+#     param_combin["device"] = "cuda:0"
 #     param_combin["tree_method"] = "hist"

 #     print("Working on combination {}/{}".format(count, len(combinations)))
diff --git a/src/train_compare.py b/src/train_compare.py
index ec673bf..644dfa7 100644
--- a/src/train_compare.py
+++ b/src/train_compare.py
@@ -1,7 +1,7 @@
 import os

 import pandas as pd
-from src.train import train, train_grid
+from src.train import train, train_gpu
 from src.utils import log, TrainingConfig
 from src.vectordb import init_vectordb

@@ -43,9 +43,8 @@ def drop_similar(df: str, col_id: str, threshold: float = 0.9):


-def exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
+def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
     """Exclude similarities given thresholds, and run training on grid"""
-    os.makedirs(out_dir, exist_ok=True)

     print(f"🔨 Training for {subject_sim_threshold} - {object_sim_threshold}")

@@ -66,7 +65,7 @@

     log.info(f"DF LENGTH AFTER DROPPING: {len(df_drugs)} drugs and {len(df_targets)} targets, and {len(df_known_dt)} known pairs")

-    score = train_grid(df_known_dt, df_drugs, df_targets, param_grid, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")
+    # score = train_gpu(df_known_dt, df_drugs, df_targets, params, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")

     # score_df = train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target_nosim.pkl", config=config)
     # score_df.insert(0, 'Drug sim threshold', config.subject_sim_threshold)
@@ -74,60 +73,78 @@
     # score_df.insert(2, 'CV nfold', config.cv_nfold)
     # score_df.insert(3, 'Max depth', config.max_depth)

-    return score
+    return df_known_dt, df_drugs, df_targets


-def train_grid_exclude_sim(input_dir, out_dir):
-    """Define the similarities thresholds and params grid, then run training"""
-    os.makedirs(out_dir, exist_ok=True)
-    # Shorter version for starting
-    subject_sim_thresholds = [1, 0.99]
-    object_sim_thresholds = [1, 0.99]
-    param_grid = {
-        'max_depth': [3, 4],
-        'learning_rate': [0.1, 0.01],
-        'subsample': [0.7, 0.8],
-        'colsample_bytree': [0.7, 0.8],
-        'gamma': [0, 1],
-        'reg_alpha': [0, 0.1],
-        'reg_lambda': [1, 2],
-        # 'n_estimators': [100, 200],
-    }
+# def train_grid_exclude_sim(input_dir, out_dir):
+#     """Define the similarities thresholds and params grid, then run training"""
+#     os.makedirs(out_dir, exist_ok=True)
+#     # Shorter version for starting
+#     # param_grid = {
+#     #     'max_depth': [3, 4],
+#     #     'learning_rate': [0.1, 0.01],
+#     #     'subsample': [0.7, 0.8],
+#     #     'colsample_bytree': [0.7, 0.8],
+#     #     'gamma': [0, 1],
+#     #     'reg_alpha': [0, 0.1],
+#     #     'reg_lambda': [1, 2],
+#     #     # 'n_estimators': [100, 200],
+#     # }

-    # Longer version
-    # subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
-    # object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
-    # param_grid = {
-    #     'max_depth': [3, 4, 5, 6],
-    #     'learning_rate': [0.1, 0.01, 0.05],
-    #     'subsample': [0.7, 0.8, 0.9],
-    #     'colsample_bytree': [0.7, 0.8, 0.9],
-    #     'gamma': [0, 1, 2],
-    #     'reg_alpha': [0, 0.1, 0.5],
-    #     'reg_lambda': [1, 2, 5],
-    #     'n_estimators': [100, 200, 300],
-    # }
+#     # Longer version
+#     # subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+#     # object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]

-    scores_df = pd.DataFrame()
-    for subject_sim_threshold in subject_sim_thresholds:
-        for object_sim_threshold in object_sim_thresholds:
-            sim_scores = exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold, object_sim_threshold)
-            sim_scores["subject_sim_threshold"] = subject_sim_threshold
-            sim_scores["object_sim_threshold"] = object_sim_threshold
-            scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)
-
-    # score_list = []
-    # for config in configs:
-    #     score_list.append(train_not_similar(input_dir, out_dir, config))
-    # print(score_list)
-    # combined_df = pd.concat(score_list)
-    # combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+#     scores_df = pd.DataFrame()
+#     for subject_sim_threshold in subject_sim_thresholds:
+#         for object_sim_threshold in object_sim_thresholds:
+#             sim_scores = exclude_similar(input_dir, out_dir, params, subject_sim_threshold, object_sim_threshold)
+#             sim_scores["subject_sim_threshold"] = subject_sim_threshold
+#             sim_scores["object_sim_threshold"] = object_sim_threshold
+#             scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)

-    print("SCORES DF", scores_df)
-    scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+#     # score_list = []
+#     # for config in configs:
+#     #     score_list.append(train_not_similar(input_dir, out_dir, config))
+#     # print(score_list)
+#     # combined_df = pd.concat(score_list)
+#     # combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+
+#     print("SCORES DF", scores_df)
+#     scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)


 if __name__ == "__main__":
-    train_grid_exclude_sim("data/opentargets", "data/grid")
+    # train_grid_exclude_sim("data/opentargets", "data/grid")
     # train_not_similar("data/opentargets", "data/opentargets_not_similar")
+    out_dir = "data/grid"
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Longer version:
+    subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+    object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+    params = {
+        'max_depth': 3,
+        'learning_rate': 0.1,
+        'subsample': 0.7,
+        'colsample_bytree': 0.7,
+        'gamma': 0,
+        'reg_alpha': 0.1,
+        'reg_lambda': 1,
+        # 'n_estimators': 100,
+    }
+    scores_df = pd.DataFrame()
+    for subject_sim_threshold in subject_sim_thresholds:
+        for object_sim_threshold in object_sim_thresholds:
+            # Exclude similar then run training on GPU
+            df_known_dt, df_drugs_embeddings, df_targets_embeddings = exclude_similar("data/opentargets", subject_sim_threshold, object_sim_threshold)
+            print(f"Similar excluded for {subject_sim_threshold}/{object_sim_threshold}")
+
+            scores = train_gpu(df_known_dt, df_drugs_embeddings, df_targets_embeddings, params)
+            scores["subject_sim_threshold"] = subject_sim_threshold
+            scores["object_sim_threshold"] = object_sim_threshold
+            # train_gpu returns a dict of averaged metrics, so wrap it in a one-row DataFrame before concatenating
+            scores_df = pd.concat([scores_df, pd.DataFrame([scores])], ignore_index=True)
+
+    print("SCORES DF", scores_df)
+    scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
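
Reviewer note: with the grid removed, train_gpu() takes a single hyperparameter dict and returns a plain dict of metrics averaged over the stratified CV folds (rmse, precision, recall, accuracy, roc_auc, f1, average_precision). Below is a minimal usage sketch, not part of the patch; the three CSV paths are hypothetical placeholders, since in the real flow the DataFrames come from exclude_similar():

    import pandas as pd
    from src.train import train_gpu

    # Hypothetical input files; exclude_similar() normally builds these DataFrames
    df_known_dt = pd.read_csv("data/opentargets/known_drug_targets.csv")  # assumed path
    df_drugs = pd.read_csv("data/opentargets/drugs_embeddings.csv")       # assumed path
    df_targets = pd.read_csv("data/opentargets/targets_embeddings.csv")   # assumed path

    # A single set of hyperparameters, no more grid
    params = {"max_depth": 3, "learning_rate": 0.1, "subsample": 0.7}
    scores = train_gpu(df_known_dt, df_drugs, df_targets, params)
    print(scores["roc_auc"], scores["f1"])

Two things to keep in mind: train_gpu() forces params["device"] = "cuda:0", so it needs an XGBoost build with CUDA support (edit that line for CPU runs), and since the return value is a dict rather than a DataFrame, callers accumulating results should wrap it first, as the __main__ block does with pd.concat([scores_df, pd.DataFrame([scores])], ignore_index=True).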