From c7269b66678648e272c4d6f397d69d46006a0afd Mon Sep 17 00:00:00 2001
From: Vincent Emonet
Date: Fri, 12 Jan 2024 11:59:16 +0000
Subject: [PATCH] improve training to remove grid

---
 src/train.py         | 131 ++++++++++++++++++++++++++-----------------
 src/train_compare.py | 119 ++++++++++++++++++++++-----------------
 2 files changed, 146 insertions(+), 104 deletions(-)

diff --git a/src/train.py b/src/train.py
index 22500fe..e0a57fb 100644
--- a/src/train.py
+++ b/src/train.py
@@ -14,6 +14,7 @@
 import torch
 from sklearn import ensemble, metrics
 from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
+from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, average_precision_score
 import xgboost as xgb
 from xgboost import XGBClassifier, DMatrix

@@ -302,16 +303,16 @@ def compute(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):

 ################### Train with a grid of hyperparameters to find the best

-def get_params_combinations(params):
-    keys, values = zip(*params.items())
-    combinations = [dict(zip(keys, v)) for v in product(*values)]
-    return combinations
+# def get_params_combinations(params):
+#     keys, values = zip(*params.items())
+#     combinations = [dict(zip(keys, v)) for v in product(*values)]
+#     return combinations

-def train_grid(
+def train_gpu(
     df_known_interactions: pd.DataFrame,
     df_drugs_embeddings: pd.DataFrame,
     df_targets_embeddings: pd.DataFrame,
-    params_grid: dict[str, int | float],
+    params: dict[str, int | float],
     save_model: str = "models/drug_target.pkl",
 ):
     """Train and compare a grid of hyperparameters
@@ -327,6 +328,7 @@
         "target": df_targets_embeddings,
     }

+    print("Generate DT pairs")
     # Get pairs and their labels: All given known drug-target pairs are 1
     # we add pairs for missing drug/targets combinations as 0 (not known as interacting)
     pairs, labels = generate_dt_pairs(df_known_interactions)

@@ -334,23 +336,26 @@
     # TODO: Split dataset for train/test?
     # X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=123)

+    print("Merging drug/target pairs and their labels in a DF")
     # Merge drug/target pairs and their labels in a DF
     train_df = pd.DataFrame(
         list(zip(pairs[:, 0], pairs[:, 1], labels)), columns=["drug", "target", "Class"]
     )

+    print("Merging embeddings in the DF")
     # Add the embeddings to the DF
     train_df = train_df.merge(embeddings["drug"], left_on="drug", right_on="drug").merge(
         embeddings["target"], left_on="target", right_on="target"
     )

+    print("Getting X and y")
     # X is the array of embeddings (drug+target), without other columns
     # y is the array of classes/labels (0 or 1)
     embedding_cols = train_df.columns.difference(["drug", "target", "Class"])
     X = train_df[embedding_cols].values
     y = train_df["Class"].values.ravel()
     print(f"Features count: {len(embedding_cols)}")
-    print(X)
-    print(y)
+    # print(X)
+    # print(y)

     ndrugs = len(embeddings["drug"])
     ntargets = len(embeddings["target"])
@@ -368,53 +373,73 @@
     # https://github.com/compomics/ms2pip/blob/a8c61b41044f3f756b4551d7866d8030e68b1570/train_scripts/train_xgboost_c.py#L143

     results = []
-    count = 1
-    combinations = get_params_combinations(params_grid)
+    # combinations = get_params_combinations(params_grid)

     # NOTE: Trying to use train to run CV on GPU
-    for param_combin in combinations:
-        param_combin["device"] = "cuda:2"
-        param_combin["tree_method"] = "hist"
-        # param_combin["eval_metric"] = "rmse" # Evaluation metric
-
-        print("Working on combination {}/{}".format(count, len(combinations)))
-        combination_time = time.time()
-        fold_results = []
-
-        # for fold, (train_index, test_index) in enumerate(kf.split(X)):
-        # Train model for each fold
-        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
-            X_train, X_test = X[train_index], X[test_index]
-            y_train, y_test = y[train_index], y[test_index]
-
-            # Send data to GPU for xgboost
-            send_time = time.time()
-            dtrain = xgb.DMatrix(X_train, label=y_train)
-            dtest = xgb.DMatrix(X_test, label=y_test)
-            print(f"Sending data to GPU took {time.time() - send_time}s")
-
-            # Train xgboost model
-            model = xgb.train(param_combin, dtrain, num_boost_round=100)
-
-            # Evaluate model
-            predictions = model.predict(dtest)
-            rmse = np.sqrt(((predictions - y_test) ** 2).mean())
-            fold_results.append(rmse)
-            del dtrain, dtest, model
-            gc.collect()  # Force garbage collection
-            print(f"Completed fold {fold + 1}/{n_splits} for combination {count}/{len(combinations)} in {time.time() - send_time}s")
-
-        print(f"Combination {count} took {time.time() - combination_time}s")
-
-        # Store the average RMSE for this parameter combination
-        avg_rmse = np.mean(fold_results)
-        results.append({**param_combin, 'rmse': avg_rmse})
-        count += 1
-
-    df = pd.DataFrame(results)
+    # for param_combin in combinations:
+    params["device"] = "cuda:0"
+    params["tree_method"] = "hist"
+    # param_combin["eval_metric"] = "rmse" # Evaluation metric
+
+    # print("Working on combination {}/{}".format(count, len(combinations)))
+    training_time = time.time()
+    fold_results = []
+
+    # for fold, (train_index, test_index) in enumerate(kf.split(X)):
+    # Train model for each fold
+    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
+        X_train, X_test = X[train_index], X[test_index]
+        y_train, y_test = y[train_index], y[test_index]
+
+        # Send data to GPU for xgboost
+        send_time = time.time()
+        dtrain = xgb.DMatrix(X_train, label=y_train)
+        dtest = xgb.DMatrix(X_test, label=y_test)
+        print(f"Sending data to GPU took {time.time() - send_time}s")
+
+        # Train xgboost model
+        model = xgb.train(params, dtrain, num_boost_round=100)
+
+        # Evaluate model
+        predictions = model.predict(dtest)
+        predictions_binary = np.round(predictions)  # Convert probabilities to binary outputs
+
+        # Calculate metrics
+        rmse = np.sqrt(((predictions - y_test) ** 2).mean())
+        precision = precision_score(y_test, predictions_binary)
+        recall = recall_score(y_test, predictions_binary)
+        accuracy = accuracy_score(y_test, predictions_binary)
+        roc_auc = roc_auc_score(y_test, predictions)
+        f1 = f1_score(y_test, predictions_binary)
+        average_precision = average_precision_score(y_test, predictions)
+
+        fold_results.append({
+            'rmse': rmse,
+            'precision': precision,
+            'recall': recall,
+            'accuracy': accuracy,
+            'roc_auc': roc_auc,
+            'f1': f1,
+            'average_precision': average_precision,
+        })
+        # rmse = np.sqrt(((predictions - y_test) ** 2).mean())
+        # fold_results.append(rmse)
+        del dtrain, dtest, model
+        gc.collect()  # Force garbage collection
+        print(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")
+
+    print(f"Training took {time.time() - training_time}s")
+
+    # # Store the average RMSE for this parameter combination
+    # avg_rmse = np.mean(fold_results)
+    # results.append({'rmse': avg_rmse})
+    # # count += 1
+    # df = pd.DataFrame(results)
+
+    # Average each metric over the CV folds
+    df_avg_metrics = pd.DataFrame(fold_results).mean().to_dict()

     print("TRAINING RESULTS")
-    print(df)
-    return df
+    print(df_avg_metrics)
+    return df_avg_metrics


 #######################
@@ -424,7 +449,7 @@
 #     # param_combin["n_jobs"] = n_jobs
 #     # param_combin["random_state"] = random_state
 #     # param_combin["tree_method"] = "gpu_hist"
-#     param_combin["device"] = "cuda:2"
+#     param_combin["device"] = "cuda:0"
 #     param_combin["tree_method"] = "hist"

 #     print("Working on combination {}/{}".format(count, len(combinations)))
diff --git a/src/train_compare.py b/src/train_compare.py
index ec673bf..644dfa7 100644
--- a/src/train_compare.py
+++ b/src/train_compare.py
@@ -1,7 +1,7 @@
 import os

 import pandas as pd
-from src.train import train, train_grid
+from src.train import train, train_gpu
 from src.utils import log, TrainingConfig
 from src.vectordb import init_vectordb

@@ -43,9 +43,8 @@ def drop_similar(df: str, col_id: str, threshold: float = 0.9):


-def exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
+def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
     """Exclude similarities given thresholds, and run training on grid"""
-    os.makedirs(out_dir, exist_ok=True)

     print(f"🔨 Training for {subject_sim_threshold} - {object_sim_threshold}")

@@ -66,7 +65,7 @@

     log.info(f"DF LENGTH AFTER DROPPING: {len(df_drugs)} drugs and {len(df_targets)} targets, and {len(df_known_dt)} known pairs")

-    score = train_grid(df_known_dt, df_drugs, df_targets, param_grid, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")
+    # score = train_gpu(df_known_dt, df_drugs, df_targets, params, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")

     # score_df = train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target_nosim.pkl", config=config)
     # score_df.insert(0, 'Drug sim threshold', config.subject_sim_threshold)
@@ -74,60 +73,78 @@
     # score_df.insert(2, 'CV nfold', config.cv_nfold)
     # score_df.insert(3, 'Max depth', config.max_depth)

-    return score
+    return df_known_dt, df_drugs, df_targets


-def train_grid_exclude_sim(input_dir, out_dir):
-    """Define the similarities thresholds and params grid, then run training"""
-    os.makedirs(out_dir, exist_ok=True)
-    # Shorter version for starting
-    subject_sim_thresholds = [1, 0.99]
-    object_sim_thresholds = [1, 0.99]
-    param_grid = {
-        'max_depth': [3, 4],
-        'learning_rate': [0.1, 0.01],
-        'subsample': [0.7, 0.8],
-        'colsample_bytree': [0.7, 0.8],
-        'gamma': [0, 1],
-        'reg_alpha': [0, 0.1],
-        'reg_lambda': [1, 2],
-        # 'n_estimators': [100, 200],
-    }
+# def train_grid_exclude_sim(input_dir, out_dir):
+#     """Define the similarities thresholds and params grid, then run training"""
+#     os.makedirs(out_dir, exist_ok=True)
+#     # Shorter version for starting
+#     # param_grid = {
+#     #     'max_depth': [3, 4],
+#     #     'learning_rate': [0.1, 0.01],
+#     #     'subsample': [0.7, 0.8],
+#     #     'colsample_bytree': [0.7, 0.8],
+#     #     'gamma': [0, 1],
+#     #     'reg_alpha': [0, 0.1],
+#     #     'reg_lambda': [1, 2],
+#     #     # 'n_estimators': [100, 200],
+#     # }

-    # Longer version
-    # subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
-    # object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
-    # param_grid = {
-    #     'max_depth': [3, 4, 5, 6],
-    #     'learning_rate': [0.1, 0.01, 0.05],
-    #     'subsample': [0.7, 0.8, 0.9],
-    #     'colsample_bytree': [0.7, 0.8, 0.9],
-    #     'gamma': [0, 1, 2],
-    #     'reg_alpha': [0, 0.1, 0.5],
-    #     'reg_lambda': [1, 2, 5],
-    #     'n_estimators': [100, 200, 300],
-    # }
+#     # Longer version
+#     # subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+#     # object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]

-    scores_df = pd.DataFrame()
-    for subject_sim_threshold in subject_sim_thresholds:
-        for object_sim_threshold in object_sim_thresholds:
-            sim_scores = exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold, object_sim_threshold)
-            sim_scores["subject_sim_threshold"] = subject_sim_threshold
-            sim_scores["object_sim_threshold"] = object_sim_threshold
-            scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)
-
-    # score_list = []
-    # for config in configs:
-    #     score_list.append(train_not_similar(input_dir, out_dir, config))
-    # print(score_list)
-    # combined_df = pd.concat(score_list)
-    # combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+#     scores_df = pd.DataFrame()
+#     for subject_sim_threshold in subject_sim_thresholds:
+#         for object_sim_threshold in object_sim_thresholds:
+#             sim_scores = exclude_similar(input_dir, out_dir, params, subject_sim_threshold, object_sim_threshold)
+#             sim_scores["subject_sim_threshold"] = subject_sim_threshold
+#             sim_scores["object_sim_threshold"] = object_sim_threshold
+#             scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)

-    print("SCORES DF", scores_df)
-    scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+#     # score_list = []
+#     # for config in configs:
+#     #     score_list.append(train_not_similar(input_dir, out_dir, config))
+#     # print(score_list)
+#     # combined_df = pd.concat(score_list)
+#     # combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
+
+#     print("SCORES DF", scores_df)
+#     scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)


 if __name__ == "__main__":
-    train_grid_exclude_sim("data/opentargets", "data/grid")
+    # train_grid_exclude_sim("data/opentargets", "data/grid")
     # train_not_similar("data/opentargets", "data/opentargets_not_similar")
+    out_dir = "data/grid"
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Longer version:
+    subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+    object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
+    params = {
+        'max_depth': 3,
+        'learning_rate': 0.1,
+        'subsample': 0.7,
+        'colsample_bytree': 0.7,
+        'gamma': 0,
+        'reg_alpha': 0.1,
+        'reg_lambda': 1,
+        # 'n_estimators': 100,
+    }
+    scores_df = pd.DataFrame()
+    for subject_sim_threshold in subject_sim_thresholds:
+        for object_sim_threshold in object_sim_thresholds:
+            # Exclude similar then run training on GPU
+            df_known_dt, df_drugs_embeddings, df_targets_embeddings = exclude_similar("data/opentargets", subject_sim_threshold, object_sim_threshold)
+            print(f"Similar excluded for {subject_sim_threshold}/{object_sim_threshold}")
+
+            scores = train_gpu(df_known_dt, df_drugs_embeddings, df_targets_embeddings, params)
+            scores["subject_sim_threshold"] = subject_sim_threshold
+            scores["object_sim_threshold"] = object_sim_threshold
+            # train_gpu returns a dict of averaged metrics, so wrap it in a one-row DataFrame before concatenating
+            scores_df = pd.concat([scores_df, pd.DataFrame([scores])], ignore_index=True)
+
+    print("SCORES DF", scores_df)
+    scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
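
Reviewer note: with the grid removed, train_gpu() takes a single hyperparameter dict and returns a plain dict of metrics averaged over the stratified CV folds (rmse, precision, recall, accuracy, roc_auc, f1, average_precision). Below is a minimal usage sketch, not part of the patch; the three CSV paths are hypothetical placeholders, since in the real flow the DataFrames come from exclude_similar():

    import pandas as pd
    from src.train import train_gpu

    # Hypothetical input files; exclude_similar() normally builds these DataFrames
    df_known_dt = pd.read_csv("data/opentargets/known_drug_targets.csv")  # assumed path
    df_drugs = pd.read_csv("data/opentargets/drugs_embeddings.csv")       # assumed path
    df_targets = pd.read_csv("data/opentargets/targets_embeddings.csv")   # assumed path

    # A single set of hyperparameters, no more grid
    params = {"max_depth": 3, "learning_rate": 0.1, "subsample": 0.7}
    scores = train_gpu(df_known_dt, df_drugs, df_targets, params)
    print(scores["roc_auc"], scores["f1"])

Two things to keep in mind: train_gpu() forces params["device"] = "cuda:0", so it needs an XGBoost build with CUDA support (edit that line for CPU runs), and since the return value is a dict rather than a DataFrame, callers accumulating results should wrap it first, as the __main__ block does with pd.concat([scores_df, pd.DataFrame([scores])], ignore_index=True).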