Skip to content


improve training to remove grid
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Jan 12, 2024
1 parent 823ab88 commit c7269b6
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 104 deletions.
131 changes: 78 additions & 53 deletions src/
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import torch
from sklearn import ensemble, metrics
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, average_precision_score
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

Expand Down Expand Up @@ -302,16 +303,16 @@ def compute(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):
################### Train with a grid of hyperparameters to find the best

def get_params_combinations(params):
    """Expand a hyperparameter grid into every combination of its values.

    Args:
        params: Mapping of parameter name to an iterable of candidate values.

    Returns:
        A list of dicts, one per element of the Cartesian product of the
        candidate values; each dict maps every parameter name to one value.
        An empty grid yields a single empty combination (``[{}]``), and a
        parameter with no candidate values yields an empty list.
    """
    names = list(params)
    # Iterating a dict and its .values() view follows the same order, so names
    # and values stay aligned. product() called with no iterables yields one
    # empty tuple, which means an empty grid no longer fails on tuple unpacking.
    return [dict(zip(names, combo)) for combo in product(*params.values())]
# def get_params_combinations(params):
# keys, values = zip(*params.items())
# combinations = [dict(zip(keys, v)) for v in product(*values)]
# return combinations

def train_grid(
def train_gpu(
df_known_interactions: pd.DataFrame,
df_drugs_embeddings: pd.DataFrame,
df_targets_embeddings: pd.DataFrame,
params_grid: dict[str, int | float],
params: dict[str, int | float],
save_model: str = "models/drug_target.pkl",
"""Train and compare a grid of hyperparameters
Expand All @@ -327,30 +328,34 @@ def train_grid(
"target": df_targets_embeddings,

print("Generate DT pairs")
# Get pairs and their labels: All given known drug-target pairs are 1
# we add pairs for missing drug/targets combinations as 0 (not known as interacting)
pairs, labels = generate_dt_pairs(df_known_interactions)

# TODO: Split dataset for train/test?
# X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=123)

print("Merging drug/target pairs and their labels in a DF")
# Merge drug/target pairs and their labels in a DF
train_df = pd.DataFrame(
list(zip(pairs[:, 0], pairs[:, 1], labels)), columns=["drug", "target", "Class"]
print("Merging embeddings in the DF")
# Add the embeddings to the DF
train_df = train_df.merge(embeddings["drug"], left_on="drug", right_on="drug").merge(
embeddings["target"], left_on="target", right_on="target"

print("Getting X and y")
# X is the array of embeddings (drug+target), without other columns
# y is the array of classes/labels (0 or 1)
embedding_cols = train_df.columns.difference(["drug", "target", "Class"])
X = train_df[embedding_cols].values
y = train_df["Class"].values.ravel()
print(f"Features count: {len(embedding_cols)}")
# print(X)
# print(y)

ndrugs = len(embeddings["drug"])
ntargets = len(embeddings["target"])
Expand All @@ -368,53 +373,73 @@ def train_grid(

results = []
count = 1
combinations = get_params_combinations(params_grid)
# combinations = get_params_combinations(params_grid)

# NOTE: Trying to use train to run CV on GPU
for param_combin in combinations:
param_combin["device"] = "cuda:2"
param_combin["tree_method"] = "hist"
# param_combin["eval_metric"] = "rmse" # Evaluation metric

print("Working on combination {}/{}".format(count, len(combinations)))
combination_time = time.time()
fold_results = []

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
# Train model for each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Send data to GPU for xgboost
send_time = time.time()
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
print(f"Sending data to GPU took {time.time() - send_time}s")

# Train xgboost model
model = xgb.train(param_combin, dtrain, num_boost_round=100)

# Evaluate model
predictions = model.predict(dtest)
rmse = np.sqrt(((predictions - y_test) ** 2).mean())
del dtrain, dtest, model
gc.collect() # Force garbage collection
print(f"Completed fold {fold + 1}/{n_splits} for combination {count}/{len(combinations)} in {time.time() - send_time}s")

print(f"Combination {count} took {time.time() - combination_time}s")

# Store the average RMSE for this parameter combination
avg_rmse = np.mean(fold_results)
results.append({**param_combin, 'rmse': avg_rmse})
count += 1

df = pd.DataFrame(results)
# for param_combin in combinations:
params["device"] = "cuda:0"
params["tree_method"] = "hist"
# param_combin["eval_metric"] = "rmse" # Evaluation metric

# print("Working on combination {}/{}".format(count, len(combinations)))
combination_time = time.time()
fold_results = []

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
# Train model for each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Send data to GPU for xgboost
send_time = time.time()
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
print(f"Sending data to GPU took {time.time() - send_time}s")

# Train xgboost model
model = xgb.train(params, dtrain, num_boost_round=100)

# Evaluate model
predictions = model.predict(dtest)
predictions_binary = np.round(predictions) # Convert probabilities to binary outputs

# Calculate metrics
rmse = np.sqrt(((predictions - y_test) ** 2).mean())
precision = precision_score(y_test, predictions_binary)
recall = recall_score(y_test, predictions_binary)
accuracy = accuracy_score(y_test, predictions_binary)
roc_auc = roc_auc_score(y_test, predictions)
f1 = f1_score(y_test, predictions_binary)
average_precision = average_precision_score(y_test, predictions)

'rmse': rmse,
'precision': precision,
'recall': recall,
'accuracy': accuracy,
'roc_auc': roc_auc,
'f1': f1,
'average_precision': average_precision
# rmse = np.sqrt(((predictions - y_test) ** 2).mean())
# fold_results.append(rmse)
del dtrain, dtest, model
gc.collect() # Force garbage collection
print(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")

print(f"Combination took {time.time() - combination_time}s")

# # Store the average RMSE for this parameter combination
# avg_rmse = np.mean(fold_results)
# results.append({'rmse': avg_rmse})
# # count += 1
# df = pd.DataFrame(results)

df_avg_metrics = pd.DataFrame(fold_results).mean().to_dict()
return df
return df_avg_metrics


Expand All @@ -424,7 +449,7 @@ def train_grid(
# # param_combin["n_jobs"] = n_jobs
# # param_combin["random_state"] = random_state
# # param_combin["tree_method"] = "gpu_hist"
# param_combin["device"] = "cuda:2"
# param_combin["device"] = "cuda:0"
# param_combin["tree_method"] = "hist"

# print("Working on combination {}/{}".format(count, len(combinations)))
Expand Down
119 changes: 68 additions & 51 deletions src/
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pandas as pd

from src.train import train, train_grid
from src.train import train, train_gpu
from src.utils import log, TrainingConfig
from src.vectordb import init_vectordb

Expand Down Expand Up @@ -43,9 +43,8 @@ def drop_similar(df: str, col_id: str, threshold: float = 0.9):

def exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
"""Exclude similarities given thresholds, and run training on grid"""
os.makedirs(out_dir, exist_ok=True)

print(f"🔨 Training for {subject_sim_threshold} - {object_sim_threshold}")

Expand All @@ -66,68 +65,86 @@ def exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold:"DF LENGTH AFTER DROPPING: {len(df_drugs)} drugs and {len(df_targets)} targets, and {len(df_known_dt)} known pairs")

score = train_grid(df_known_dt, df_drugs, df_targets, param_grid, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")
# score = train_gpu(df_known_dt, df_drugs, df_targets, params, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")

# score_df = train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target_nosim.pkl", config=config)
# score_df.insert(0, 'Drug sim threshold', config.subject_sim_threshold)
# score_df.insert(1, 'Target sim threshold', config.object_sim_threshold)
# score_df.insert(2, 'CV nfold', config.cv_nfold)
# score_df.insert(3, 'Max depth', config.max_depth)

return score
return df_known_dt, df_drugs, df_targets

def train_grid_exclude_sim(input_dir, out_dir):
"""Define the similarities thresholds and params grid, then run training"""
os.makedirs(out_dir, exist_ok=True)
# Shorter version for starting
subject_sim_thresholds = [1, 0.99]
object_sim_thresholds = [1, 0.99]
param_grid = {
'max_depth': [3, 4],
'learning_rate': [0.1, 0.01],
'subsample': [0.7, 0.8],
'colsample_bytree': [0.7, 0.8],
'gamma': [0, 1],
'reg_alpha': [0, 0.1],
'reg_lambda': [1, 2],
# 'n_estimators': [100, 200],
# def train_grid_exclude_sim(input_dir, out_dir):
# """Define the similarities thresholds and params grid, then run training"""
# os.makedirs(out_dir, exist_ok=True)
# # Shorter version for starting
# # param_grid = {
# # 'max_depth': [3, 4],
# # 'learning_rate': [0.1, 0.01],
# # 'subsample': [0.7, 0.8],
# # 'colsample_bytree': [0.7, 0.8],
# # 'gamma': [0, 1],
# # 'reg_alpha': [0, 0.1],
# # 'reg_lambda': [1, 2],
# # # 'n_estimators': [100, 200],
# # }

# Longer version
# subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
# object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
# param_grid = {
# 'max_depth': [3, 4, 5, 6],
# 'learning_rate': [0.1, 0.01, 0.05],
# 'subsample': [0.7, 0.8, 0.9],
# 'colsample_bytree': [0.7, 0.8, 0.9],
# 'gamma': [0, 1, 2],
# 'reg_alpha': [0, 0.1, 0.5],
# 'reg_lambda': [1, 2, 5],
# 'n_estimators': [100, 200, 300],
# }
# # Longer version
# # subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
# # object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]

scores_df = pd.DataFrame()
for subject_sim_threshold in subject_sim_thresholds:
for object_sim_threshold in object_sim_thresholds:
sim_scores = exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold, object_sim_threshold)
sim_scores["subject_sim_threshold"] = subject_sim_threshold
sim_scores["object_sim_threshold"] = object_sim_threshold
scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)

# score_list = []
# for config in configs:
# score_list.append(train_not_similar(input_dir, out_dir, config))
# print(score_list)
# combined_df = pd.concat(score_list)
# combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
# scores_df = pd.DataFrame()
# for subject_sim_threshold in subject_sim_thresholds:
# for object_sim_threshold in object_sim_thresholds:
# sim_scores = exclude_similar(input_dir, out_dir, params, subject_sim_threshold, object_sim_threshold)
# sim_scores["subject_sim_threshold"] = subject_sim_threshold
# sim_scores["object_sim_threshold"] = object_sim_threshold
# scores_df = pd.concat([scores_df, sim_scores], ignore_index=True)

print("SCORES DF", scores_df)
scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
# # score_list = []
# # for config in configs:
# # score_list.append(train_not_similar(input_dir, out_dir, config))
# # print(score_list)
# # combined_df = pd.concat(score_list)
# # combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)

# print("SCORES DF", scores_df)
# scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)

if __name__ == "__main__":
    # Script entry point: for every (drug, target) similarity-threshold pair,
    # drop entities that are too similar, train with a fixed set of XGBoost
    # hyperparameters, and write the per-pair scores to one CSV file.

    # train_grid_exclude_sim("data/opentargets", "data/grid")
    # train_not_similar("data/opentargets", "data/opentargets_not_similar")
    out_dir = "data/grid"
    os.makedirs(out_dir, exist_ok=True)

    # Longer version:
    subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
    object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
    params = {
        'max_depth': 3,
        'learning_rate': 0.1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'gamma': 0,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        # 'n_estimators': 100,
    }

    # train_gpu returns a plain dict of averaged metrics. pd.concat only accepts
    # Series/DataFrame objects, so collect one dict per threshold pair and
    # build the DataFrame once at the end.
    score_rows = []
    for subject_sim_threshold in subject_sim_thresholds:
        for object_sim_threshold in object_sim_thresholds:
            # Exclude similar then run training on GPU
            df_known_dt, df_drugs_embeddings, df_targets_embeddings = exclude_similar(
                "data/opentargets", subject_sim_threshold, object_sim_threshold
            )
            print(f"Similar excluded for {subject_sim_threshold}/{object_sim_threshold}")

            scores = train_gpu(df_known_dt, df_drugs_embeddings, df_targets_embeddings, params)
            scores["subject_sim_threshold"] = subject_sim_threshold
            scores["object_sim_threshold"] = object_sim_threshold
            score_rows.append(scores)

    scores_df = pd.DataFrame(score_rows)
    print("SCORES DF", scores_df)
    scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)

0 comments on commit c7269b6

Please sign in to comment.