Skip to content

Commit

Permalink
Update train.py, changing RF to XGBoost
Browse files Browse the repository at this point in the history
  • Loading branch information
renayang2023 authored Apr 12, 2024
1 parent ace2287 commit 7d3692f
Showing 1 changed file with 101 additions and 68 deletions.
169 changes: 101 additions & 68 deletions src/predict_drug_target/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import json
from datetime import date, datetime
from itertools import product
import requests

import numpy as np
import pandas as pd
Expand All @@ -16,11 +17,12 @@
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, f1_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

from predict_drug_target.utils import log, TrainingConfig
from predict_drug_target.vectordb import init_vectordb
from src.utils import log, TrainingConfig
from src.vectordb import init_vectordb

vectordb = init_vectordb(recreate=False)

Expand All @@ -46,26 +48,6 @@ def generate_dt_pairs(dt_df):
return pairs, labels


def balance_data(pairs, classes, n_proportion):
    """Subsample the negative ("don't interact") pairs so that only
    n_proportion times the number of positive pairs is kept.

    Returns the concatenated (pairs, classes) arrays: all positives first,
    followed by the randomly selected negatives. Uses the global NumPy RNG.
    """
    classes = np.array(classes)
    pairs = np.array(pairs)

    pos_idx = np.where(classes == 1)[0]
    neg_idx = np.where(classes == 0)[0]

    # Shuffle negatives in place, then keep only the leading slice.
    np.random.shuffle(neg_idx)
    kept_neg = neg_idx[: n_proportion * pos_idx.shape[0]]

    print(f"True positives: {len(pos_idx)}")
    print(f"True negatives: {len(neg_idx)}")

    balanced_pairs = np.concatenate((pairs[pos_idx], pairs[kept_neg]), axis=0)
    balanced_classes = np.concatenate((classes[pos_idx], classes[kept_neg]), axis=0)
    return balanced_pairs, balanced_classes



def multimetric_score(estimator, X_test, y_test, scorers):
"""Return a dict of score for multimetric scoring"""
scores = {}
Expand All @@ -88,6 +70,24 @@ def multimetric_score(estimator, X_test, y_test, scorers):
return scores


def balance_data(pairs, classes, n_proportion):
    """Balance the dataset by keeping all positive pairs and only
    ``n_proportion`` times as many randomly chosen negative pairs.

    Parameters:
        pairs: sequence of (drug, target) pairs, convertible to a NumPy array.
        classes: sequence of 0/1 labels aligned with ``pairs``.
        n_proportion: number of negatives to keep per positive.

    Returns:
        Tuple of (pairs, classes) NumPy arrays — all positives first,
        followed by the sampled negatives. Uses the global NumPy RNG
        (seed it externally for reproducibility).
    """
    classes = np.array(classes)
    pairs = np.array(pairs)

    indices_true = np.where(classes == 1)[0]
    indices_false = np.where(classes == 0)[0]

    # Shuffle negatives in place, then take the first n_proportion * n_positives.
    np.random.shuffle(indices_false)
    indices = indices_false[: (n_proportion * indices_true.shape[0])]

    print(f"True positives: {len(indices_true)}")
    # NOTE(review): this prints the TOTAL negative count, not the sampled
    # subset size (len(indices)) — confirm which was intended.
    print(f"True negatives: {len(indices_false)}")
    pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
    classes = np.concatenate((classes[indices_true], classes[indices]), axis=0)

    return pairs, classes


def get_scores(clf, X_new, y_new):
scoring = ["precision", "recall", "accuracy", "roc_auc", "f1", "average_precision"]
scorers = metrics._scorer._check_multimetric_scoring(clf, scoring=scoring)
Expand Down Expand Up @@ -215,19 +215,34 @@ def train(
# nb_model = GaussianNB()
# lr_model = linear_model.LogisticRegression()
# rf_model = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf_model = ensemble.RandomForestClassifier(
# rf_model = ensemble.RandomForestClassifier(
# n_estimators=200,
# criterion="gini", # log_loss
# max_depth=None, # config.max_depth
# min_samples_split=2,
# min_samples_leaf=1,
# max_features="sqrt",
# n_jobs=-1,
# # class_weight="balanced",
# )
xgb_model = XGBClassifier(
n_estimators=200,
criterion="log_loss",
max_depth=config.max_depth,
min_samples_split=2,
min_samples_leaf=1,
max_features="sqrt",
max_depth=None, #config.max_depth,
learning_rate=0.1,
subsample=0.8,
colsample_bytree=0.8,
gamma=0,
reg_alpha=0,
reg_lambda=1,
objective='binary:logistic', # For binary classification
n_jobs=-1,
class_weight="balanced",
random_state=42,
        tree_method='hist',  # Histogram algorithm; runs on CPU unless device='gpu' is enabled below
# device='gpu',
)

# clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]
clfs = [("Random Forest", rf_model)] # "XGBoost", xgb_model
clfs = [("XGBoost", xgb_model)] # "Random Forest", rf_model

n_seed = 100
n_fold = 10
Expand All @@ -247,7 +262,7 @@ def train(

os.makedirs("models", exist_ok=True)
with open(save_model, "wb") as f:
pickle.dump(rf_model, f) # xgb_model
pickle.dump(xgb_model, f) # rf_model

return agg_df.mean()
# return agg_df.to_dict(orient="records")
Expand All @@ -256,6 +271,11 @@ def train(
################### Train with a grid of hyperparameters to find the best


# def get_params_combinations(params):
# keys, values = zip(*params.items())
# combinations = [dict(zip(keys, v)) for v in product(*values)]
# return combinations

def train_gpu(
df_known_interactions: pd.DataFrame,
df_drugs_embeddings: pd.DataFrame,
Expand Down Expand Up @@ -328,24 +348,26 @@ def train_gpu(
best_accuracy = 0
os.makedirs("models", exist_ok=True)

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
# Train model for each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
start_fold_time = time.time()

# Send data to GPU for XGBoost
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)
# # Send data to GPU for XGBoost
send_time = time.time()
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)
# # print(f"Sending data to GPU took {time.time() - send_time}s")

# # Train XGBoost model
# model = xgb.train(params, dtrain, num_boost_round=100)
# predictions = model.predict(dtest)
# Train XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)
predictions = model.predict(dtest)

# Train Random Forest model
model = RandomForestClassifier(**params)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# # Train Random Forest model
# model = RandomForestClassifier(**params)
# model.fit(x_train, y_train)
# predictions = model.predict(x_test)

# Evaluate model
predictions_binary = np.round(predictions) # Convert probabilities to binary outputs
Expand Down Expand Up @@ -375,12 +397,14 @@ def train_gpu(
with open(save_model, "wb") as f:
pickle.dump(model, f)

# Force garbage collection for xgb on GPU:
# del dtrain, dtest, model
# gc.collect()
# os.makedirs("models", exist_ok=True)
# with open(save_model_path, "wb") as f:
# pickle.dump(model, f)

# del dtrain, dtest, model
gc.collect() # Force garbage collection for xgb on GPU
print(fold_results)
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - start_fold_time}s")
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")

log.info(f"Combination took {time.time() - combination_time}s")

Expand All @@ -402,6 +426,15 @@ def train_gpu(

def drop_similar(df: str, col_id: str, threshold: float = 0.9):
"""Given a DF remove all entities that are too similar"""
# Set the timeout duration in seconds (e.g., 60 seconds)
timeout_seconds = 120

# Create a session object with custom timeout settings
session = requests.Session()
session.timeout = timeout_seconds

# # Pass the session object to the VectorDB initialization
# vectordb = init_vectordb(session=session, recreate=False)
vectordb = init_vectordb(recreate=False)
indices_to_drop = []
# TODO: remove things that are too similar
Expand Down Expand Up @@ -461,31 +494,31 @@ def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_thre
os.makedirs(out_dir, exist_ok=True)

# Longer version:
subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20, 0.15, 0.10]
object_sim_thresholds = [1, 0.97, 0.94, 0.91, 0.88, 0.85, 0.82, 0.79, ]
subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50] # 0.45, 0.40, 0.35, 0.30, 0.25, 0.20, 0.15, 0.10
object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91, 0.90, 0.89, 0.88, 0.87] # 0.86, 0.85, 0.84, 0.82, 0.80, 0.78, 0.76, 0.74, 0.72, 0.70
# subject_sim_thresholds = [1]
# object_sim_thresholds = [1]
# params = { #XGB
# 'max_depth': 3,
# 'n_estimators': 100,
# # For XGB:
# 'learning_rate': 0.1,
# 'subsample': 0.7,
# 'colsample_bytree': 0.7,
# 'gamma': 0,
# 'reg_alpha': 0.1,
# 'reg_lambda': 1,
# }
params = {
params = { #XGB
'max_depth': None,
'n_estimators': 200,
'criterion': "log_loss",
'max_depth': None, #config.max_depth
'min_samples_split': 2,
'min_samples_leaf': 1,
'max_features': "sqrt",
'n_jobs': -1,
'class_weight': 'balanced',
# For XGB:
'learning_rate': 0.1,
'subsample': 0.8,
'colsample_bytree': 0.8,
'gamma': 0,
'reg_alpha': 0.1,
'reg_lambda': 1,
}
# params = {
# 'n_estimators': 200,
# 'criterion': "gini", # log_loss
# 'max_depth': None, #config.max_depth
# 'min_samples_split': 2,
# 'min_samples_leaf': 1,
# 'max_features': "sqrt",
# 'n_jobs': -1,
# # 'class_weight': 'balanced',
# }
scores_df = pd.DataFrame()
for subject_sim_threshold in subject_sim_thresholds:
for object_sim_threshold in object_sim_thresholds:
Expand All @@ -501,4 +534,4 @@ def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_thre
scores_df = pd.concat([scores_df, scores], ignore_index=True)

print("SCORES DF", scores_df)
scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)
scores_df.to_csv(f"{out_dir}/compare_scores_opentargets_xgb.csv", index=False)

0 comments on commit 7d3692f

Please sign in to comment.