
Commit

Merge branch 'main' of github.com:MaastrichtU-IDS/predict-drug-target
vemonet committed Dec 18, 2023
2 parents 3596456 + db41c09 commit 74a6a68
Showing 9 changed files with 396 additions and 215 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -0,0 +1,6 @@
data
.venv
__pycache__/
.pytest_cache
.ipynb_checkpoints/
*.log
4 changes: 3 additions & 1 deletion Dockerfile
@@ -4,7 +4,9 @@ FROM ${BASE_IMAGE}

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Europe/Amsterdam \
PYTHONUNBUFFERED=1
PYTHONUNBUFFERED=1 \
JOBLIB_TEMP_FOLDER=/app/data/tmp
# ^ fixes "no space left on device" errors

WORKDIR /app

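The JOBLIB_TEMP_FOLDER line above matters because joblib (used by scikit-learn) memmaps large arrays for its worker processes into /tmp by default, which is often a small tmpfs inside containers. A minimal sketch of the behavior the variable controls, reusing the path from the Dockerfile (array sizes are illustrative):

import os
os.environ["JOBLIB_TEMP_FOLDER"] = "/app/data/tmp"  # same path as in the Dockerfile

import numpy as np
from joblib import Parallel, delayed

big = np.zeros((20_000, 1_000))  # large enough to trigger worker memmapping
means = Parallel(n_jobs=2, max_nbytes="1M")(
    delayed(np.mean)(big[i::2]) for i in range(2)
)
print(means)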
4 changes: 1 addition & 3 deletions docker-compose.yml
@@ -6,9 +6,7 @@ services:
restart: unless-stopped
volumes:
- ./:/app
# - ./data:/app/data
# - ./models:/app/models
# - ./MolecularTransformerEmbeddings:/app/MolecularTransformerEmbeddings
- /mnt/um-share-drive/deep-purpose:/app/data/shared
environment:
- VIRTUAL_HOST=predict-drug-target.137.120.31.160.nip.io
- LETSENCRYPT_HOST=predict-drug-target.137.120.31.160.nip.io
5 changes: 3 additions & 2 deletions src/__main__.py
@@ -1,6 +1,6 @@
import typer

from src.train import compute_and_train
from src.train import compute, train as train_model

cli = typer.Typer()

@@ -19,7 +19,8 @@ def train(
Examples:
$ predict-dt train known_drug_target.csv -o data/my_model
"""
scores = compute_and_train(known_drug_target, output)
df_known_dt, df_drugs, df_targets = compute(known_drug_target, output)
scores = train_model(df_known_dt, df_drugs, df_targets, f"{output}/model.pkl")
typer.echo(f"Training done: {scores}")


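For reference, a minimal self-contained Typer command mirroring the snippet above (the -o/--output option name is assumed from the docstring example):

import typer

cli = typer.Typer()

@cli.command()
def train(known_drug_target: str, output: str = typer.Option("data", "-o", "--output")):
    """Train a model from a CSV of known drug-target pairs."""
    typer.echo(f"Would train from {known_drug_target} into {output}")

if __name__ == "__main__":
    cli()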
145 changes: 140 additions & 5 deletions src/train.py
@@ -1,15 +1,16 @@
"""Common functions for training the models"""
import numbers
import os
import pickle
import random
import concurrent.futures
from datetime import date
from datetime import date, datetime

import numpy as np
import pandas as pd
import torch
from sklearn import ensemble, metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
from xgboost import XGBClassifier

from src.embeddings import compute_drug_embedding, compute_target_embedding
@@ -20,6 +21,8 @@


def generate_dt_pairs(dt_df):
"""Get pairs and their labels: All given known drug-target pairs are 1,
We add pairs for missing drug/targets combinations as 0 (not known as interacting)"""
dtKnown = {tuple(x) for x in dt_df[["drug", "target"]].values}
pairs = []
labels = []
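The rest of generate_dt_pairs is collapsed in this diff; a minimal sketch of the logic the new docstring describes, assuming the drug/target column names shown above:

import numpy as np
import pandas as pd

def generate_dt_pairs_sketch(dt_df: pd.DataFrame):
    """Known pairs are labeled 1; every other drug x target combination gets 0."""
    known = {tuple(x) for x in dt_df[["drug", "target"]].values}
    pairs, labels = [], []
    for drug in dt_df["drug"].unique():
        for target in dt_df["target"].unique():
            pairs.append((drug, target))
            labels.append(1 if (drug, target) in known else 0)
    return np.array(pairs), np.array(labels)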
@@ -94,6 +97,10 @@ def crossvalid(train_df, test_df, clfs, run_index, fold_index):
X_new = test_df[features_cols].values
y_new = test_df["Class"].values.ravel()

print("FIT X Y")
print(X)
print(y)

results = pd.DataFrame()
for name, clf in clfs:
clf.fit(X, y)
@@ -221,7 +228,8 @@ def train(
objective='binary:logistic', # For binary classification
n_jobs=-1,
random_state=42,
tree_method='gpu_hist', # Use GPU optimized histogram algorithm
tree_method='hist', # Histogram algorithm; enable the commented device line below for GPU
# device='gpu',
)

# clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]
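The tree_method change above tracks XGBoost 2.0, where tree_method='gpu_hist' was deprecated in favor of tree_method='hist' combined with a device argument; a short sketch of both variants (assuming XGBoost >= 2.0):

from xgboost import XGBClassifier

clf_cpu = XGBClassifier(tree_method="hist")                 # histogram algorithm on CPU
clf_gpu = XGBClassifier(tree_method="hist", device="cuda")  # same algorithm on the GPU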
@@ -249,7 +257,7 @@ def train(
# return agg_df.to_dict(orient="records")


def compute_and_train(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):
def compute(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):
"""Compute embeddings and train model to predict interactions for a dataframe with 2 cols: drug, target"""
if isinstance(df_known_dt, str):
df_known_dt = pd.read_csv(df_known_dt)
@@ -283,4 +291,131 @@ def compute_and_train(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):

# Run the training
log.info("Start training")
return train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target.pkl")
# return train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target.pkl")
return df_known_dt, df_drugs, df_targets


################### Train with a grid of hyperparameters to find the best combination

def train_grid(
df_known_interactions: pd.DataFrame,
df_drugs_embeddings: pd.DataFrame,
df_targets_embeddings: pd.DataFrame,
params_grid: dict[str, int | float],
save_model: str = "models/drug_target.pkl",
):
"""Train and compare a grid of hyperparameters
Training takes 3 dataframes as input, ideally use CURIEs for drug/target IDs:
1. a df with known drug-target interactions (2 cols: drug, target)
2. a df with drug embeddings: drug col + 512 cols for embeddings
3. a df with target embeddings: target col + 1280 cols for embeddings
"""
time_start = datetime.now()
embeddings = {
"drug": df_drugs_embeddings,
"target": df_targets_embeddings,
}

# Get pairs and their labels: All given known drug-target pairs are 1
# we add pairs for missing drug/targets combinations as 0 (not known as interacting)
pairs, labels = generate_dt_pairs(df_known_interactions)

# TODO: Split dataset for train/test?
# X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=123)

# Merge drug/target pairs and their labels in a DF
train_df = pd.DataFrame(
list(zip(pairs[:, 0], pairs[:, 1], labels)), columns=["drug", "target", "Class"]
)
# Add the embeddings to the DF
train_df = train_df.merge(embeddings["drug"], left_on="drug", right_on="drug").merge(
embeddings["target"], left_on="target", right_on="target"
)

# X is the array of embeddings (drug+target), without other columns
# y is the array of classes/labels (0 or 1)
embedding_cols = train_df.columns.difference(["drug", "target", "Class"])
X = train_df[embedding_cols].values
y = train_df["Class"].values.ravel()
print(f"Features count: {len(embedding_cols)}")
print(X)
print(y)

ndrugs = len(embeddings["drug"])
ntargets = len(embeddings["target"])
_unique, counts = np.unique(labels, return_counts=True)
ndrugtargets = counts[1]
log.info(f"Training based on {ndrugtargets} Drug-Targets known interactions: {ndrugs} drugs | {ntargets} targets")
random_state = 123  # Or 42?
n_jobs = 2  # Or -1

xgb_model = XGBClassifier(
objective='binary:logistic',
n_jobs=-1,
random_state=random_state,
tree_method='hist', # Histogram algorithm, run on the GPU via device='cuda' below
device='cuda',
)

# Create a KFold object for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params_grid, scoring='f1', cv=kf, n_jobs=n_jobs)
# or scoring='accuracy'

log.info("Fitting grid search")
grid_search.fit(X, y)

# Without CV:
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=params_grid, scoring='accuracy', cv=5, n_jobs=n_jobs)

# Perform grid search on the training data
# grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
log.info(f"Best Parameters: {grid_search.best_params_}")
log.info(f"Best f1 score: {grid_search.best_score_}")

# Creating DataFrame from cv_results
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

# Evaluate on test data
best_model = grid_search.best_estimator_

# test_accuracy = best_model.score(X_test, y_test)
# log.info(f"Test Accuracy: {test_accuracy}")

log.info(f"⚡ Training took {datetime.now() - time_start}")

# TODO: return a df with aggregated scores for each grid combination?


# # clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]
# clfs = [("XGBoost", xgb_model)] # "Random Forest", rf_model

# n_seed = 100
# n_fold = params_grid.cv_nfold
# n_run = 2
# n_proportion = 1

# results_file = f"./data/results/drugbank_drug_targets_scores_{today}.csv"
# agg_results_file = f"./data/results/drugbank_drug_targets_agg_{today}.csv"

# # Run training
# all_scores_df = kfold_cv(pairs, labels, embeddings, clfs, n_run, n_fold, n_proportion, n_seed)
# all_scores_df.to_csv(results_file, sep=",", index=False)

# agg_df = all_scores_df.groupby(["method", "run"]).mean().groupby("method").mean()
# agg_df.to_csv(agg_results_file, sep=",", index=False)
# log.info("Aggregated results:")
# print(agg_df)


os.makedirs("models", exist_ok=True)
with open(save_model, "wb") as f:
pickle.dump(best_model, f) #rf_model

return results_df
# return agg_df.to_dict(orient="records")

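A hypothetical end-to-end call of the refactored API (compute() returns the three dataframes per the diff above; the positional save path for train() is assumed from the CLI call in src/__main__.py):

from src.train import compute, train

df_known_dt, df_drugs, df_targets = compute("data/known_drug_target.csv", out_dir="data")
scores = train(df_known_dt, df_drugs, df_targets, "data/model.pkl")
print(scores)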
135 changes: 135 additions & 0 deletions src/train_compare.py
@@ -0,0 +1,135 @@
import os
import pandas as pd

from src.train import train, train_grid
from src.utils import log, TrainingConfig
from src.vectordb import init_vectordb


# NOTE: script to test various configs while training the model.
# For speed, it DOES NOT compute embeddings for all drugs and targets;
# it expects the embeddings to already be generated in a CSV (by train_compute.py).


def drop_similar(df: pd.DataFrame, col_id: str, threshold: float = 0.9):
"""Given a DF, remove all entities that are too similar"""
vectordb = init_vectordb(recreate=False)
indices_to_drop = []
# TODO: remove things that are too similar
# in df_drugs and df_targets
for i, row in df.iterrows():
if row[col_id] in indices_to_drop:
# If we already plan to drop this row, skip it
continue
# The column ID and the collection are the same (drug or target)
ent_matching = vectordb.get(col_id, row[col_id])
if ent_matching:
# Find vectors that are similar to the vector of the given drug ID
search_res = vectordb.search(col_id, ent_matching[0].vector)

for res in search_res:
if threshold < res.score < 1:
indices_to_drop.append(res.payload['id'])
# print(f"PLEASE REMOVE {res.score}")
df = df[df[col_id] != res.payload['id']]
# else:
# print(f"DONT REMOVE {res.score}")
# print(f"{res.payload['id']}: {res.score} ({res.id})")
else:
print(f"No match for {row[col_id]}")
log.info(f"DROPPING {col_id}: {len(indices_to_drop)}")
# return df.drop(indices_to_drop)
return df



def exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold: float = 1, object_sim_threshold: float = 1):
"""Exclude similarities given thresholds, and run training on grid"""
os.makedirs(out_dir, exist_ok=True)

print(f"🔨 Training for {subject_sim_threshold} - {object_sim_threshold}")

# Precomputed embeddings
df_known_dt = pd.read_csv(f"{input_dir}/known_drugs_targets.csv")
df_drugs = pd.read_csv(f"{input_dir}/drugs_embeddings.csv")
df_targets = pd.read_csv(f"{input_dir}/targets_embeddings.csv")

log.info(f"DF LENGTH BEFORE DROPPING: {len(df_drugs)} drugs and {len(df_targets)} targets, and {len(df_known_dt)} known pairs")

if subject_sim_threshold < 1:
df_drugs = drop_similar(df_drugs, "drug", subject_sim_threshold)

if object_sim_threshold < 1:
df_targets = drop_similar(df_targets, "target", object_sim_threshold)

df_known_dt = df_known_dt.merge(df_drugs[["drug"]], on="drug").merge(df_targets[["target"]], on="target")

log.info(f"DF LENGTH AFTER DROPPING: {len(df_drugs)} drugs and {len(df_targets)} targets, and {len(df_known_dt)} known pairs")

score = train_grid(df_known_dt, df_drugs, df_targets, param_grid, f"{out_dir}/model_drug_target_{subject_sim_threshold}_{object_sim_threshold}.pkl")

# score_df = train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target_nosim.pkl", config=config)
# score_df.insert(0, 'Drug sim threshold', config.subject_sim_threshold)
# score_df.insert(1, 'Target sim threshold', config.object_sim_threshold)
# score_df.insert(2, 'CV nfold', config.cv_nfold)
# score_df.insert(3, 'Max depth', config.max_depth)

return score


def train_grid_exclude_sim(input_dir, out_dir):
"""Define the similarities thresholds and params grid, then run training"""
os.makedirs(out_dir, exist_ok=True)
# Shorter version for starting
subject_sim_thresholds = [1, 0.99]
object_sim_thresholds = [1, 0.99]
param_grid = {
'max_depth': [3, 4],
'learning_rate': [0.1, 0.01],
'subsample': [0.7, 0.8],
'colsample_bytree': [0.7, 0.8],
'gamma': [0, 1],
'reg_alpha': [0, 0.1],
'reg_lambda': [1, 2],
'n_estimators': [100, 200],
}

# Longer version
# subject_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
# object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.95, 0.90]
# param_grid = {
# 'max_depth': [3, 4, 5, 6],
# 'learning_rate': [0.1, 0.01, 0.05],
# 'subsample': [0.7, 0.8, 0.9],
# 'colsample_bytree': [0.7, 0.8, 0.9],
# 'gamma': [0, 1, 2],
# 'reg_alpha': [0, 0.1, 0.5],
# 'reg_lambda': [1, 2, 5],
# 'n_estimators': [100, 200, 300],
# }

scores_df = pd.DataFrame()
for subject_sim_threshold in subject_sim_thresholds:
for object_sim_threshold in object_sim_thresholds:
score = exclude_sim_and_train(input_dir, out_dir, param_grid, subject_sim_threshold, object_sim_threshold)
scores_df = pd.concat([scores_df, pd.DataFrame([{
'subject_sim_threshold': subject_sim_threshold,
'object_sim_threshold': object_sim_threshold,
'score': score,
}])], ignore_index=True)

# score_list = []
# for config in configs:
# score_list.append(train_not_similar(input_dir, out_dir, config))
# print(score_list)
# combined_df = pd.concat(score_list)
# combined_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)

print("SCORES DF", scores_df)
scores_df.to_csv(f"{out_dir}/compare_scores.csv", index=False)



if __name__ == "__main__":
train_grid_exclude_sim("data/opentargets", "data/grid")
# train_not_similar("data/opentargets", "data/opentargets_not_similar")
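drop_similar above depends on the project's vectordb; as a standalone alternative for the same deduplication idea, a hedged sketch using scikit-learn's cosine_similarity instead (drop_similar_offline is a hypothetical helper; threshold semantics are assumed to match):

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def drop_similar_offline(df: pd.DataFrame, col_id: str, threshold: float = 0.9) -> pd.DataFrame:
    """Keep the first row of any group whose embedding similarity exceeds the threshold."""
    emb = df.drop(columns=[col_id]).values
    sim = cosine_similarity(emb)
    np.fill_diagonal(sim, 0.0)  # ignore self-similarity
    keep, dropped = [], set()
    for i in range(len(df)):
        if i in dropped:
            continue
        keep.append(i)
        dropped.update(np.flatnonzero(sim[i] > threshold).tolist())
    return df.iloc[keep]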

