
rename train_compute to prepare
vemonet committed Jan 29, 2024
1 parent 0d27bab commit 0f181af
Showing 9 changed files with 139 additions and 116 deletions.
4 changes: 3 additions & 1 deletion docker-compose.yml
@@ -13,6 +13,7 @@ services:
# GPU: predict-drug-target.137.120.31.160.nip.io
# CPU node3: predict-drug-target.137.120.31.148.nip.io
- VIRTUAL_PORT=8000
- NO_JAEGER=true
deploy:
resources:
reservations:
@@ -23,7 +24,8 @@ services:
shm_size: '4g'
# ports:
# - 8000:8000
command: uvicorn --host 0.0.0.0 src.api:app --reload
command: uvicorn --host 0.0.0.0 src.api:app
# --reload
networks:
- nginx

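The new NO_JAEGER flag presumably lets the container start without wiring up tracing. A minimal sketch of how such a flag could be honored on the application side, assuming an OpenTelemetry setup with the Jaeger exporter (the flag name comes from the diff; everything else here is an illustrative assumption, not the project's actual code):

import os

# Skip tracing entirely when NO_JAEGER is set (hypothetical consumer of the flag).
if os.environ.get("NO_JAEGER", "").lower() not in ("true", "1"):
    from opentelemetry import trace
    from opentelemetry.exporter.jaeger.thrift import JaegerExporter
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor

    provider = TracerProvider()
    provider.add_span_processor(BatchSpanProcessor(JaegerExporter()))
    trace.set_tracer_provider(provider)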
7 changes: 7 additions & 0 deletions prepare.sh
@@ -8,6 +8,7 @@
esm-extract esm2_t33_650M_UR50D data/download/drugbank_targets.fasta data/vectors/drugbank_targets_esm2_l33_mean --repr_layers 33 --include mean


# Cluster similar sequences with MMseqs2
wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz
tar xvfz mmseqs-linux-avx2.tar.gz
export PATH=$(pwd)/mmseqs/bin/:$PATH
@@ -17,6 +18,8 @@ mmseqs createtsv clu_rep clu_rep clusterRes.tsv

mmseqs createtsv sequenceDB sequenceDB resultsDB_clu resultsDB_clu.tsv



# Install the Molecular Transformer Embeddings for proteins
# https://github.com/mpcrlab/MolecularTransformerEmbeddings
git clone https://github.com/mpcrlab/MolecularTransformerEmbeddings.git
@@ -27,3 +30,7 @@ python embed.py --data_path=../data/download/drugbank_smiles.txt
mkdir -p ../data/vectors
mv embeddings/drugbank_smiles.npz ../data/vectors/
cd ..


echo "Generate list of known_drug_target pairs for OpenTargets"
python3 src/prepare.py
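For downstream use, the clusterRes.tsv written above maps each cluster representative to its member sequences. A small sketch of loading it with pandas, assuming the standard two-column layout (representative, member) emitted by mmseqs createtsv:

import pandas as pd

# Columns assumed to be (representative_id, member_id), the default mmseqs createtsv layout.
clusters = pd.read_csv("clusterRes.tsv", sep="\t", names=["representative", "member"])

# Map each protein sequence to its cluster representative, e.g. to keep
# near-identical targets on the same side of a train/test split.
member_to_rep = dict(zip(clusters["member"], clusters["representative"]))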
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -28,8 +28,8 @@ classifiers = [
]

dependencies = [
"trapi-predict-kit[web] >=0.2.3",
# "trapi-predict-kit[web] @ git+https://github.com/MaastrichtU-IDS/trapi-predict-kit.git",
# "trapi-predict-kit[web] >=0.2.3",
"trapi-predict-kit[web] @ git+https://github.com/MaastrichtU-IDS/trapi-predict-kit.git",
"MolecularTransformerEmbeddings @ git+https://github.com/vemonet/MolecularTransformerEmbeddings.git",
"fair-esm",
"bio",
2 changes: 1 addition & 1 deletion scripts/download_opentargets.sh
@@ -6,4 +6,4 @@
wget -r -np -nH --cut-dirs=8 -P ./data/download/opentargets/ -e robots=off -R "index.html*" https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.09/output/etl/json/targets/

# Mechanisms of action:
# wget -r -np -nH --cut-dirs=8 -P ./data/download/opentargets/ -e robots=off -R "index.html*" https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.09/output/etl/json/mechanismOfAction/
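The helpers get_jsonl_files and extract_data_from_jsonl used by src/prepare.py below are collapsed in this diff; a plausible sketch, under the assumption that the OpenTargets knownDrugsAggregated part files are JSON lines with drugId/targetId fields:

import glob
import json
from collections.abc import Iterator

def get_jsonl_files(target_directory: str) -> list[str]:
    """List the JSON-lines part files downloaded from OpenTargets."""
    return glob.glob(f"{target_directory}/*.json")

def extract_data_from_jsonl(json_file: str) -> Iterator[tuple[str, str]]:
    """Yield (ChEMBL drug ID, Ensembl target ID) pairs from one part file."""
    with open(json_file) as f:
        for line in f:
            row = json.loads(line)
            # Field names are an assumption based on the OpenTargets schema.
            yield row["drugId"], row["targetId"]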
36 changes: 18 additions & 18 deletions src/__main__.py
@@ -1,27 +1,27 @@
import typer

from src.train import compute
# from src.embeddings import compute

cli = typer.Typer()


@cli.command()
def train(
known_drug_target: str = typer.Argument(..., help="Input drug-target CSV file to train the model"),
output: str = typer.Option("output", "-o", help="Output directory to save the model"),
):
"""Train a model with input file and save output in the specified directory.
Args:
file (str): Input file for model training.
output_dir (str): Output directory to save the model.
Examples:
$ predict-dt train known_drug_target.csv -o data/my_model
"""
df_known_dt, df_drugs, df_targets = compute(known_drug_target, output)
scores = train(df_known_dt, df_drugs, df_targets, f"{output}/model.pkl")
typer.echo(f"Training done: {scores}")
# @cli.command()
# def train(
# known_drug_target: str = typer.Argument(..., help="Input drug-target CSV file to train the model"),
# output: str = typer.Option("output", "-o", help="Output directory to save the model"),
# ):
# """Train a model with input file and save output in the specified directory.

# Args:
# file (str): Input file for model training.
# output_dir (str): Output directory to save the model.

# Examples:
# $ predict-dt train known_drug_target.csv -o data/my_model
# """
# df_known_dt, df_drugs, df_targets = compute(known_drug_target, output)
# scores = train(df_known_dt, df_drugs, df_targets, f"{output}/model.pkl")
# typer.echo(f"Training done: {scores}")


@cli.command()
40 changes: 40 additions & 0 deletions src/embeddings.py
@@ -1,4 +1,6 @@
import os
import concurrent.futures
from typing import Any

import esm
import pandas as pd
@@ -266,3 +268,41 @@ def compute_target_embedding(
df.loc[len(df)] = [target_id] + embeddings
vectordb.add("target", upload_list)
return df


def compute(df_known_dt: pd.DataFrame | str, vectordb: Any, out_dir: str = "data"):
"""Compute embeddings and train model to predict interactions for a dataframe with 2 cols: drug, target"""
if isinstance(df_known_dt, str):
df_known_dt = pd.read_csv(df_known_dt)

# These functions retrieve SMILES and compute embeddings in one batch
log.info("Computing drug and target embeddings in parallel")
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# Submit the drug and target embeddings calculation to the executor
future_drugs = executor.submit(compute_drug_embedding, vectordb, set(df_known_dt["drug"].tolist()), out_dir)
future_targets = executor.submit(compute_target_embedding, vectordb, set(df_known_dt["target"].tolist()), out_dir)
# Get the results
df_drugs = future_drugs.result()
df_targets = future_targets.result()

# Save result to CSV
# df_drugs = compute_drug_embedding(vectordb, set(df_known_dt["drug"].tolist()), tmp_dir=out_dir)
df_drugs.to_csv(f"{out_dir}/drugs_embeddings.csv", index=False)
log.info(f"Drugs embeddings saved to {out_dir}")

# df_targets = compute_target_embedding(vectordb, set(df_known_dt["target"].tolist()), tmp_dir=out_dir)
df_targets.to_csv(f"{out_dir}/targets_embeddings.csv", index=False)
log.info("Targets embeddings saved to {out_dir}")

# Remove from df_known_dt entries where we don't have SMILES or AA seq
known_dt_before = len(df_known_dt)
df_known_dt = df_known_dt.merge(df_drugs[["drug"]], on="drug").merge(df_targets[["target"]], on="target")
log.info(
f"Number of known interactions before and after removing rows for which we don't have smiles/sequence: {known_dt_before} > {len(df_known_dt)}"
)
df_known_dt.to_csv(f"{out_dir}/known_drugs_targets.csv", index=False)

# Run the training
log.info("Start training")
# return train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target.pkl")
return df_known_dt, df_drugs, df_targets
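For reference, the relocated compute() can be driven directly from a CSV of known pairs, as src/prepare.py does below; a usage sketch with illustrative paths:

from src.embeddings import compute
from src.vectordb import init_vectordb

# CSV with two columns, `drug` and `target`, using CURIEs as identifiers.
df_known_dt, df_drugs, df_targets = compute(
    "data/opentargets/known_drugs_targets.csv",
    init_vectordb(),
    out_dir="data/opentargets",
)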
91 changes: 51 additions & 40 deletions src/train_compute.py → src/prepare.py
@@ -4,9 +4,10 @@

import pandas as pd
from tqdm import tqdm
from src import vectordb

from src.embeddings import compute_drug_embedding, compute_target_embedding
from src.train import train, compute
# from src.embeddings import compute_drug_embedding, compute_target_embedding
from src.embeddings import compute
from src.utils import COLLECTIONS, log, get_pref_ids
from src.vectordb import init_vectordb

@@ -15,7 +16,7 @@
# Download opentargets before running this script: ./scripts/download_opentargets.sh

# Output file path
output_file_path = "../data/opentargets/merged_parsed.csv"
# output_file_path = "../data/opentargets/merged_parsed.csv"


def get_jsonl_files(target_directory) -> list[str]:
@@ -51,45 +52,54 @@ def ensembl_to_uniprot():
# NOTE: to train the model on new data you will just need a CSV of known drug-target pairs with 2 columns: `drug` and `target`
# Use CURIEs for the drug and target IDs. Accepted namespaces: UniProtKB:, PUBCHEM.COMPOUND:, CHEMBL.COMPOUND:
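# Example of the expected input CSV (values hypothetical, for illustration only):
#   drug,target
#   CHEMBL.COMPOUND:CHEMBL25,UniProtKB:P00533
#   PUBCHEM.COMPOUND:2244,UniProtKB:P23219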

def train_opentargets(input_dir, out_dir):
def prepare_opentargets(input_dir, out_dir):
"""Compute embeddings and train the model using opentargets data."""
os.makedirs(out_dir, exist_ok=True)
# known_drug_targets = []

# ensembl_to_uniprot_dict = ensembl_to_uniprot()
# no_match = set()
# print(len(ensembl_to_uniprot_dict))

# # first extract the drug-target pairs from the opentargets json files
# json_files = get_jsonl_files(input_dir)
# for json_file in tqdm(json_files, desc="Processing files"):
# # log.info(json_file)
# for drug_id, target_id in extract_data_from_jsonl(json_file):
# try:
# known_drug_targets.append(
# {
# "drug": f"CHEMBL.COMPOUND:{drug_id}",
# "target": ensembl_to_uniprot_dict[target_id],
# }
# )
# except:
# no_match.add(target_id)

# log.info(f"No UniProt match for {len(no_match)} targets, e.g. {' ,'.join(list(no_match))}")

# df_known_dt = pd.DataFrame(known_drug_targets)
# print(df_known_dt)
# df_known_dt, df_drugs, df_targets = compute(df_known_dt, out_dir)
known_drug_targets = []

ensembl_to_uniprot_dict = ensembl_to_uniprot()
no_match = set()
print(len(ensembl_to_uniprot_dict))

# first extract the drug-target pairs from the opentargets json files
json_files = get_jsonl_files(input_dir)
for json_file in tqdm(json_files, desc="Processing files"):
# log.info(json_file)
for drug_id, target_id in extract_data_from_jsonl(json_file):
try:
known_drug_targets.append(
{
"drug": f"CHEMBL.COMPOUND:{drug_id}",
"target": ensembl_to_uniprot_dict[target_id],
}
)
except KeyError:
no_match.add(target_id)

log.info(f"No UniProt match for {len(no_match)} targets, e.g. {' ,'.join(list(no_match))}")

df_known_dt = pd.DataFrame(known_drug_targets)
known_dt_path = f"{out_dir}/known_drugs_targets.csv"

# NOTE: block to skip computing
df_known_dt = pd.read_csv(f"data/opentargets/known_drugs_targets.csv")
df_drugs = pd.read_csv(f"data/opentargets/drugs_embeddings.csv")
df_targets = pd.read_csv(f"data/opentargets/targets_embeddings.csv")
print(df_known_dt)
print(f"Known drug-targets pairs stored in {known_dt_path}")
df_known_dt.to_csv(known_dt_path)

print("Computing embeddings")
df_known_dt2, df_drugs, df_targets = compute(df_known_dt, init_vectordb(), out_dir)

scores = train(df_known_dt, df_drugs, df_targets, f"{out_dir}/model.pkl")
df_drugs.to_csv(f"{out_dir}/drugs_embeddings.csv")
df_targets.to_csv(f"{out_dir}/targets_embeddings.csv")

# NOTE: block to skip computing
# df_known_dt, df_drugs, df_targets = compute(df_known_dt, out_dir)
# df_known_dt = pd.read_csv(f"data/opentargets/known_drugs_targets.csv")
# df_drugs = pd.read_csv(f"data/opentargets/drugs_embeddings.csv")
# df_targets = pd.read_csv(f"data/opentargets/targets_embeddings.csv")
# scores = train(df_known_dt, df_drugs, df_targets, f"{out_dir}/model.pkl")

def train_drugbank():

def prepare_drugbank():
"""Compute embeddings and train the model using drugbank data."""
file_known_dt = "data/drugbank/DB_DTI_4vectordb.csv"
out_dir = "data/drugbank"
@@ -100,9 +110,10 @@ def train_drugbank():
print(convert_dict)
df_known_dt["drug"] = df_known_dt["drug"].apply(lambda curie: convert_dict[curie])
print(df_known_dt)
df_known_dt, df_drugs, df_targets = compute(df_known_dt, out_dir)
scores = train(df_known_dt, df_drugs, df_targets, f"{out_dir}/model.pkl")
df_known_dt.to_csv("data/drugbank/known_drugs_targets.csv")
# df_known_dt, df_drugs, df_targets = compute(df_known_dt, out_dir)
# scores = train(df_known_dt, df_drugs, df_targets, f"{out_dir}/model.pkl")

if __name__ == "__main__":
# train_drugbank()
train_opentargets("data/download/opentargets/knownDrugsAggregated", "data/opentargets")
# prepare_drugbank()
prepare_opentargets("data/download/opentargets/knownDrugsAggregated", "data/opentargets")
49 changes: 5 additions & 44 deletions src/train.py
@@ -5,7 +5,6 @@
import pickle
import time
import random
import concurrent.futures
from datetime import date, datetime
from itertools import product

@@ -19,7 +18,6 @@
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

from src.embeddings import compute_drug_embedding, compute_target_embedding
from src.utils import log, TrainingConfig
from src.vectordb import init_vectordb

@@ -263,44 +261,6 @@ def train(
# return agg_df.to_dict(orient="records")


def compute(df_known_dt: pd.DataFrame | str, out_dir: str = "data"):
"""Compute embeddings and train model to predict interactions for a dataframe with 2 cols: drug, target"""
if isinstance(df_known_dt, str):
df_known_dt = pd.read_csv(df_known_dt)

# These functions retrieve SMILES and compute embeddings in one batch
log.info("Computing drug and target embeddings in parallel")
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# Submit the drug and target embeddings calculation to the executor
future_drugs = executor.submit(compute_drug_embedding, vectordb, set(df_known_dt["drug"].tolist()), out_dir)
future_targets = executor.submit(compute_target_embedding, vectordb, set(df_known_dt["target"].tolist()), out_dir)
# Get the results
df_drugs = future_drugs.result()
df_targets = future_targets.result()

# Save result to CSV
# df_drugs = compute_drug_embedding(vectordb, set(df_known_dt["drug"].tolist()), tmp_dir=out_dir)
df_drugs.to_csv(f"{out_dir}/drugs_embeddings.csv", index=False)
log.info(f"Drugs embeddings saved to {out_dir}")

# df_targets = compute_target_embedding(vectordb, set(df_known_dt["target"].tolist()), tmp_dir=out_dir)
df_targets.to_csv(f"{out_dir}/targets_embeddings.csv", index=False)
log.info("Targets embeddings saved to {out_dir}")

# Remove from df_known_dt entries where we don't have SMILES or AA seq
known_dt_before = len(df_known_dt)
df_known_dt = df_known_dt.merge(df_drugs[["drug"]], on="drug").merge(df_targets[["target"]], on="target")
log.info(
f"Number of known interactions before and after removing rows for which we don't have smiles/sequence: {known_dt_before} > {len(df_known_dt)}"
)
df_known_dt.to_csv(f"{out_dir}/known_drugs_targets.csv", index=False)

# Run the training
log.info("Start training")
# return train(df_known_dt, df_drugs, df_targets, save_model=f"{out_dir}/opentarget_drug_target.pkl")
return df_known_dt, df_drugs, df_targets


################### Train with a grid of hyperparameters to find the best


@@ -329,26 +289,26 @@ def train_gpu(
"target": df_targets_embeddings,
}

print("Generate DT pairs")
print("Generate Drug-Target pairs DF")
# Get pairs and their labels: All given known drug-target pairs are 1
# we add pairs for missing drug/targets combinations as 0 (not known as interacting)
pairs, labels = generate_dt_pairs(df_known_interactions)

# TODO: Split dataset for train/test?
# X_train, X_test, y_train, y_test = train_test_split(pairs, labels, test_size=0.2, random_state=123)

print("Merging drug/target pairs and their labels in a DF")
print("Merging drug/target labels to the DF")
# Merge drug/target pairs and their labels in a DF
train_df = pd.DataFrame(
list(zip(pairs[:, 0], pairs[:, 1], labels)), columns=["drug", "target", "Class"]
)
print("Merging embeddings in the DF")
print("Merging embeddings to the DF")
# Add the embeddings to the DF
train_df = train_df.merge(embeddings["drug"], left_on="drug", right_on="drug").merge(
embeddings["target"], left_on="target", right_on="target"
)

print("Getting X and y")
print("Getting X and y data")
# X is the array of embeddings (drug+target), without other columns
# y is the array of classes/labels (0 or 1)
embedding_cols = train_df.columns.difference(["drug", "target", "Class"])
@@ -399,6 +359,7 @@ def train_gpu(
# Train xgboost model
# model = xgb.train(params, dtrain, num_boost_round=100)

# Train Random Forest model
model = RandomForestClassifier(**params)
model.fit(x_train, y_train)

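For context, generate_dt_pairs (collapsed in this diff) is described in train_gpu above as labeling known drug-target pairs 1 and every other drug/target combination 0. A self-contained sketch of that labeling scheme; the real implementation may differ:

from itertools import product

import numpy as np
import pandas as pd

def generate_dt_pairs_sketch(df_known: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Label known (drug, target) pairs 1 and all remaining combinations 0."""
    known = set(zip(df_known["drug"], df_known["target"]))
    pairs = np.array(list(product(df_known["drug"].unique(), df_known["target"].unique())))
    labels = np.array([1 if (drug, target) in known else 0 for drug, target in pairs])
    return pairs, labels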