diff --git a/CITATION.cff b/CITATION.cff index aa94975..e4678ab 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,4 +1,4 @@ -cff-version: 0.2.1 +cff-version: 0.3.0 message: "If you use this software, please cite it as below." authors: - given-names: "Kamil Tamiola" @@ -7,7 +7,7 @@ authors: affiliation: "Peptone Ltd." orcid: "" title: "Attention based DisOrder PredicTor" -version: 0.2.1 +version: 0.3.0 doi: date-released: url: "https://github.com/PeptoneInc/ADOPT" diff --git a/README.md b/README.md index 4e4a82d..f4c7d81 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ADOPT has been introduced in our paper [ADOPT: intrinsic protein disorder predic -Our disorder predictor is made up of two main blocks, namely: a **self-supervised encoder** and a **supervised disorder predictor**. We use [Facebook’s Evolutionary Scale Modeling (ESM)](https://github.com/facebookresearch/esm) library to extract dense residue evel representations, which feed the supervised machine learning based predictor. +Our disorder predictor is made up of two main blocks, namely: a **self-supervised encoder** and a **supervised disorder predictor**. We use [Facebook’s Evolutionary Scale Modeling (ESM)](https://github.com/facebookresearch/esm) library to extract dense residue level representations, which feed the supervised machine learning based predictor. The ESM library exploits a set of deep Transformer encoder models, which processes character sequences of amino acids as inputs. 
@@ -54,7 +54,7 @@ Install the **adopt** package: Clone the ADOPT repository, go to the ADOPT directory and run ```bash -$ python setup.py install +python setup.py install ``` Then, you can predict the intrinsic disorder of each reesidue in a protein sequence, as follows: @@ -82,6 +82,7 @@ predicted_z_scores = z_score_pred.get_z_score(representation) ### Scripts The [scripts](scripts) directory contains: + * [inference](scripts/adopt_inference.sh) script to predict, in bulk, the disorder of each residue in each protein sequence reported in a FASTA file, with ADOPT where you need to specify: - `NEW_PROT_FASTA_FILE_PATH` defining your FASTA file path - `NEW_PROT_RES_REPR_DIR_PATH` defining where the residue level representations will be extracted @@ -91,27 +92,28 @@ The [scripts](scripts) directory contains: ### Notebooks The [notebooks](notebooks) directory contains: -* [disorder prediction](notebooks/adopt_disorder_prediction.ipynb) notebook + +* [disorder prediction](notebooks/adopt_disorder_prediction.ipynb) notebook * [multi-head attention weights visualisation](notebooks/adopt_attention_viz.ipynb) notebook ### Compute residue level representations -In order to predict the **Z score** related to each residue in a protein sequence, we have to compute the residue level representations, extracted from the pretrained model. +In order to predict the **Z score** related to each residue in a protein sequence, we have to compute the residue level representations, extracted from the pretrained model. In the ADOPT directory run: ```bash -$ python embedding.py -f - -r +python embedding.py -f \ + -r ``` Where: + * `-f` defines the FASTA file containing the proteins for which you want to compute the intrinsic disorder * `-r` defines the path where you want to save the residue level representations A subdirectory containing the residue level representation extracted from each pre-trained model available will be created under both the `residue_level_representation_dir`. 
- ### Predict intrinsic disorder with ADOPT Once we have extracted the residue level representations we can predict the intrinsic disorder (Z score). @@ -119,17 +121,18 @@ Once we have extracted the residue level representations we can predict the intr In the ADOPT directory run: ```bash -$ python inference.py -s - -m - -f - -r - -p +python inference.py -s \ + -m \ + -f \ + -r \ + -p ``` Where: -* `-s` defines the **training strategies** defined belowe + +* `-s` defines the **training strategies** defined below +* `-m` defines the pre-trained model we want to use. We suggest you use the `esm-1b` model. * `-f` defines the FASTA file containing the proteins for which you want to compute the intrinsic disorder -* `-m` defines the residue level representation of the pre-trained models we want to use. We suggest you use the `esm-1b` model. * `-r` defines the path where you've already saved the residue level representations * `-p` defines the path where you want the Z scores to be saved @@ -142,7 +145,6 @@ The output is a `.json` file contains the Z scores related to each residue of ea | `train_on_cleared_1325_cv_residue_split`| `esm-1b` and `esm-1v` | | `train_on_cleared_1325_cv_sequence_split`| `esm-1b` and `esm-1v` | - ### Train ADOPT disorder predictor Once we have extracted the residue level representations of the protein for which we want to predict the intrinsic disorder (Z score), we can train the predictor. 
@@ -152,21 +154,21 @@ Once we have extracted the residue level representations of the protein for whic In the ADOPT directory run: ```bash -$ python training.py -s - -t - -e - -r - -p +python training.py -s \ + -t \ + -e \ + -r \ + -p ``` Where: + * `-s` defines the **training strategies** defined above * `-t` defines the JSON containing the proteins we want to use as *training set* * `-e` defines the JSON containing the proteins we want to use as *test set* * `-r` defines the path where we saved the residue level representations of the proteins in the *training set* * `-p` defines the path where we saved the residue level representations of the proteins in the *test set* - ## Citations If you use this work in your research, please cite the the relevant paper: @@ -175,8 +177,6 @@ If you use this work in your research, please cite the the relevant paper: @article{redl2021adopt} ``` - ## Licence This source code is licensed under the MIT license found in the `LICENSE` file in the root directory of this source tree. - diff --git a/adopt/embedding.py b/adopt/embedding.py index 04d20a4..0cebf28 100644 --- a/adopt/embedding.py +++ b/adopt/embedding.py @@ -3,14 +3,37 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import getopt +import argparse import subprocess -import sys from pathlib import Path from adopt import constants +def create_parser(): + parser = argparse.ArgumentParser( + description="Extract residue level representations" + ) + + parser.add_argument( + "-f", + "--fasta_path", + type=str, + metavar="", + required=True, + help="FASTA file containing the proteins for which you want to compute the intrinsic disorder", + ) + parser.add_argument( + "-r", + "--repr_dir", + type=str, + metavar="", + required=True, + help="Residue level representation directory", + ) + return parser + + # extract residue level representations of each protein sequence in the fasta file def get_representations(fasta_file, repr_dir): for esm_model in constants.esm_models: @@ -40,31 +63,7 @@ def get_representations(fasta_file, repr_dir): output, error = process.communicate() -def main(argv): - try: - opts, args = getopt.getopt(argv, "hf:r:", ["fasta_file=", "repr_dir="]) - except getopt.GetoptError: - print( - "usage: embedding.py" - "-f " - "-r " - ) - sys.exit(2) - for opt, arg in opts: - if opt == "-h": - print( - "usage: embedding.py" - "-f " - "-r " - ) - sys.exit() - elif opt in ("-f", "--fasta_dir"): - fasta_dir = arg - elif opt in ("-r", "--repr_dir"): - repr_dir = arg - - get_representations(fasta_dir, repr_dir) - - if __name__ == "__main__": - main(sys.argv[1:]) + parser = create_parser() + args = parser.parse_args() + get_representations(args.fasta_path, args.repr_dir) diff --git a/adopt/inference.py b/adopt/inference.py index 9caf5f0..e67f94a 100644 --- a/adopt/inference.py +++ b/adopt/inference.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import getopt +import argparse import os import sys @@ -14,6 +14,54 @@ from adopt import constants, utils +def create_parser(): + parser = argparse.ArgumentParser( + description="Predict the intrinsic disorder (Z score)" + ) + + parser.add_argument( + "-s", + "--train_strategy", + type=str, + metavar="", + required=True, + help="Training strategies", + ) + parser.add_argument( + "-m", + "--model_type", + type=str, + metavar="", + required=True, + help="pre-trained model we want to use", + ) + parser.add_argument( + "-f", + "--fasta_path", + type=str, + metavar="", + required=True, + help="FASTA file containing the proteins for which you want to compute the intrinsic disorder", + ) + parser.add_argument( + "-r", + "--repr_dir", + type=str, + metavar="", + required=True, + help="Residue level representation directory", + ) + parser.add_argument( + "-p", + "--pred_z_scores_path", + type=str, + metavar="", + required=True, + help="Path where you want the Z scores to be saved", + ) + return parser + + class ZScorePred: def __init__(self, strategy, model_type): self.strategy = strategy @@ -93,73 +141,35 @@ def get_z_score_from_fasta( df_results.to_json(predicted_z_scores_path, orient="records") -def main(argv): - try: - opts, args = getopt.getopt( - argv, - "hs:m:f:r:p:", - [ - "train_strategy=", - "model_type=", - "infer_fasta_file=", - "infer_repr_dir=", - "pred_z_scores_file", - ], - ) - except getopt.GetoptError: +def main(args): + if args.train_strategy not in constants.train_strategies: + print("The training strategies are:") + print(*constants.train_strategies, sep="\n") + sys.exit(2) + + if (args.model_type not in constants.model_types) and ( + args.model_type != "combined" + ): + print("The pre-trained models are:") + print(*constants.model_types, sep="\n") + print("combined") + sys.exit(2) + + if (args.train_strategy != "train_on_cleared_1325_test_on_117_residue_split") and ( + args.model_type == "combined" + ): print( - "usage: inference.py" - "-s " - "-m " - 
"-f " - "-r " - "-p " + "Only the train_on_cleared_1325_test_on_117_residue_split strategy " + "is allowed with the model" ) sys.exit(2) - for opt, arg in opts: - if opt == "-h": - print( - "usage: inference.py" - "-s " - "-m " - "-f " - "-r " - "-p " - ) - sys.exit() - elif opt in ("-s", "--train_strategy"): - train_strategy = arg - if train_strategy not in constants.train_strategies: - print("The training strategies are:") - print(*constants.train_strategies, sep="\n") - sys.exit(2) - elif opt in ("-m", "--model_type"): - model_type = arg - if (model_type not in constants.model_types) and (model_type != "combined"): - print("The pre-trained models are:") - print(*constants.model_types, sep="\n") - print("combined") - sys.exit(2) - if ( - train_strategy != "train_on_cleared_1325_test_on_117_residue_split" - ) and (model_type == "combined"): - print( - "Only the train_on_cleared_1325_test_on_117_residue_split strategy" - "is allowed with the model" - ) - sys.exit() - elif opt in ("-f", "--infer_fasta_file"): - infer_fasta_file = arg - elif opt in ("-r", "--infer_repr_dir"): - infer_repr_dir = arg - elif opt in ("-p", "--pred_z_scores_file"): - pred_z_scores_file = arg - - z_score_pred = ZScorePred(train_strategy, model_type) - z_score_pred.get_z_score_from_fasta( - infer_fasta_file, infer_repr_dir, pred_z_scores_file - ) if __name__ == "__main__": - main(sys.argv[1:]) + parser = create_parser() + args = parser.parse_args() + main(args) + z_score_pred = ZScorePred(args.train_strategy, args.model_type) + z_score_pred.get_z_score_from_fasta( + args.fasta_path, args.repr_dir, args.pred_z_scores_path + ) diff --git a/adopt/training.py b/adopt/training.py index fb90d76..53b99f9 100644 --- a/adopt/training.py +++ b/adopt/training.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-import getopt +import argparse import sys import numpy as np @@ -13,7 +13,52 @@ from adopt import CheZod, constants, utils + # disorder predictor training +def create_parser(): + parser = argparse.ArgumentParser(description="Train ADOPT") + + parser.add_argument( + "-s", + "--train_strategy", + type=str, + metavar="", + required=True, + help="Training strategies", + ) + parser.add_argument( + "-t", + "--train_json_file", + type=str, + metavar="", + required=True, + help="JSON file containing the proteins we want to use as training set", + ) + parser.add_argument( + "-e", + "--test_json_file", + type=str, + metavar="", + required=True, + help="JSON file containing the proteins we want to use as test set", + ) + parser.add_argument( + "-r", + "--train_repr_dir", + type=str, + metavar="", + required=True, + help="Training set residue level representation directory", + ) + parser.add_argument( + "-p", + "--test_repr_dir", + type=str, + metavar="", + required=True, + help="Test set residue level representation directory", + ) + return parser class DisorderPred: @@ -331,68 +376,28 @@ def cleared_sequence_cv(self): ) -def main(argv): - try: - opts, args = getopt.getopt( - argv, - "hs:t:e:r:p:", - [ - "train_strategy=", - "train_json_file=", - "test_json_file=", - "train_repr_dir=", - "test_repr_dir=", - ], - ) - except getopt.GetoptError: - print( - "usage: training.py" - "-s " - "-t " - "-e " - "-r " - "-p " - ) +def main(args): + if args.train_strategy not in constants.train_strategies: + print("The training strategies are:") + print(*constants.train_strategies, sep="\n") sys.exit(2) - for opt, arg in opts: - if opt == "-h": - print( - "usage: training.py" - "-s " - "-t " - "-e " - "-r " - "-p " - ) - sys.exit() - elif opt in ("-s", "--train_strategy"): - train_strategy = arg - if train_strategy not in constants.train_strategies: - print("The training strategies are:") - print(*constants.train_strategies, sep="\n") - sys.exit(2) - elif opt in ("-t", 
"--train_json_file"): - train_sequences = arg - elif opt in ("-e", "--test_json_file"): - test_sequences = arg - elif opt in ("-r", "--train_repr_dir"): - train_repr_dir = arg - elif opt in ("-p", "--test_repr_dir"): - test_repr_dir = arg + +if __name__ == "__main__": + parser = create_parser() + args = parser.parse_args() + main(args) disorder_pred = DisorderPred( - train_sequences, test_sequences, train_repr_dir, test_repr_dir + args.train_json_file, + args.test_json_file, + args.train_repr_dir, + args.test_repr_dir, ) - - if train_strategy == "train_on_cleared_1325_test_on_117_residue_split": + if args.train_strategy == "train_on_cleared_1325_test_on_117_residue_split": disorder_pred.cleared_residue() - elif train_strategy == "train_on_1325_cv_residue_split": + elif args.train_strategy == "train_on_1325_cv_residue_split": disorder_pred.residue_cv() - elif train_strategy == "train_on_cleared_1325_cv_residue_split": + elif args.train_strategy == "train_on_cleared_1325_cv_residue_split": disorder_pred.cleared_residue_cv() else: disorder_pred.cleared_sequence_cv() - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/adopt/version.py b/adopt/version.py index 7a1faf3..ef9fb5e 100644 --- a/adopt/version.py +++ b/adopt/version.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -version = "0.2.1" +version = "0.3.0"