Merge pull request #15 from PeptoneInc/onnx-runtime

Onnx runtime
PeptoneLtd · Nov 15, 2021 · 00f7d4c · 00f7d4c
2 parents ac34eed + ce9e6d1
commit 00f7d4c
Show file tree

Hide file tree

Showing 17 changed files with 581 additions and 288 deletions.
diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml
@@ -48,8 +48,12 @@ jobs:
       # Run Linter against code base #
       ################################
       - name: Lint Code Base
-        uses: github/super-linter@v4.8.1
+        uses: github/super-linter/slim@v4
         env:
           VALIDATE_ALL_CODEBASE: false
           DEFAULT_BRANCH: main
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          IGNORE_GENERATED_FILES: true
+          VALIDATE_PYTHON_BLACK: false
+          VALIDATE_PYTHON_ISORT: false
+
diff --git a/CITATION.cff b/CITATION.cff
@@ -7,7 +7,7 @@ authors:
     affiliation: "Peptone Ltd."
     orcid: ""
 title: "Attention DisOrder PredicTor"
-version: 0.1.0
+version: 0.1.1
 doi: 
 date-released: 
 url: "https://github.com/PeptoneInc/ADOPT"
diff --git a/adopt/__init__.py b/adopt/__init__.py
@@ -3,6 +3,9 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from version import version as __version__
-from data import CheZod
-from training import DisorderPred
+"@generated"
+
+from . import constants, utils
+from .data import CheZod
+from .training import DisorderPred
+from .version import version as __version__
diff --git a/adopt/constants.py b/adopt/constants.py
@@ -3,22 +3,24 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-esm_models = ["esm1v_t33_650M_UR90S_1",
-              "esm1b_t33_650M_UR50S"] 
-             # "esm_msa1b_t12_100M_UR50S"]
+esm_models = ["esm1v_t33_650M_UR90S_1", "esm1b_t33_650M_UR50S"]
+# "esm_msa1b_t12_100M_UR50S"]
 
-model_types = ['esm-1v', 'esm-1b']#, 'esm-msa']
+model_types = ["esm-1v", "esm-1b"]  # , 'esm-msa']
 
-models_dict = {"esm1v_t33_650M_UR90S_1":'esm-1v',
-              "esm1b_t33_650M_UR50S":'esm-1b'}
-              #"esm_msa1b_t12_100M_UR50S":'esm-msa'}
+models_dict = {"esm1v_t33_650M_UR90S_1": "esm-1v", "esm1b_t33_650M_UR50S": "esm-1b"}
+# "esm_msa1b_t12_100M_UR50S":'esm-msa'}
 
-train_strategies = ["train_on_cleared_1325_test_on_117_residue_split",
-                    "train_on_1325_cv_residue_split",
-                    "train_on_cleared_1325_cv_residue_split",
-                    "train_on_cleared_1325_cv_sequence_split"]
+train_strategies = [
+    "train_on_cleared_1325_test_on_117_residue_split",
+    "train_on_1325_cv_residue_split",
+    "train_on_cleared_1325_cv_residue_split",
+    "train_on_cleared_1325_cv_sequence_split",
+]
 
-strategies_dict = {"train_on_cleared_1325_test_on_117_residue_split":"cleared_residue",
-                   "train_on_1325_cv_residue_split":"residue_cv",
-                   "train_on_cleared_1325_cv_residue_split":"cleared_residue_cv",
-                   "train_on_cleared_1325_cv_sequence_split":"cleared_sequence_cv"}
+strategies_dict = {
+    "train_on_cleared_1325_test_on_117_residue_split": "cleared_residue",
+    "train_on_1325_cv_residue_split": "residue_cv",
+    "train_on_cleared_1325_cv_residue_split": "cleared_residue_cv",
+    "train_on_cleared_1325_cv_sequence_split": "cleared_sequence_cv",
+}
diff --git a/adopt/data.py b/adopt/data.py
@@ -4,8 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import pandas as pd
-import constants
-import utils
+from adopt import constants, utils
 
 
 class CheZod:
@@ -17,48 +16,73 @@ def get_chezod_raw(self):
         df_ch = pd.read_json(self.path_chezod_1325_raw)
         df_117 = pd.read_json(self.path_chezod_117_raw)
 
-        # since there are some proteins in the 1325 set, we will remove these and create a reduced dataframe for later use 
+        # since some proteins of the 117 set also appear in the 1325 set,
+        # we will remove these and create a reduced dataframe for later use
         # check the overlap, if any exists, in the 117 and 1325 sets
-        overlaps = list(set(list(df_ch['brmid'])) & set(list(df_117['brmid'])))
+        overlaps = list(set(list(df_ch["brmid"])) & set(list(df_117["brmid"])))
 
-        # Drop the overlaps from the 1325 
-        df_cleared = df_ch[~df_ch['brmid'].isin(overlaps)]
+        # Drop the overlapping entries from the 1325 set
+        df_cleared = df_ch[~df_ch["brmid"].isin(overlaps)]
         return df_cleared, df_ch, df_117
 
-    def get_train_test_sets(self,
-                            path_chezod_1325_repr,
-                            path_chezod_117_repr):
+    def get_train_test_sets(self, path_chezod_1325_repr, path_chezod_117_repr):
         # collect the path to representations according to model type and train vs test set
-        repr_path = utils.representation_path(path_chezod_1325_repr,
-                                              path_chezod_117_repr)
+        repr_path = utils.representation_path(
+            path_chezod_1325_repr, path_chezod_117_repr
+        )
 
         df_cleared, _, df_117 = self.get_chezod_raw()
 
-        # read the data 
+        # read the data
         ex_train, zed_train = {}, {}
         ex_test, zed_test = {}, {}
 
         for model_type in constants.model_types:
-            if model_type=='esm-msa':
-                msa_ind=True
+            if model_type == "esm-msa":
+                msa_ind = True
             else:
-                msa_ind=False
-
-            ex_train[model_type], zed_train[model_type] = utils.pedestrian_input(list(df_cleared['brmid']), df_cleared, repr_path[model_type]['1325'], z_col='z-score', msa=msa_ind)
+                msa_ind = False
+
+            ex_train[model_type], zed_train[model_type] = utils.pedestrian_input(
+                list(df_cleared["brmid"]),
+                df_cleared,
+                repr_path[model_type]["1325"],
+                z_col="z-score",
+                msa=msa_ind,
+            )
             # assemble the test data from the 117 set
-            ex_test[model_type], zed_test[model_type] = utils.pedestrian_input(list(df_117['brmid']), df_117, repr_path[model_type]['117'], z_col='zscore', msa=msa_ind)
+            ex_test[model_type], zed_test[model_type] = utils.pedestrian_input(
+                list(df_117["brmid"]),
+                df_117,
+                repr_path[model_type]["117"],
+                z_col="zscore",
+                msa=msa_ind,
+            )
 
-        # Quick check, whether the number of inputs is the same for all 3 model types 
+        # Quick check whether the number of inputs is the same for all model types
         for model_type in constants.model_types:
             print(model_type)
-            print('----------------------------')
-            print('training set')
-            print('input shape: ', ex_train[model_type].shape, 'output shape: ', zed_train[model_type].shape)
-            print('test set')
-            print('input shape: ', ex_test[model_type].shape, 'output shape: ', zed_test[model_type].shape)
+            print("----------------------------")
+            print("training set")
+            print(
+                "input shape: ",
+                ex_train[model_type].shape,
+                "output shape: ",
+                zed_train[model_type].shape,
+            )
+            print("test set")
+            print(
+                "input shape: ",
+                ex_test[model_type].shape,
+                "output shape: ",
+                zed_test[model_type].shape,
+            )
             print()
 
-        if ex_train[constants.model_types[0]].shape[0]==ex_train[constants.model_types[1]].shape[0]:#==ex_train[constants.model_types[2]].shape[0]:
-            print('The number of inputs is the same for each model type')
-
-        return ex_train, zed_train, ex_test, zed_test
+        if (
+            ex_train[constants.model_types[0]].shape[0]
+            == ex_train[constants.model_types[1]].shape[0]
+        ):  # ==ex_train[constants.model_types[2]].shape[0]:
+            print("The number of inputs is the same for each model type")
+
+        return ex_train, zed_train, ex_test, zed_test
diff --git a/adopt/embedding.py b/adopt/embedding.py
@@ -3,34 +3,60 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+import getopt
 import subprocess
 import sys
-import getopt
 from pathlib import Path
-import constants
+
+from adopt import constants
+
 
 # extract residue level representations of each protein sequence in the fasta file
 def get_representations(fasta_file, repr_dir):
     for esm_model in constants.esm_models:
-        model_dir = str(repr_dir)+"/"+constants.models_dict[esm_model]
+        model_dir = str(repr_dir) + "/" + constants.models_dict[esm_model]
         Path(str(model_dir)).mkdir(parents=True, exist_ok=True)
-        if 'esm_msa' in esm_model:
-            bashCommand = "python ../esm/extract.py "+str(esm_model)+" "+str(fasta_file)+" "+model_dir+ " --repr_layers 12 --include per_tok" # todo fasta_file->msa_fasta_file
+        if "esm_msa" in esm_model:
+            bashCommand = (
+                "python ../esm/extract.py "
+                + str(esm_model)
+                + " "
+                + str(fasta_file)
+                + " "
+                + model_dir
+                + " --repr_layers 12 --include per_tok"
+            )  # todo fasta_file->msa_fasta_file
         else:
-            bashCommand = "python ../esm/extract.py "+str(esm_model)+" "+str(fasta_file)+" "+model_dir+ " --repr_layers 33 --include per_tok"
+            bashCommand = (
+                "python ../esm/extract.py "
+                + str(esm_model)
+                + " "
+                + str(fasta_file)
+                + " "
+                + model_dir
+                + " --repr_layers 33 --include per_tok"
+            )
         process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
         output, error = process.communicate()
 
 
 def main(argv):
     try:
-        opts, args = getopt.getopt(argv, "hf:r:", ["fasta_file=", "repr_dir="]) 
+        opts, args = getopt.getopt(argv, "hf:r:", ["fasta_file=", "repr_dir="])
     except getopt.GetoptError:
-        print('usage: embedding.py -f <fasta_file_path=> -r <residue_level_representation_dir>')
+        print(
+            "usage: embedding.py"
+            "-f <fasta_file_path>"
+            "-r <residue_level_representation_dir>"
+        )
         sys.exit(2)
     for opt, arg in opts:
-        if opt == '-h':
-            print('usage: embedding.py -f <fasta_files_dir> -r <residue_level_representation_dir>')
+        if opt == "-h":
+            print(
+                "usage: embedding.py"
+                "-f <fasta_file_path>"
+                "-r <residue_level_representation_dir>"
+            )
             sys.exit()
         elif opt in ("-f", "--fasta_dir"):
             fasta_dir = arg
@@ -39,5 +65,6 @@ def main(argv):
 
     get_representations(fasta_dir, repr_dir)
 
+
 if __name__ == "__main__":
-    main(sys.argv[1:])
+    main(sys.argv[1:])