Skip to content

Commit

Permalink
Merge pull request #15 from PeptoneInc/onnx-runtime
Browse files Browse the repository at this point in the history
Onnx runtime
  • Loading branch information
CFisicaro authored Nov 15, 2021
2 parents ac34eed + ce9e6d1 commit 00f7d4c
Show file tree
Hide file tree
Showing 17 changed files with 581 additions and 288 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/linter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,12 @@ jobs:
# Run Linter against code base #
################################
- name: Lint Code Base
uses: github/super-linter@v4.8.1
uses: github/super-linter/slim@v4
env:
VALIDATE_ALL_CODEBASE: false
DEFAULT_BRANCH: main
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
IGNORE_GENERATED_FILES: true
VALIDATE_PYTHON_BLACK: false
VALIDATE_PYTHON_ISORT: false

2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ authors:
affiliation: "Peptone Ltd."
orcid: ""
title: "Attention DisOrder PredicTor"
version: 0.1.0
version: 0.1.1
doi:
date-released:
url: "https://github.com/PeptoneInc/ADOPT"
9 changes: 6 additions & 3 deletions adopt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from version import version as __version__
from data import CheZod
from training import DisorderPred
"@generated"

from . import constants, utils
from .data import CheZod
from .training import DisorderPred
from .version import version as __version__
32 changes: 17 additions & 15 deletions adopt/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,24 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Short model-type label for every ESM checkpoint that is currently enabled.
# The MSA checkpoint ("esm_msa1b_t12_100M_UR50S" -> "esm-msa") is disabled.
models_dict = {
    "esm1v_t33_650M_UR90S_1": "esm-1v",
    "esm1b_t33_650M_UR50S": "esm-1b",
}

# Checkpoint names and their labels, kept in matching order.
esm_models = list(models_dict.keys())
model_types = list(models_dict.values())

# Short identifier for every supported training strategy.
strategies_dict = {
    "train_on_cleared_1325_test_on_117_residue_split": "cleared_residue",
    "train_on_1325_cv_residue_split": "residue_cv",
    "train_on_cleared_1325_cv_residue_split": "cleared_residue_cv",
    "train_on_cleared_1325_cv_sequence_split": "cleared_sequence_cv",
}

# Full strategy names, in the order they are declared above.
train_strategies = list(strategies_dict.keys())
80 changes: 52 additions & 28 deletions adopt/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
# LICENSE file in the root directory of this source tree.

import pandas as pd
import constants
import utils
from adopt import constants, utils


class CheZod:
Expand All @@ -17,48 +16,73 @@ def get_chezod_raw(self):
df_ch = pd.read_json(self.path_chezod_1325_raw)
df_117 = pd.read_json(self.path_chezod_117_raw)

# since there are some proteins in the 1325 set, we will remove these and create a reduced dataframe for later use
# some proteins of the 117 set also appear in the 1325 set,
# so we remove them from the 1325 set and keep the reduced dataframe for later use
# check the overlap, if any exists, in the 117 and 1325 sets
overlaps = list(set(list(df_ch['brmid'])) & set(list(df_117['brmid'])))
overlaps = list(set(list(df_ch["brmid"])) & set(list(df_117["brmid"])))

# Drop the overlaps from the 1325
df_cleared = df_ch[~df_ch['brmid'].isin(overlaps)]
# Drop the overlaps from the 1325
df_cleared = df_ch[~df_ch["brmid"].isin(overlaps)]
return df_cleared, df_ch, df_117

def get_train_test_sets(self, path_chezod_1325_repr, path_chezod_117_repr):
    """Assemble the training and test inputs for every configured model type.

    Training data is built from the cleared 1325 frame (first value returned
    by ``get_chezod_raw``) and test data from the 117 frame (third value).

    Args:
        path_chezod_1325_repr: location of the 1325-set representations,
            forwarded to ``utils.representation_path``.
        path_chezod_117_repr: location of the 117-set representations,
            forwarded to ``utils.representation_path``.

    Returns:
        A tuple ``(ex_train, zed_train, ex_test, zed_test)`` of dicts keyed
        by model type, holding what ``utils.pedestrian_input`` returns for
        the respective set.
    """
    # collect the path to representations according to model type and train vs test set
    repr_path = utils.representation_path(
        path_chezod_1325_repr, path_chezod_117_repr
    )

    df_cleared, _, df_117 = self.get_chezod_raw()

    # read the data
    ex_train, zed_train = {}, {}
    ex_test, zed_test = {}, {}

    for model_type in constants.model_types:
        msa_ind = model_type == "esm-msa"

        # assemble the training data from the cleared 1325 set
        ex_train[model_type], zed_train[model_type] = utils.pedestrian_input(
            list(df_cleared["brmid"]),
            df_cleared,
            repr_path[model_type]["1325"],
            z_col="z-score",
            msa=msa_ind,
        )
        # assemble the test data from the 117 set
        # NOTE(review): the z-score column is passed as "zscore" here but as
        # "z-score" for the 1325 set - presumably the two files differ; confirm.
        ex_test[model_type], zed_test[model_type] = utils.pedestrian_input(
            list(df_117["brmid"]),
            df_117,
            repr_path[model_type]["117"],
            z_col="zscore",
            msa=msa_ind,
        )

    # Quick check, whether the number of inputs is the same for all model types
    for model_type in constants.model_types:
        print(model_type)
        print("----------------------------")
        print("training set")
        print(
            "input shape: ",
            ex_train[model_type].shape,
            "output shape: ",
            zed_train[model_type].shape,
        )
        print("test set")
        print(
            "input shape: ",
            ex_test[model_type].shape,
            "output shape: ",
            zed_test[model_type].shape,
        )
        print()

    # Compare every configured model type instead of only the first two, so
    # the check keeps working when model types are added or removed.
    input_counts = {
        ex_train[model_type].shape[0] for model_type in constants.model_types
    }
    if len(input_counts) == 1:
        print("The number of inputs is the same for each model type")

    return ex_train, zed_train, ex_test, zed_test
49 changes: 38 additions & 11 deletions adopt/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,60 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import getopt
import subprocess
import sys
import getopt
from pathlib import Path
import constants

from adopt import constants


# extract residue level representations of each protein sequence in the fasta file
def get_representations(fasta_file, repr_dir):
    """Run the ESM extraction script once per configured model.

    For every entry of ``constants.esm_models`` a sub-directory of
    ``repr_dir`` named after the matching ``constants.models_dict`` label is
    created (if missing) and ``../esm/extract.py`` is invoked with that
    directory as its output location, requesting per-token representations.

    Args:
        fasta_file: fasta file handed to the extraction script.
        repr_dir: root directory under which one sub-directory per model
            type is created.
    """
    for esm_model in constants.esm_models:
        model_dir = Path(str(repr_dir)) / constants.models_dict[esm_model]
        model_dir.mkdir(parents=True, exist_ok=True)

        # MSA checkpoints are read at layer 12, all other checkpoints at layer 33.
        # todo fasta_file->msa_fasta_file for the MSA checkpoints
        repr_layer = "12" if "esm_msa" in esm_model else "33"

        # Pass the arguments as a list rather than splitting a concatenated
        # string, so paths containing whitespace stay single arguments.
        command = [
            "python",
            "../esm/extract.py",
            str(esm_model),
            str(fasta_file),
            str(model_dir),
            "--repr_layers",
            repr_layer,
            "--include",
            "per_tok",
        ]
        # stdout is captured and discarded; the exit status is not checked,
        # matching the previous best-effort behaviour.
        subprocess.run(command, stdout=subprocess.PIPE)


def main(argv):
try:
opts, args = getopt.getopt(argv, "hf:r:", ["fasta_file=", "repr_dir="])
opts, args = getopt.getopt(argv, "hf:r:", ["fasta_file=", "repr_dir="])
except getopt.GetoptError:
print('usage: embedding.py -f <fasta_file_path=> -r <residue_level_representation_dir>')
print(
"usage: embedding.py"
"-f <fasta_file_path>"
"-r <residue_level_representation_dir>"
)
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('usage: embedding.py -f <fasta_files_dir> -r <residue_level_representation_dir>')
if opt == "-h":
print(
"usage: embedding.py"
"-f <fasta_file_path>"
"-r <residue_level_representation_dir>"
)
sys.exit()
elif opt in ("-f", "--fasta_dir"):
fasta_dir = arg
Expand All @@ -39,5 +65,6 @@ def main(argv):

get_representations(fasta_dir, repr_dir)


if __name__ == "__main__":
main(sys.argv[1:])
main(sys.argv[1:])
Loading

0 comments on commit 00f7d4c

Please sign in to comment.