struct: create proteonemo package
CFisicaro committed Mar 7, 2022
1 parent 058b07a commit 8a917e5
Showing 20 changed files with 80 additions and 24 deletions.
15 changes: 14 additions & 1 deletion proteonemo/__init__.py
@@ -12,4 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .version import version as __version__

from proteonemo.package_info import (
__contact_emails__,
__contact_names__,
__description__,
__download_url__,
__homepage__,
__keywords__,
__license__,
__package_name__,
__repository_url__,
__shortversion__,
__version__,
)
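
A minimal usage sketch (not part of the diff; assumes the package is installed): with this change the metadata defined in package_info.py is re-exported at the top level, so callers can read it straight off the package.

# Hypothetical consumer of the new re-exports:
import proteonemo

print(proteonemo.__version__)       # "0.1.0"
print(proteonemo.__package_name__)  # "proteonemo"
print(proteonemo.__description__)   # "ProteoNeMo - protein embeddings at scale"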
2 changes: 1 addition & 1 deletion proteonemo/version.py → proteonemo/models/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

version = "0.1.0"
from proteonemo.models.bert_prot_model import BERTPROTModel
File renamed without changes.
35 changes: 35 additions & 0 deletions proteonemo/package_info.py
@@ -0,0 +1,35 @@
# Copyright (c) 2021 Peptone.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


MAJOR = 0
MINOR = 1
PATCH = 0
PRE_RELEASE = ''

# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

__shortversion__ = '.'.join(map(str, VERSION[:3]))
__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])

__package_name__ = 'proteonemo'
__contact_names__ = 'Peptone'
__contact_emails__ = '[email protected]'
__homepage__ = 'https://peptone.io/'
__repository_url__ = 'https://github.com/PeptoneInc/ProteoNeMo.git'
__download_url__ = 'https://github.com/PeptoneInc/ProteoNeMo/archive/refs/heads/main.zip'
__description__ = 'ProteoNeMo - protein embeddings at scale'
__license__ = 'Apache2'
__keywords__ = 'protein, embedding, deep learning, machine learning, gpu, NeMo, peptone, pytorch, torch, tts'
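
To make the version-string construction above concrete (an illustrative sketch only; the 'rc1' pre-release value is hypothetical, this commit ships with PRE_RELEASE = ''):

# How the tuple renders into the two version strings:
MAJOR, MINOR, PATCH, PRE_RELEASE = 0, 1, 0, 'rc1'
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

shortversion = '.'.join(map(str, VERSION[:3]))                      # "0.1.0"
version = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])    # "0.1.0rc1"
# With PRE_RELEASE = '' both strings evaluate to "0.1.0".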
@@ -10,4 +10,12 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.

from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer
from proteonemo.preprocessing import tokenization
from proteonemo.preprocessing.uniref_downloader import UniRefDownloader
from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader
from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader
from proteonemo.preprocessing.downloader import Downloader
from proteonemo.preprocessing.protein_sharding import Sharding
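
These re-exports flatten the import surface of proteonemo.preprocessing; a brief sketch of what they allow (names taken from the lines above, not additional commit content):

# Callers can now import the public classes directly from the subpackage:
from proteonemo.preprocessing import Downloader, Sharding, ProteoNeMoTokenizer
# ...instead of spelling out proteonemo.preprocessing.downloader and friends.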
@@ -24,8 +24,8 @@
import numpy as np
from tqdm import tqdm, trange

from tokenization import ProteoNeMoTokenizer
import tokenization as tokenization
from proteonemo.preprocessing.tokenization import ProteoNeMoTokenizer
from proteonemo.preprocessing import tokenization as tokenization

import random
import collections
@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from UniRefDownloader import UniRefDownloader
from UniProtKBDownloader import UniProtKBDownloader
from UniParcDownloader import UniParcDownloader
from proteonemo.preprocessing.uniref_downloader import UniRefDownloader
from proteonemo.preprocessing.uniprotkb_downloader import UniProtKBDownloader
from proteonemo.preprocessing.uniparc_downloader import UniParcDownloader


class Downloader:
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion proteonemo/bert_pred.py → scripts/bert_pred.py
@@ -18,7 +18,7 @@
from nemo.core.config import hydra_runner
from pytorch_lightning.plugins import DDPPlugin
from nemo.utils.app_state import AppState
from bert_prot_model import BERTPROTModel
from proteonemo.models.bert_prot_model import BERTPROTModel
from nemo.collections.nlp.data.language_modeling.lm_bert_dataset import BertPretrainingPreprocessedDataset
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from torch.utils.data import DataLoader
8 changes: 4 additions & 4 deletions preprocessing/bertPrep.py → scripts/bert_prep.py
@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import Downloader
import ProteinSharding
from proteonemo.preprocessing.downloader import Downloader
from proteonemo.preprocessing.protein_sharding import Sharding

import argparse
import os
@@ -51,7 +51,7 @@ def main(args):
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])

downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader = Downloader(args.dataset, directory_structure['download'])
downloader.download()

elif args.action == 'sharding':
@@ -90,7 +90,7 @@ def main(args):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)

rng = random.Random(args.random_seed)
sharding = ProteinSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng)
sharding = Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set, rng)
sharding.load_fastas()
sharding.write_shards_to_disk()

@@ -18,13 +18,13 @@
from pytorch_lightning.plugins import DDPPlugin

#from nemo.collections.nlp.models.language_modeling import BERTLMModel
from bert_prot_model import BERTPROTModel
from proteonemo.models.bert_prot_model import BERTPROTModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="bert_pretraining_from_preprocessed_config")
@hydra_runner(config_path="../conf", config_name="bert_pretraining_from_preprocessed_config")
def main(cfg: DictConfig) -> None:
logging.info(f'Config:\n {OmegaConf.to_yaml(cfg)}')
trainer = pl.Trainer(plugins=[DDPPlugin(find_unused_parameters=True)], **cfg.trainer)
@@ -16,17 +16,17 @@ to_download=${1:-"uniref_50_only"}

#Download
if [ "$to_download" = "uniref_all" ] ; then
python3 bertPrep.py --action download --dataset uniref_90
python3 bertPrep.py --action download --dataset uniref_100
python3 bert_prep.py --action download --dataset uniref_90
python3 bert_prep.py --action download --dataset uniref_100
elif [ "$to_download" = "uniparc" ] ; then
python3 /proteonemo/preprocessing/bertPrep.py --action download --dataset uniparc
elif [ "$to_download" = "uniprotkb_all" ] ; then
python3 bertPrep.py --action download --dataset uniprotkb_swissprot
python3 bertPrep.py --action download --dataset uniprotkb_trembl
python3 bertPrep.py --action download --dataset uniprotkb_isoformseqs
python3 bert_prep.py --action download --dataset uniprotkb_swissprot
python3 bert_prep.py --action download --dataset uniprotkb_trembl
python3 bert_prep.py --action download --dataset uniprotkb_isoformseqs
fi

python3 /proteonemo/preprocessing/bertPrep.py --action download --dataset uniref_50
python3 bert_prep.py --action download --dataset uniref_50

if [ "$to_download" = "uniref_all" ] ; then
DATASET="uniref_all"
@@ -40,9 +40,9 @@ else
fi

# Shard the text files
python3 bertPrep.py --action sharding --dataset $DATASET
python3 bert_prep.py --action sharding --dataset $DATASET

# Create HDF5 files
python3 bertPrep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \
--max_predictions_per_seq 160 --vocab_file vocab.txt --small_vocab_file vocab_small.txt --do_upper_case
python3 bert_prep.py --action create_hdf5_files --dataset $DATASET --max_seq_length 1024 \
--max_predictions_per_seq 160 --vocab_file ../static/vocab.txt --small_vocab_file ../static/vocab_small.txt --do_upper_case

File renamed without changes.
File renamed without changes.
