Skip to content

Commit

Permalink
Merge pull request #108 from bluegenes/nr_pipeline
Browse files Browse the repository at this point in the history
annotate with NR
  • Loading branch information
camillescott authored Jun 18, 2018
2 parents 66a1b98 + 19e143f commit b1b2015
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 12 deletions.
34 changes: 31 additions & 3 deletions dammit/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def build_default_pipeline(handler, config, databases):
register_transdecoder_tasks(handler, config, databases)
register_rfam_tasks(handler, config, databases)
register_lastal_tasks(handler, config, databases,
include_uniref=False)
include_uniref=False, include_nr=False)
register_user_db_tasks(handler, config, databases)
register_annotate_tasks(handler, config, databases)

Expand All @@ -149,13 +149,39 @@ def build_full_pipeline(handler, config, databases):
register_transdecoder_tasks(handler, config, databases)
register_rfam_tasks(handler, config, databases)
register_lastal_tasks(handler, config, databases,
include_uniref=True)
include_uniref=True, include_nr=False)
register_user_db_tasks(handler, config, databases)
register_annotate_tasks(handler, config, databases)

return handler


def build_nr_pipeline(handler, config, databases):
    '''Register tasks for the full+nr dammit pipeline (uniref90 AND nr).

    The `lastal` step is registered with both `include_uniref` and
    `include_nr` switched on; every other step is registered with the
    plain (handler, config, databases) arguments.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.

    Returns:
        handler.TaskHandler: The handler passed in.
    '''
    # Every registrar except the stats one takes the same three positionals.
    shared_args = (handler, config, databases)

    register_stats_task(handler)
    register_busco_task(*shared_args)
    register_transdecoder_tasks(*shared_args)
    register_rfam_tasks(*shared_args)
    # This is the only step that differs from the other pipelines: it is
    # asked to search both uniref90 and nr.
    register_lastal_tasks(*shared_args, include_uniref=True, include_nr=True)
    register_user_db_tasks(*shared_args)
    register_annotate_tasks(*shared_args)

    return handler



def build_quick_pipeline(handler, config, databases):
'''Register tasks for the quick annotation pipeline.
Expand Down Expand Up @@ -304,7 +330,7 @@ def register_rfam_tasks(handler, config, databases):


def register_lastal_tasks(handler, config, databases,
include_uniref=False):
include_uniref=False, include_nr=False):
'''Register tasks for `lastal` searches. By default, this will just
align the transcriptome against OrthoDB; if requested, it will align against
uniref90 as well, which takes considerably longer.
Expand All @@ -325,6 +351,8 @@ def register_lastal_tasks(handler, config, databases,
dbs['OrthoDB'] = databases['OrthoDB']
if include_uniref is True:
dbs['uniref90'] = databases['uniref90']
if include_nr is True:
dbs['nr'] = databases['nr']

for name, db in dbs.items():
output_fn = '{0}.x.{1}.maf'.format(input_fn, name)
Expand Down
23 changes: 20 additions & 3 deletions dammit/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from dammit.meta import __version__, __authors__, __description__, __date__, get_config
from dammit.annotate import (build_quick_pipeline,
build_default_pipeline,
build_full_pipeline)
build_full_pipeline,
build_nr_pipeline)


class DammitApp(object):
Expand Down Expand Up @@ -139,6 +140,16 @@ def add_common_args(parser):
' time.'
)

pgroup.add_argument('--nr',
action='store_true',
default=False,
help='Also include annotation to NR database, which'\
' is left out of the default and "full"'\
' pipelines because it is huge and'\
' homology searches take a long time.'
)


pgroup.add_argument('--quick',
default=False,
action='store_true',
Expand Down Expand Up @@ -267,7 +278,8 @@ def handle_databases(self):
databases.build_default_pipeline(handler,
self.config_d,
self.databases_d,
with_uniref=self.args.full)
with_uniref=self.args.full,
with_nr=self.args.nr)
if self.args.install:
return databases.install(handler)
else:
Expand All @@ -287,7 +299,8 @@ def handle_annotate(self):
databases.build_default_pipeline(db_handler,
self.config_d,
self.databases_d,
with_uniref=self.args.full)
with_uniref=self.args.full,
with_nr=self.args.nr)
if self.config_d['force'] is True:
utd_msg = '*All database tasks up-to-date.*'
ood_msg = '*Some database tasks out-of-date; '\
Expand All @@ -306,6 +319,10 @@ def handle_annotate(self):
build_full_pipeline(annotate_handler,
self.config_d,
db_handler.files)
elif self.args.nr:
build_nr_pipeline(annotate_handler,
self.config_d,
db_handler.files)
else:
build_default_pipeline(annotate_handler,
self.config_d,
Expand Down
8 changes: 8 additions & 0 deletions dammit/databases.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
"version": "current",
"url": "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
},
"nr": {
"access": "download",
"db_type": "prot",
"filename": "nr",
"fileformat": "gz",
"version": "current",
"url": "ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz"
},

"OrthoDB": {
"access": "download",
Expand Down
20 changes: 19 additions & 1 deletion dammit/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def check_or_fail(handler):
sys.exit(2)


def build_default_pipeline(handler, config, databases, with_uniref=False):
def build_default_pipeline(handler, config, databases, with_uniref=False, with_nr=False):
'''Register tasks for dammit's builtin database prep pipeline.
Args:
Expand All @@ -138,6 +138,8 @@ def build_default_pipeline(handler, config, databases, with_uniref=False):
register_busco_tasks(handler, config, databases)
if with_uniref:
register_uniref90_tasks(handler, config['last']['lastdb'], databases)
if with_nr:
register_nr_tasks(handler, config['last']['lastdb'], databases)

return handler

Expand Down Expand Up @@ -243,3 +245,19 @@ def register_uniref90_tasks(handler, params, databases):
task_dep=[task.name]))
return handler

def register_nr_tasks(handler, params, databases):
    '''Register the download and `lastdb` preparation tasks for NR.

    Two tasks are registered on the handler: `download:nr`, which is built
    from the url and filename of the `nr` database entry, and `lastdb:nr`,
    which is declared to depend on the download task.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        params: Passed through unchanged as `params` to the lastdb task
            (callers in this file pass config['last']['lastdb']).
        databases (dict): Database metadata; the `nr` entry must provide
            `url` and `filename`.

    Returns:
        handler.TaskHandler: The handler passed in.
    '''
    nr_entry = databases['nr']
    download_task = get_download_and_gunzip_task(nr_entry['url'],
                                                 nr_entry['filename'])
    # The database file lives inside the handler's directory.
    nr_path = path.join(handler.directory, nr_entry['filename'])

    handler.register_task('download:nr',
                          download_task,
                          files={'nr': nr_path})

    # The same path is passed as the first two arguments of the lastdb task
    # (presumably input file and output prefix -- confirm against LastDBTask).
    lastdb_task = LastDBTask().task(nr_path,
                                    nr_path,
                                    prot=True,
                                    params=params,
                                    task_dep=[download_task.name])
    handler.register_task('lastdb:nr', lastdb_task)

    return handler

16 changes: 11 additions & 5 deletions generate-test-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ DATA_DIR=dammit/tests/test-data
TEST_FILE=pom.single.fa
TEST_PEP=pep.fa

dammit annotate $DATA_DIR/$TEST_FILE
dammit annotate $DATA_DIR/$TEST_FILE --full -o $TEST_FILE.dammit.full
dammit annotate --evalue 10.0 $DATA_DIR/$TEST_FILE -o $TEST_FILE.dammit.evalue10
dammit annotate $DATA_DIR/$TEST_FILE --user-databases $DATA_DIR/$TEST_PEP -o $TEST_FILE.dammit.udb

cp $TEST_FILE.dammit/$TEST_FILE.dammit.fasta $DATA_DIR/
dammit annotate $DATA_DIR/$TEST_FILE
dammit annotate $DATA_DIR/$TEST_FILE --full -o $TEST_FILE.dammit.full
dammit annotate $DATA_DIR/$TEST_FILE --nr -o $TEST_FILE.dammit.nr
dammit annotate --evalue 10.0 $DATA_DIR/$TEST_FILE -o $TEST_FILE.dammit.evalue10
dammit annotate $DATA_DIR/$TEST_FILE --user-databases $DATA_DIR/$TEST_PEP -o $TEST_FILE.dammit.udb

cp $TEST_FILE.dammit/$TEST_FILE.dammit.fasta $DATA_DIR/
cp $TEST_FILE.dammit/$TEST_FILE.dammit.gff3 $DATA_DIR/

cp $TEST_FILE.dammit.evalue10/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.evalue10
Expand All @@ -20,3 +22,7 @@ cp $TEST_FILE.dammit.udb/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3

cp $TEST_FILE.dammit.full/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.full
cp $TEST_FILE.dammit.full/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3.full

cp $TEST_FILE.dammit.nr/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.nr
cp $TEST_FILE.dammit.nr/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3.nr

0 comments on commit b1b2015

Please sign in to comment.