diff --git a/dammit/annotate.py b/dammit/annotate.py
index 5c702d93..bd1db485 100644
--- a/dammit/annotate.py
+++ b/dammit/annotate.py
@@ -124,7 +124,7 @@ def build_default_pipeline(handler, config, databases):
     register_transdecoder_tasks(handler, config, databases)
     register_rfam_tasks(handler, config, databases)
     register_lastal_tasks(handler, config, databases,
-                          include_uniref=False)
+                          include_uniref=False, include_nr=False)
     register_user_db_tasks(handler, config, databases)
     register_annotate_tasks(handler, config, databases)
 
@@ -149,13 +149,39 @@ def build_full_pipeline(handler, config, databases):
     register_transdecoder_tasks(handler, config, databases)
     register_rfam_tasks(handler, config, databases)
     register_lastal_tasks(handler, config, databases,
-                          include_uniref=True)
+                          include_uniref=True, include_nr=False)
     register_user_db_tasks(handler, config, databases)
     register_annotate_tasks(handler, config, databases)
 
     return handler
 
 
+def build_nr_pipeline(handler, config, databases):
+    '''Register tasks for the full+nr dammit pipeline (with uniref90 AND nr).
+
+    Args:
+        handler (handler.TaskHandler): The task handler to register on.
+        config (dict): Config dictionary, which contains the command
+            line arguments and the entries from the config file.
+        databases (dict): The dictionary of files from a database
+            TaskHandler.
+
+    Returns:
+        handler.TaskHandler: The handler passed in.
+    '''
+    register_stats_task(handler)
+    register_busco_task(handler, config, databases)
+    register_transdecoder_tasks(handler, config, databases)
+    register_rfam_tasks(handler, config, databases)
+    register_lastal_tasks(handler, config, databases,
+                          include_uniref=True, include_nr=True)
+    register_user_db_tasks(handler, config, databases)
+    register_annotate_tasks(handler, config, databases)
+
+    return handler
+
+
+
 def build_quick_pipeline(handler, config, databases):
     '''Register tasks for the quick annotation pipeline.
 
@@ -304,7 +330,7 @@ def register_rfam_tasks(handler, config, databases):
 
 
 def register_lastal_tasks(handler, config, databases,
-                          include_uniref=False):
+                          include_uniref=False, include_nr=False):
     '''Register tasks for `lastal` searches. By default, this will just
     align the transcriptome against OrthoDB; if requested, it will align against
     uniref90 as well, which takes considerably longer.
@@ -325,6 +351,8 @@ def register_lastal_tasks(handler, config, databases,
     dbs['OrthoDB'] = databases['OrthoDB']
     if include_uniref is True:
         dbs['uniref90'] = databases['uniref90']
+    if include_nr is True:
+        dbs['nr'] = databases['nr']
 
     for name, db in dbs.items():
         output_fn = '{0}.x.{1}.maf'.format(input_fn, name)
diff --git a/dammit/app.py b/dammit/app.py
index 103acb9e..295994e5 100644
--- a/dammit/app.py
+++ b/dammit/app.py
@@ -18,7 +18,8 @@
 from dammit.meta import __version__, __authors__, __description__, __date__, get_config
 from dammit.annotate import (build_quick_pipeline,
                              build_default_pipeline,
-                             build_full_pipeline)
+                             build_full_pipeline,
+                             build_nr_pipeline)
 
 
 class DammitApp(object):
@@ -139,6 +140,16 @@ def add_common_args(parser):
                              ' time.'
                         )
 
+    pgroup.add_argument('--nr',
+                        action='store_true',
+                        default=False,
+                        help='Also include annotation to NR database, which'\
+                             ' is left out of the default and "full"'\
+                             ' pipelines because it is huge and'\
+                             ' homology searches take a long time.'
+                        )
+
+
     pgroup.add_argument('--quick',
                         default=False,
                         action='store_true',
@@ -267,7 +278,8 @@ def handle_databases(self):
         databases.build_default_pipeline(handler,
                                          self.config_d,
                                          self.databases_d,
-                                         with_uniref=self.args.full)
+                                         with_uniref=self.args.full or self.args.nr,
+                                         with_nr=self.args.nr)
         if self.args.install:
             return databases.install(handler)
         else:
@@ -287,7 +299,8 @@ def handle_annotate(self):
         databases.build_default_pipeline(db_handler,
                                          self.config_d,
                                          self.databases_d,
-                                         with_uniref=self.args.full)
+                                         with_uniref=self.args.full or self.args.nr,
+                                         with_nr=self.args.nr)
         if self.config_d['force'] is True:
             utd_msg = '*All database tasks up-to-date.*'
             ood_msg = '*Some database tasks out-of-date; '\
@@ -306,6 +319,10 @@ def handle_annotate(self):
             build_full_pipeline(annotate_handler,
                                 self.config_d,
                                 db_handler.files)
+        elif self.args.nr:
+            build_nr_pipeline(annotate_handler,
+                              self.config_d,
+                              db_handler.files)
         else:
             build_default_pipeline(annotate_handler,
                                    self.config_d,
diff --git a/dammit/databases.json b/dammit/databases.json
index 735c2026..b715f29c 100644
--- a/dammit/databases.json
+++ b/dammit/databases.json
@@ -28,6 +28,14 @@
         "version": "current",
         "url": "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
     },
+    "nr": {
+        "access": "download",
+        "db_type": "prot",
+        "filename": "nr",
+        "fileformat": "gz",
+        "version": "current",
+        "url": "ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz"
+    },
 
     "OrthoDB": {
         "access": "download",
diff --git a/dammit/databases.py b/dammit/databases.py
index ee45184a..7ef77943 100644
--- a/dammit/databases.py
+++ b/dammit/databases.py
@@ -117,7 +117,7 @@ def check_or_fail(handler):
         sys.exit(2)
 
 
-def build_default_pipeline(handler, config, databases, with_uniref=False):
+def build_default_pipeline(handler, config, databases, with_uniref=False, with_nr=False):
     '''Register tasks for dammit's builtin database prep pipeline.
 
     Args:
@@ -138,6 +138,8 @@ def build_default_pipeline(handler, config, databases, with_uniref=False):
     register_busco_tasks(handler, config, databases)
     if with_uniref:
         register_uniref90_tasks(handler, config['last']['lastdb'], databases)
+    if with_nr:
+        register_nr_tasks(handler, config['last']['lastdb'], databases)
 
     return handler
 
@@ -243,3 +245,31 @@ def register_uniref90_tasks(handler, params, databases):
                                             task_dep=[task.name]))
     return handler
 
+def register_nr_tasks(handler, params, databases):
+    '''Register tasks to download the nr database and format it with
+    lastdb.
+
+    Args:
+        handler (handler.TaskHandler): The task handler to register on.
+        params: The `lastdb` settings from the config, passed to LastDBTask.
+        databases (dict): Database metadata; must contain an `nr` entry
+            with `url` and `filename` keys.
+
+    Returns:
+        handler.TaskHandler: The handler passed in.
+    '''
+    nr = databases['nr']
+    task = get_download_and_gunzip_task(nr['url'],
+                                        nr['filename'])
+    filename = path.join(handler.directory, nr['filename'])
+    handler.register_task('download:nr',
+                          task,
+                          files={'nr': filename})
+    handler.register_task('lastdb:nr',
+                          LastDBTask().task(filename,
+                                            filename,
+                                            prot=True,
+                                            params=params,
+                                            task_dep=[task.name]))
+    return handler
+
diff --git a/generate-test-data.sh b/generate-test-data.sh
index f4d97f76..0898a7c3 100644
--- a/generate-test-data.sh
+++ b/generate-test-data.sh
@@ -4,12 +4,14 @@ DATA_DIR=dammit/tests/test-data
 TEST_FILE=pom.single.fa
 TEST_PEP=pep.fa
 
-dammit annotate $DATA_DIR/$TEST_FILE
-dammit annotate $DATA_DIR/$TEST_FILE --full -o $TEST_FILE.dammit.full
-dammit annotate --evalue 10.0 $DATA_DIR/$TEST_FILE -o $TEST_FILE.dammit.evalue10
-dammit annotate $DATA_DIR/$TEST_FILE --user-databases $DATA_DIR/$TEST_PEP -o $TEST_FILE.dammit.udb
-cp $TEST_FILE.dammit/$TEST_FILE.dammit.fasta $DATA_DIR/
+dammit annotate $DATA_DIR/$TEST_FILE
+dammit annotate $DATA_DIR/$TEST_FILE --full -o $TEST_FILE.dammit.full
+dammit annotate $DATA_DIR/$TEST_FILE --nr -o $TEST_FILE.dammit.nr
+dammit annotate --evalue 10.0 $DATA_DIR/$TEST_FILE -o $TEST_FILE.dammit.evalue10
+dammit annotate $DATA_DIR/$TEST_FILE --user-databases $DATA_DIR/$TEST_PEP -o $TEST_FILE.dammit.udb
+
+cp $TEST_FILE.dammit/$TEST_FILE.dammit.fasta $DATA_DIR/
 cp $TEST_FILE.dammit/$TEST_FILE.dammit.gff3 $DATA_DIR/
 
 
 cp $TEST_FILE.dammit.evalue10/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.evalue10
@@ -20,3 +22,7 @@ cp $TEST_FILE.dammit.udb/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3
 
 cp $TEST_FILE.dammit.full/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.full
 cp $TEST_FILE.dammit.full/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3.full
+
+cp $TEST_FILE.dammit.nr/$TEST_FILE.dammit.fasta $DATA_DIR/$TEST_FILE.dammit.fasta.nr
+cp $TEST_FILE.dammit.nr/$TEST_FILE.dammit.gff3 $DATA_DIR/$TEST_FILE.dammit.gff3.nr
+