diff --git a/bin/utils.py b/bin/utils.py
index 9d062ba..4662722 100644
--- a/bin/utils.py
+++ b/bin/utils.py
@@ -7,7 +7,6 @@
 from typing import Any
 
 import requests
-from requests.exceptions import RequestException
 
 POLLING_INTERVAL = 0.5
 
@@ -16,7 +15,10 @@ def safe_get(url: str):
     Get a URL and return the response.
     """
     try:
-        return requests.get(url)
+        return requests.get(url, timeout=300)
+    except requests.exceptions.Timeout as e:
+        print(f"Request timed out. This might be due to a server issue. If this persists, try again later. Details:\n{e}", file=sys.stderr)
+        sys.exit(10)
     except requests.exceptions.RequestException as e:
         print(f"A network issue occurred. Retrying request. Details:\n{e}", file=sys.stderr)
         sys.exit(10)
@@ -27,7 +29,10 @@ def safe_post(url: str, data: dict = dict(), json: dict = dict()):
     Post data to a URL and return the response.
     """
     try:
-        return requests.post(url, data=data, json=json)
+        return requests.post(url, data=data, json=json, timeout=300)
+    except requests.exceptions.Timeout as e:
+        print(f"Request timed out. This might be due to a server issue. If this persists, try again later. Details:\n{e}", file=sys.stderr)
+        sys.exit(10)
     except requests.exceptions.RequestException as e:
         print(f"A network issue occurred. Retrying request. Details:\n{e}", file=sys.stderr)
         sys.exit(10)
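Note on the change above: exit code 10 is the contract between these Python helpers and the Nextflow configuration in `conf/modules.config` below, which treats status 10 as retryable. A minimal sketch of the pairing; the process name `EXAMPLE_FETCH` is hypothetical, while the `errorStrategy` and `maxRetries` lines mirror the real config blocks:

```groovy
// Sketch: how sys.exit(10) from safe_get/safe_post becomes a task retry.
// 'EXAMPLE_FETCH' is a hypothetical process name used for illustration.
process {
    withName: 'EXAMPLE_FETCH' {
        errorStrategy = { task.exitStatus == 10 ? 'retry' : 'finish' } // 10 = transient network failure
        maxRetries    = 3                                              // give up after three attempts
    }
}
```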
diff --git a/conf/modules.config b/conf/modules.config
index 614ed45..aba1e3e 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -26,7 +26,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/seqinfo" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
         errorStrategy = {task.exitStatus == 10 ? 'retry' : 'finish'}
         maxRetries = 3
@@ -36,7 +37,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/orthologs/oma" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
         errorStrategy = {task.exitStatus == 10 ? 'retry' : 'finish'}
         maxRetries = 3
@@ -46,7 +48,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/orthologs/panther" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
         errorStrategy = {task.exitStatus == 10 ? 'retry' : 'finish'}
         maxRetries = 3
@@ -56,7 +59,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/orthologs/orthoinspector" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
         errorStrategy = {task.exitStatus == 10 ? 'retry' : 'finish'}
         maxRetries = 3
@@ -66,7 +70,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/orthologs/eggnog" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
         errorStrategy = {task.exitStatus == 10 ? 'retry' : 'finish'}
         maxRetries = 3
@@ -74,11 +79,6 @@ process {
 
     withName: 'MERGE_CSV' {
         ext.args = '-f 1 --outer-join --na 0'
-        publishDir = [
-            path: { "${params.outdir}/orthologs/merge_csv" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
     }
 
     withName: 'MAKE_SCORE_TABLE' {
@@ -93,7 +93,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/orthologs/filter_hits" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
     }
 
@@ -105,33 +106,9 @@ process {
         ]
     }
 
-    withName: 'MAKE_HITS_TABLE' {
-        publishDir = [
-            path: { "${params.outdir}/orthologs/hits" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: 'MERGE_HITS' {
         ext.args = "-u 0 -k"
         ext.prefix = "aggregated_hits"
-        publishDir = [
-            path: { "${params.outdir}/orthologs/hits" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: 'MAKE_STATS' {
-        publishDir = [
-            path: { "${params.outdir}/orthologs/stats" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: 'STATS2CSV' {
         publishDir = [
             path: { "${params.outdir}/orthologs/stats" },
             mode: params.publish_dir_mode,
@@ -155,7 +132,7 @@ process {
 
     withName: 'FETCH_SEQUENCES_ONLINE' {
         publishDir = [
-            path: { "${params.outdir}/sequences" },
+            path: { "${params.outdir}/alignment/sequences" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
@@ -165,7 +142,7 @@ process {
 
     withName: 'FETCH_AFDB_STRUCTURES' {
         publishDir = [
-            path: { "${params.outdir}/structures" },
+            path: { "${params.outdir}/alignment/structures" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
     }
@@ -177,15 +154,8 @@ process {
         publishDir = [
             path: { "${params.outdir}/alignment/filter" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: 'CREATE_TCOFFEETEMPLATE' {
-        publishDir = [
-            path: { "${params.outdir}/alignment/template" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
     }
 
@@ -201,14 +171,6 @@ process {
     // ----------------------
    // Tree reconstruction
     // ----------------------
 
-    withName: 'CONVERT_PHYLIP' {
-        publishDir = [
-            path: { "${params.outdir}/trees/convert" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: 'IQTREE' {
         ext.args = '-m TEST' + (params.iqtree_bootstrap > 0 ? ' -bb ' + params.iqtree_bootstrap : '')
         publishDir = [
@@ -247,19 +209,12 @@ process {
     // ----------------------
     // Report generation
     // ----------------------
 
-    withName: 'CONVERT_FASTA' {
-        publishDir = [
-            path: { "${params.outdir}/report/convert" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
     withName: 'DUMP_PARAMS' {
         publishDir = [
             path: { "${params.outdir}/report/params" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: params.output_intermediates
         ]
     }
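The repeated `enabled: params.output_intermediates` additions above gate publishing only: when the flag is `false` (the default set in `nextflow.config` below), the processes still run, but their outputs stay in Nextflow's work directory instead of being copied into `params.outdir`. The pattern in isolation, as a sketch; the process name is illustrative, not necessarily the one used above:

```groovy
// Publishing gate used throughout the config above (sketch).
// 'FILTER_HITS' is an illustrative process name.
process {
    withName: 'FILTER_HITS' {
        publishDir = [
            path: { "${params.outdir}/orthologs/filter_hits" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, // never publish versions.yml
            enabled: params.output_intermediates // false => results kept only in the work dir
        ]
    }
}
```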
diff --git a/docs/usage.md b/docs/usage.md
index 5919ae1..cc4ee4d 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -101,12 +101,18 @@ If you want to use local database copies for the run, you must provide the requi
 | `eggnog_path`       | `1_members.tsv.gz`        |
 | `eggnog_idmap_path` | `latest.Eukaryota.tsv.gz` |
 
+If you need reduced versions of the local databases for testing, you can find them [here](https://github.com/nf-core/test-datasets/tree/reportho/testdata/databases). Note that they were designed to work with the [test samplesheet](https://github.com/nf-core/test-datasets/blob/reportho/testdata/samplesheet/samplesheet.csv) and will likely not return results for other queries.
+
 ### Running offline
 
-With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step is run fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input will not work with this option enabled. You can check `test_offline.config` to see the required options for a fully offline run. Keep in mind that the options only affect ortholog finding, and the downstream analysis still requires connection to obtain sequences and structures.
+With large input sizes, you might want to run the pipeline locally, without runtime access to APIs. There are two main parameters used to achieve this. If you want to use local databases, set `--local_databases` to `true`. Remember to set `--use_all` to `false` to ensure the database step runs fully offline. If your input is especially large, you can also skip the initial online identification steps by setting `--offline_run` to `true`. Note that FASTA input does not work with this option enabled, and the pipeline will abort if it is attempted. You can check [test_offline.config](https://github.com/nf-core/reportho/blob/master/conf/test_offline.config) to see the options required for a fully offline run. Keep in mind that these options only affect ortholog finding; the downstream analysis still requires a connection to obtain sequences and structures.
 
 While those options allow the pipeline to run its steps offline, the pipeline requires certain configuration files and container images that are downloaded from the internet. If you wish to run the pipeline on a machine without a connection, you can pre-download the required files with `nf-core download`. See [the nf-core tools documentation](https://nf-co.re/docs/nf-core-tools/pipelines/download) for details.
 
+### Downstream analysis
+
+Downstream analysis (i.e., MSA and phylogeny) relies on online resources to obtain sequences and structures, and thus cannot be run offline. For your convenience, it will be automatically disabled if you enable `offline_run`. Note that if some sequences or structures cannot be obtained, the corresponding orthologs will be excluded from the alignment and phylogeny. In particular, if `use_structures` is enabled, only the orthologs with both a sequence and a structure available will be retained.
+
 ### Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
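Putting the documented flags together, a fully offline run corresponds to a parameter set along these lines. This is a sketch only; `conf/test_offline.config` in the repository is the authoritative reference, the database paths are placeholders, and the full list of `*_path` parameters is given in the table above:

```groovy
// Sketch of a fully offline parameter set (see conf/test_offline.config).
params {
    offline_run       = true   // skip online identification; FASTA input aborts the run
    local_databases   = true   // read ortholog data from local database copies
    use_all           = false  // restrict to databases that can run offline
    skip_downstream   = true   // MSA/phylogeny need online sequence and structure fetching
    eggnog_path       = '/data/db/1_members.tsv.gz'        // placeholder path
    eggnog_idmap_path = '/data/db/latest.Eukaryota.tsv.gz' // placeholder path
}
```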
diff --git a/nextflow.config b/nextflow.config
index 303129b..26c16f9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -10,6 +10,7 @@ params {
 
     // Input options
     input                        = null
+    output_intermediates         = false
 
     // MultiQC options
     multiqc_config               = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index e93113e..0a08e56 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -29,6 +29,12 @@
             "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
             "fa_icon": "fas fa-folder-open"
         },
+        "output_intermediates": {
+            "type": "boolean",
+            "default": false,
+            "description": "Output intermediate files, including specific prediction lists.",
+            "fa_icon": "fas fa-folder-open"
+        },
         "email": {
             "type": "string",
             "description": "Email address for completion summary.",
diff --git a/subworkflows/local/get_orthologs.nf b/subworkflows/local/get_orthologs.nf
index 8c524de..124b0c3 100644
--- a/subworkflows/local/get_orthologs.nf
+++ b/subworkflows/local/get_orthologs.nf
@@ -36,16 +36,15 @@ workflow GET_ORTHOLOGS {
     ch_eggnog       = params.eggnog_path ? Channel.value(file(params.eggnog_path)) : Channel.empty()
     ch_eggnog_idmap = params.eggnog_idmap_path ? Channel.value(file(params.eggnog_idmap_path)) : Channel.empty()
 
-    fasta_input = true
-    ch_samplesheet_fasta.ifEmpty {
-        fasta_input = false
-    }
-    ch_samplesheet_fasta.view()
-
-    if (fasta_input && params.offline_run) {
-        log.warn("You are using FASTA input in an offline run. Online identification will be used. Be aware it might cause rate limit issues.")
-    }
+    ch_samplesheet_fasta.map {
+        if (params.offline_run) {
+            error "Tried to use FASTA input in an offline run. Aborting pipeline for user safety."
+        }
+        return it
+    }.set { ch_samplesheet_fasta }
 
     // Preprocessing - find the ID and taxid of the query sequences
+    ch_samplesheet_fasta
         .map { it -> [it[0], file(it[1])] }
         .set { ch_fasta }
@@ -66,12 +65,10 @@ workflow GET_ORTHOLOGS {
         ch_versions = ch_versions.mix(WRITE_SEQINFO.out.versions)
     }
 
     // Ortholog fetching
-    if(params.offline_run && params.use_all) {
-        log.warn("Both '--use_all' and '--offline_run' parameters have been specified!\nThose databases that can't be run offline will be run online.")
-    }
-
-    if(params.use_all) {
-        // OMA
+
+    // OMA
+
+    if (params.use_all || !params.skip_oma) {
         if (params.local_databases) {
             FETCH_OMA_GROUP_LOCAL (
                 ch_query,
@@ -98,7 +95,11 @@ workflow GET_ORTHOLOGS {
             ch_versions = ch_versions.mix(FETCH_OMA_GROUP_ONLINE.out.versions)
         }
 
-        // Panther
+    }
+
+    // PANTHER
+
+    if (params.use_all || !params.skip_panther) {
         if (params.local_databases) {
             FETCH_PANTHER_GROUP_LOCAL (
                 ch_query,
@@ -121,7 +122,11 @@ workflow GET_ORTHOLOGS {
             ch_versions = ch_versions.mix(FETCH_PANTHER_GROUP_ONLINE.out.versions)
         }
 
-        // OrthoInspector
+    }
+
+    // OrthoInspector
+
+    if ((params.use_all || !params.skip_orthoinspector) && !params.local_databases) {
         FETCH_INSPECTOR_GROUP_ONLINE (
             ch_query,
             params.orthoinspector_version
@@ -132,8 +137,11 @@ workflow GET_ORTHOLOGS {
             .set { ch_orthogroups }
 
         ch_versions = ch_versions.mix(FETCH_INSPECTOR_GROUP_ONLINE.out.versions)
+    }
+
+    // EggNOG
 
-        // EggNOG
+    if (params.use_all || (!params.skip_eggnog && params.local_databases)) {
         FETCH_EGGNOG_GROUP_LOCAL (
             ch_query,
             ch_eggnog,
@@ -149,92 +157,6 @@ workflow GET_ORTHOLOGS {
 
         ch_versions = ch_versions.mix(FETCH_EGGNOG_GROUP_LOCAL.out.versions)
     }
-    else { // online/local separation is used
-        // local only
-        if (params.local_databases) {
-            if (!params.skip_oma) {
-                FETCH_OMA_GROUP_LOCAL (
-                    ch_query,
-                    ch_oma_groups,
-                    ch_oma_uniprot,
-                    ch_oma_ensembl,
-                    ch_oma_refseq
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_OMA_GROUP_LOCAL.out.oma_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_OMA_GROUP_LOCAL.out.versions)
-            }
-
-            if (!params.skip_panther) {
-                FETCH_PANTHER_GROUP_LOCAL (
-                    ch_query,
-                    ch_panther
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_PANTHER_GROUP_LOCAL.out.panther_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_PANTHER_GROUP_LOCAL.out.versions)
-            }
-
-            if(!params.skip_eggnog) {
-                FETCH_EGGNOG_GROUP_LOCAL (
-                    ch_query,
-                    ch_eggnog,
-                    ch_eggnog_idmap,
-                    ch_oma_ensembl,
-                    ch_oma_refseq,
-                    params.offline_run
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_EGGNOG_GROUP_LOCAL.out.eggnog_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_EGGNOG_GROUP_LOCAL.out.versions)
-            }
-        }
-        else { // online only
-            if (!params.skip_oma) {
-                FETCH_OMA_GROUP_ONLINE (
-                    ch_query
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_OMA_GROUP_ONLINE.out.oma_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_OMA_GROUP_ONLINE.out.versions)
-            }
-            if (!params.skip_panther) {
-                FETCH_PANTHER_GROUP_ONLINE (
-                    ch_query
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_PANTHER_GROUP_ONLINE.out.panther_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_PANTHER_GROUP_ONLINE.out.versions)
-            }
-            if (!params.skip_orthoinspector) {
-                FETCH_INSPECTOR_GROUP_ONLINE (
-                    ch_query,
-                    params.orthoinspector_version
-                )
-
-                ch_orthogroups
-                    .mix(FETCH_INSPECTOR_GROUP_ONLINE.out.inspector_group)
-                    .set { ch_orthogroups }
-
-                ch_versions = ch_versions.mix(FETCH_INSPECTOR_GROUP_ONLINE.out.versions)
-            }
-        }
-    }
 
     // Result merging
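The refactor above replaces the nested `use_all`/`local_databases` branching with one guard per database. Condensed into boolean form as a sketch, with names mirroring the parameters used in the workflow:

```groovy
// Condensed view of the per-database guards introduced above (sketch).
def fetch_oma            = params.use_all || !params.skip_oma                                         // local or online
def fetch_panther        = params.use_all || !params.skip_panther                                     // local or online
def fetch_orthoinspector = (params.use_all || !params.skip_orthoinspector) && !params.local_databases // online only
def fetch_eggnog         = params.use_all || (!params.skip_eggnog && params.local_databases)          // local only
```

Inside the OMA and PANTHER guards, `params.local_databases` then selects between the `_LOCAL` and `_ONLINE` modules.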
diff --git a/subworkflows/local/make_trees.nf b/subworkflows/local/make_trees.nf
index 6f60967..5d12701 100644
--- a/subworkflows/local/make_trees.nf
+++ b/subworkflows/local/make_trees.nf
@@ -11,10 +11,10 @@ workflow MAKE_TREES {
     main:
     ch_versions = Channel.empty()
 
-    ch_mltree = Channel.empty()
-    ch_metree = Channel.empty()
-    ch_mlplot = Channel.empty()
-    ch_meplot = Channel.empty()
+    ch_mltree = ch_alignment.map { [it[0], []] }
+    ch_metree = ch_alignment.map { [it[0], []] }
+    ch_mlplot = ch_alignment.map { [it[0], []] }
+    ch_meplot = ch_alignment.map { [it[0], []] }
 
     if (!params.skip_iqtree) {
         IQTREE (
@@ -26,8 +26,6 @@ workflow MAKE_TREES {
 
         ch_versions = ch_versions.mix(IQTREE.out.versions)
 
-        ch_mlplot = ch_alignment.map { [it[0], []] }
-
         if(!params.skip_treeplots) {
             PLOT_IQTREE (
                 IQTREE.out.phylogeny,
@@ -56,8 +54,6 @@ workflow MAKE_TREES {
 
         ch_versions = ch_versions.mix(FASTME.out.versions)
 
-        ch_meplot = ch_alignment.map { [it[0], []] }
-
         if(!params.skip_treeplots) {
             PLOT_FASTME (
                 FASTME.out.nwk,
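The change above seeds the tree and plot channels with per-sample placeholders up front instead of empty channels. The practical difference, sketched below, assuming `ch_alignment` emits `[meta, msa]` tuples as the `map` calls suggest:

```groovy
// Before (sketch): Channel.empty() emits nothing, so downstream consumers
// receive no element at all for samples whose tree step was skipped.
ch_mlplot = Channel.empty()

// After (sketch): every alignment contributes an explicit [meta, []] tuple,
// presumably replaced by a real result only when the step actually runs.
ch_mlplot = ch_alignment.map { aln -> [ aln[0], [] ] }
```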
diff --git a/subworkflows/local/report.nf b/subworkflows/local/report.nf
index b3c60ed..5a997de 100644
--- a/subworkflows/local/report.nf
+++ b/subworkflows/local/report.nf
@@ -30,25 +30,6 @@ workflow REPORT {
     ch_versions = Channel.empty()
     ch_fasta = ch_seqinfo.map { [it[0], []] }
 
-    if(params.skip_downstream) {
-        ch_seqhits   = ch_seqinfo.map { [it[0], []] }
-        ch_seqmisses = ch_seqinfo.map { [it[0], []] }
-        ch_strhits   = ch_seqinfo.map { [it[0], []] }
-        ch_strmisses = ch_seqinfo.map { [it[0], []] }
-        ch_alignment = ch_seqinfo.map { [it[0], []] }
-    }
-    else if(!params.use_structures) {
-        ch_strhits   = ch_seqinfo.map { [it[0], []] }
-        ch_strmisses = ch_seqinfo.map { [it[0], []] }
-    }
-
-    if (params.skip_iqtree) {
-        ch_iqtree = ch_seqinfo.map { [it[0], []] }
-    }
-    if (params.skip_fastme) {
-        ch_fastme = ch_seqinfo.map { [it[0], []] }
-    }
-
     DUMP_PARAMS(
         ch_seqinfo.map { [it[0], it[3]] },
         params.use_structures,
diff --git a/subworkflows/local/utils_nfcore_reportho_pipeline/main.nf b/subworkflows/local/utils_nfcore_reportho_pipeline/main.nf
index 44dc7eb..d0e7824 100644
--- a/subworkflows/local/utils_nfcore_reportho_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_reportho_pipeline/main.nf
@@ -73,6 +73,11 @@ workflow PIPELINE_INITIALISATION {
         nextflow_cli_args
     )
 
+    //
+    // Validate parameters
+    //
+    validateParameters()
+
     //
     // Create channel from input file provided through params.input and check for query
     //
@@ -140,6 +145,31 @@ workflow PIPELINE_COMPLETION {
 ========================================================================================
 */
 
+//
+// Validate parameters
+//
+def validateParameters() {
+    validateOfflineSettings()
+}
+
+def validateOfflineSettings() {
+    if (params.offline_run) {
+        if (!params.local_databases) {
+            params.local_databases = true
+            log.warn("Offline mode enabled, setting 'local_databases' to 'true'")
+        }
+        if (!params.skip_downstream) {
+            params.skip_downstream = true
+            log.warn("Offline mode enabled, setting 'skip_downstream' to 'true'")
+        }
+        if (params.use_all) {
+            log.warn("Offline run set with 'use_all', only local databases will be used")
+        }
+    } else if (params.use_all && params.local_databases) {
+        log.warn("Local databases set with 'use_all', only local databases will be used")
+    }
+}
+
 //
 // Validate channels from input samplesheet