From 6d0ab8b6a96de53e9c3cc0df086048f59a56f512 Mon Sep 17 00:00:00 2001 From: jvfe Date: Mon, 3 Jun 2024 17:01:27 -0300 Subject: [PATCH] feat: Add the download entry to the pipeline Signed-off-by: jvfe --- conf/base.config | 12 +++---- conf/modules.config | 31 ++++++++++++++++++ conf/test.config | 7 ++++ main.nf | 5 +++ modules/local/download/main.nf | 25 ++++++++++++++ nextflow.config | 12 +++++++ nextflow_schema.json | 60 ++++++++++++++++++++++++++++++++-- workflows/download.nf | 45 +++++++++++++++++++++++++ 8 files changed, 187 insertions(+), 10 deletions(-) create mode 100644 modules/local/download/main.nf create mode 100644 workflows/download.nf diff --git a/conf/base.config b/conf/base.config index 0158901..d62b5e1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -19,13 +18,6 @@ process { maxRetries = 1 maxErrors = '-1' - // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. - // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. - // If possible, it would be nice to keep the same label naming convention when - // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
- // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } @@ -59,6 +51,10 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel:error_retry_delay { + errorStrategy = { sleep(Math.pow(2, task.attempt) * 200 as long); return 'retry' } + maxRetries = 3 + } withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } diff --git a/conf/modules.config b/conf/modules.config index 6bf4697..be1fe2d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,6 +27,37 @@ process { ] } + // DOWNLOAD ENTRY + withName: DOWNLOAD_FUNCTIONAL_DB { + publishDir = [ + path: { "${params.outdir}/references/functional" }, + mode: "move", + ] + } + withName: DOWNLOAD_FUNCTIONAL_DICT { + publishDir = [ + path: { "${params.outdir}/references/functional" }, + mode: "move", + ] + } + withName: DOWNLOAD_KAIJU { + publishDir = [ + path: { "${params.outdir}/references/kaiju" }, + mode: "move", + ] + } + withName: DOWNLOAD_KRAKEN { + publishDir = [ + path: { "${params.outdir}/references/kraken2" }, + mode: "move", + ] + } + withName: DOWNLOAD_HOST { + publishDir = [ + path: { "${params.outdir}/references/host" }, + mode: "move", + ] + } // Host removal withName: BOWTIE2_ALIGN { ext.args = "--sensitive" diff --git a/conf/test.config b/conf/test.config index c2ffbe1..dccd906 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,6 +26,13 @@ params { id_mapping = "$projectDir/test_data/idmapping_selected.tab.example.gz" reference_fasta = "$projectDir/test_data/protein.faa.gz" + // download entry + functional_db = 'https://github.com/dalmolingroup/euryale/raw/main/test_data/protein.faa.gz' + functional_dictionary = 'https://github.com/dalmolingroup/euryale/raw/main/test_data/idmapping_selected.tab.example.gz' + kaiju_db_url = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/data/database/kaiju/kaiju.tar.gz' + kraken2_db_url = 
'https://github.com/nf-core/test-datasets/raw/taxprofiler/data/database/kraken2/testdb-kraken2.tar.gz' + host_url = 'https://github.com/dalmolingroup/euryale/raw/main/test_data/GCA_002596845.1_ASM259684v1_genomic.fna.gz' + // Annotation params minimum_bitscore = 30 minimum_pident = 30 diff --git a/main.nf b/main.nf index 13ac8fd..7419278 100644 --- a/main.nf +++ b/main.nf @@ -32,6 +32,7 @@ WorkflowMain.initialise(workflow, params, log) */ include { EURYALE } from './workflows/euryale' +include { DOWNLOAD } from './workflows/download' // // WORKFLOW: Run main dalmolingroup/euryale analysis pipeline @@ -40,6 +41,10 @@ workflow DALMOLINGROUP_EURYALE { EURYALE () } +workflow download { + DOWNLOAD () +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN ALL WORKFLOWS diff --git a/modules/local/download/main.nf b/modules/local/download/main.nf new file mode 100644 index 0000000..6f12b08 --- /dev/null +++ b/modules/local/download/main.nf @@ -0,0 +1,25 @@ +process DOWNLOAD { + tag "$id" + + label 'process_single' + label 'error_retry_delay' + + input: + val id + val url + + output: + path "${prefix}", emit: db + + script: + prefix = task.ext.prefix ?: "${id}" + + """ + wget -O ${prefix} $url + """ + + stub: + """ + touch ${prefix} + """ +} diff --git a/nextflow.config b/nextflow.config index eb9dcf7..d98ac4c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,6 +50,18 @@ params { skip_alignment = false skip_microview = false + // Download entry options + download_functional = true + download_kaiju = true + download_kraken = false + download_host = false + + functional_db = 'https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz' + functional_dictionary = 'https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz' + kaiju_db_url = 'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz' + kraken2_db_url = 
'https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240112.tar.gz' + host_url = 'http://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' + // MultiQC options multiqc_config = null multiqc_title = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 35e15f4..7183bc6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -237,6 +237,58 @@ } } }, + "download_entry": { + "title": "Download Entry", + "type": "object", + "description": "", + "default": "", + "properties": { + "download_functional": { + "type": "boolean", + "default": true, + "description": "Whether to download functional references" + }, + "download_kaiju": { + "type": "boolean", + "default": true, + "description": "Whether to download the Kaiju reference db" + }, + "download_kraken": { + "type": "boolean", + "description": "Whether to download the Kraken2 reference db" + }, + "download_host": { + "type": "boolean", + "description": "Whether to download the host reference genome" + }, + "functional_db": { + "type": "string", + "default": "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz", + "description": "Functional reference URL (download entry)" + }, + "functional_dictionary": { + "type": "string", + "default": "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz", + "description": "Functional dictionary URL (download entry)" + }, + "kaiju_db_url": { + "type": "string", + "default": "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz", + "description": "Kaiju reference URL (download entry)" + }, + "kraken2_db_url": { + "type": "string", + "default": "https://genome-idx.s3.amazonaws.com/kraken/k2_standard_08gb_20240112.tar.gz", + "description": "Kraken2 reference URL (download entry)" + }, + "host_url": { + "type": "string", + "default": 
"http://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz", + "description": "Host FASTA reference URL (download entry)" + } + }, + "fa_icon": "fas fa-database" + }, "max_job_request_options": { "title": "Max job request options", "type": "object", @@ -352,7 +404,8 @@ "type": "string", "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", "fa_icon": "fas fa-image", - "hidden": true + "hidden": true, + "default": "/home/jvfe/dev/pesquisa/dalmolingroup-euryale/assets/euryale_logo.png" }, "multiqc_methods_description": { "type": "string", @@ -362,7 +415,7 @@ "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", + "default": "null/pipeline_info", "fa_icon": "fas fa-cogs", "hidden": true }, @@ -413,6 +466,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/download_entry" + }, { "$ref": "#/definitions/max_job_request_options" }, diff --git a/workflows/download.nf b/workflows/download.nf new file mode 100644 index 0000000..083470d --- /dev/null +++ b/workflows/download.nf @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) + +// Validate input parameters +WorkflowEuryale.initialise(params, log) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULES +// + +include { DOWNLOAD as DOWNLOAD_FUNCTIONAL_DB } from '../modules/local/download/main' +include { DOWNLOAD as DOWNLOAD_FUNCTIONAL_DICT } from 
'../modules/local/download/main' +include { DOWNLOAD as DOWNLOAD_KAIJU } from '../modules/local/download/main' +include { DOWNLOAD as DOWNLOAD_KRAKEN } from '../modules/local/download/main' +include { DOWNLOAD as DOWNLOAD_HOST } from '../modules/local/download/main' + +workflow DOWNLOAD { + if (params.download_functional) { + DOWNLOAD_FUNCTIONAL_DB("reference_fasta.fa.gz", params.functional_db) + DOWNLOAD_FUNCTIONAL_DICT("id_mapping.tab.gz", params.functional_dictionary) + } + + if (params.download_kaiju) { + DOWNLOAD_KAIJU("kaiju_db.tar.gz", params.kaiju_db_url) + } + + if (params.download_kraken) { + DOWNLOAD_KRAKEN("kraken2_db.tar.gz", params.kraken2_db_url) + } + + if (params.download_host) { + DOWNLOAD_HOST("host_fasta.fa.gz", params.host_url) + } +}