From a9edc36f3cc915707a1fd3a5010a8d0246754656 Mon Sep 17 00:00:00 2001 From: Julian Libiseller-Egger Date: Wed, 6 Dec 2023 14:37:34 +0000 Subject: [PATCH] Docs update [CW-3102] --- .gitlab-ci.yml | 4 + .pre-commit-config.yaml | 12 +- CHANGELOG.md | 7 + README.md | 422 ++++++++++++++++---------- bin/workflow_glue/subset_reads.py | 9 +- docs/01_brief_description.md | 1 + docs/{intro.md => 02_introduction.md} | 11 +- docs/03_compute_requirements.md | 13 + docs/04_install_and_run.md | 35 +++ docs/05_related_protocols.md | 3 + docs/06_inputs.md | 73 +++++ docs/07_outputs.md | 12 + docs/08_pipeline_overview.md | 75 +++++ docs/09_troubleshooting.md | 2 + docs/10_FAQ.md | 32 ++ docs/11_other.md | 5 + docs/header.md | 3 - docs/links.md | 11 - docs/quickstart.md | 176 ----------- lib/NfcoreSchema.groovy | 4 +- lib/ingress.nf | 8 + main.nf | 31 +- modules/local/common.nf | 6 + modules/local/de-novo.nf | 7 + modules/local/variant-calling.nf | 7 + nextflow.config | 4 +- nextflow_schema.json | 15 +- output_definition.json | 68 +++++ 28 files changed, 680 insertions(+), 376 deletions(-) create mode 100644 docs/01_brief_description.md rename docs/{intro.md => 02_introduction.md} (69%) create mode 100644 docs/03_compute_requirements.md create mode 100644 docs/04_install_and_run.md create mode 100644 docs/05_related_protocols.md create mode 100644 docs/06_inputs.md create mode 100644 docs/07_outputs.md create mode 100644 docs/08_pipeline_overview.md create mode 100644 docs/09_troubleshooting.md create mode 100644 docs/10_FAQ.md create mode 100644 docs/11_other.md delete mode 100644 docs/header.md delete mode 100644 docs/links.md delete mode 100644 docs/quickstart.md create mode 100644 output_definition.json diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 17bff99..096ca6d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,6 +5,7 @@ include: variables: NF_WORKFLOW_OPTS: > + -executor.\$$local.memory 16GB --fastq test_data/fastq --reference test_data/reference.fasta --threads 2 
@@ -41,6 +42,7 @@ docker-run: - if: $MATRIX_NAME == "ref-sample_sheet" variables: NF_WORKFLOW_OPTS: > + -executor.\$$local.memory 16GB --fastq test_data/fastq --reference test_data/reference.fasta --sample_sheet test_data/sample_sheet.csv @@ -54,6 +56,7 @@ docker-run: - if: $MATRIX_NAME == "filter-all" variables: NF_WORKFLOW_OPTS: > + -executor.\$$local.memory 16GB --fastq test_data/fastq --reference test_data/reference.fasta --min_read_qual 20 @@ -69,6 +72,7 @@ docker-run: - if: $MATRIX_NAME == "de-novo" variables: NF_WORKFLOW_OPTS: > + -executor.\$$local.memory 16GB --fastq test_data/fastq-denovo --drop_frac_longest_reads 0.05 NF_PROCESS_FILES: > diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6583953..8134142 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,14 @@ repos: - repo: local hooks: - - id: docs_schema - name: docs_schema - entry: parse_docs -p docs -e .md -s intro links -oj nextflow_schema.json - language: python - always_run: true - pass_filenames: false - additional_dependencies: - - epi2melabs - id: docs_readme name: docs_readme - entry: parse_docs -p docs -e .md -s header intro quickstart links -ot README.md + entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_inputs 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json language: python always_run: true pass_filenames: false additional_dependencies: - - epi2melabs + - epi2melabs>=0.0.50 - id: build_models name: build_models entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 54ff221..76b7d01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 
+4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.0.0] +### Added +- Memory requirements for each process. + +### Changed +- Reworked docs to follow new layout. + ## [v0.6.2] ### Fixed - The de-novo QC stage failing when not a single input read re-aligns against the draft consensus. diff --git a/README.md b/README.md index 9427387..667844e 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ -# wf-amplicon - - +# Amplicon workflow +Nextflow workflow for analysing Oxford Nanopore reads created by amplicon sequencing. @@ -14,8 +13,7 @@ The workflow requires raw reads in FASTQ format and can be run in two modes: * Variant calling mode: Trigger this mode by passing a reference FASTA file. After initial filtering (based on read length and quality) and adapter trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the - reads to the reference (please note that the reference should only contain the - expected sequences of the individual amplicons). Variants are then called with + reads to the reference. Variants are then called with [Medaka](https://github.com/nanoporetech/medaka). This mode allows for multiple amplicons per barcode (for details on how to map specific target amplicons to individual samples / barcodes, see below). @@ -26,59 +24,188 @@ The workflow requires raw reads in FASTQ format and can be run in two modes: [Medaka](https://github.com/nanoporetech/medaka). Please note that only one amplicon per barcode is supported in de-novo consensus mode. -The results of the workflow include an interactive HTML report, FASTA files with -the consensus sequences of the amplicons, BAM files with the alignments, and VCF -files containing the variants (if run in variant calling mode). 
+> Note: This workflow is *not* intended for marker gene sequencing of mixtures / communities of different organisms (e.g. 16S sequencing). +> In de-novo consensus mode it expects a single amplicon per barcode. +> When running in variant calling mode, multiple amplicons per barcode can be processed, but their sequences need to be sufficiently different from each other so that most reads only align to one of the provided references. + + + + +## Compute requirements + +Recommended requirements: ++ CPUs = 12 ++ Memory = 32GB +Minimum requirements: ++ CPUs = 6 ++ Memory = 16GB -## Quickstart +Approximate run time: 0.5-5 minutes per sample (depending on number of reads, length of amplicons, and available compute). -The workflow relies on the following dependencies: +ARM processor support: True -- [Nextflow](https://www.nextflow.io/) for managing compute and software - resources. -- Either [Docker](https://www.docker.com/products/docker-desktop) or - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide - isolation of the required software. -It is not necessary to clone or download the git repository in order to run the -workflow. For more information on running EPI2ME Labs workflows, [visit our -website](https://labs.epi2me.io/wfindex). -**Workflow options** -If you have [Nextflow](https://www.nextflow.io/) installed, you can run the -following to obtain the workflow: +## Install and run + +These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/). + +The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore Nextflow will need to be installed before attempting to run the workflow. 
+ +The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or +[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of +the required software. Both methods are automated out-of-the-box provided +either Docker or Singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below. + +It is not required to clone or download the git repository in order to run the workflow. +More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex). + +The following command can be used to obtain the workflow. This will pull the repository into the assets folder of Nextflow and provide a list of all parameters available for the workflow as well as an example command: ``` -nextflow run epi2me-labs/wf-amplicon --help +nextflow run epi2me-labs/wf-amplicon --help ``` -This will show the workflow's command line options. - -### Usage +A demo dataset is provided for testing of the workflow. It can be downloaded using: -This section covers how to use the workflow when providing a reference file with -the expected sequences of the amplicons ("variant calling mode"). For details on -how to run the workflow without a reference ("de-novo consensus mode"), see the -relevant section below.
+``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-amplicon/wf-amplicon-demo.tar.gz +tar -xzvf wf-amplicon-demo.tar.gz +``` -Basic usage is as follows +The workflow can be run with the demo data using: -```bash +``` nextflow run epi2me-labs/wf-amplicon \ - --fastq $input \ - --reference references.fa \ - --threads 4 + --fastq wf-amplicon-demo/fastq \ + --reference wf-amplicon-demo/reference.fa \ + -profile standard ``` -`$input` can be a single FASTQ file, a directory containing FASTQ files, or a -directory containing barcoded sub-directories which in turn contain FASTQ files. -A sample sheet can be included with `--sample_sheet` and a sample name for an -individual sample with `--sample`. If a sample sheet is provided, it needs to -contain at least these three columns: +For further information about running a workflow on the command line see https://labs.epi2me.io/wfquickstart/ + + + + +## Related protocols + +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). + + + + +## Inputs + +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 
| | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | +| reference | string | Path to a reference FASTA file. | The reference file should contain one sequence per amplicon. | | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Pre-processing Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| min_read_length | integer | Shorter reads will be removed. | | 300 | +| max_read_length | integer | Longer reads will be removed. | | | +| min_read_qual | number | Reads with a lower mean quality will be removed. | | 10 | +| drop_frac_longest_reads | number | Drop fraction of longest reads. | The very longest reads might be concatemers or contain other artifacts. In many cases removing them simplifies de novo consensus generation. | 0.05 | +| take_longest_remaining_reads | boolean | Whether to use the longest (remaining) reads. 
| With this option, reads are not randomly selected during downsampling (potentially after the longest reads have been removed), but instead the longest remaining reads are taken. This generally improves performance on long amplicons. | True | +| reads_downsampling_size | integer | Downsample to this number of reads per sample. | Downsampling is performed after filtering. Set to 0 to disable downsampling. | 0 | +| min_n_reads | number | Samples / barcodes with fewer reads will not be processed. | | 40 | + + +### Variant Calling Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| min_coverage | integer | Minimum coverage for variants to keep. | Only variants covered by more than this number of reads are reported in the resulting VCF file. | 20 | +| basecaller_cfg | string | Name of the basecaller model that processed the signal data; used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameters. Available models are: 'dna_r10.4.1_e8.2_400bps_hac@v3.5.2', 'dna_r10.4.1_e8.2_400bps_sup@v3.5.2', 'dna_r9.4.1_e8_fast@v3.4', 'dna_r9.4.1_e8_hac@v3.3', 'dna_r9.4.1_e8_sup@v3.3', 'dna_r10.4.1_e8.2_400bps_hac_prom', 'dna_r9.4.1_450bps_hac_prom', 'dna_r10.3_450bps_hac', 'dna_r10.3_450bps_hac_prom', 'dna_r10.4.1_e8.2_260bps_hac', 'dna_r10.4.1_e8.2_260bps_hac_prom', 'dna_r10.4.1_e8.2_400bps_hac', 'dna_r9.4.1_450bps_hac', 'dna_r9.4.1_e8.1_hac', 'dna_r9.4.1_e8.1_hac_prom'. | dna_r10.4.1_e8.2_400bps_sup@v4.2.0 | +| medaka_model | string | The name of the Medaka model to use. This will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecaller model (provided with 'basecaller_cfg') used to a suitable Medaka model. 
You can override this by providing a model with this option instead. | | + + +### De-novo Consensus Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| force_spoa_length_threshold | integer | Consensus length below which to force SPOA consensus generation. | If the consensus generated by `miniasm` is shorter than this value, force consensus generation with SPOA (regardless of whether the sequence produced by `miniasm` passed QC or not). The rationale for this parameter is that `miniasm` sometimes gives slightly truncated assemblies for short amplicons from RBK data, whereas SPOA tends to be more robust in this regard. | 2000 | +| spoa_minimum_relative_coverage | number | Minimum coverage (relative to the number of reads per sample after filtering) when constructing the consensus with SPOA. | Needs to be a number between 0.0 and 1.0. The result of multiplying this number with the number of reads for the corresponding sample (after filtering) is passed on to SPOA's `--min-coverage` option. | 0.15 | +| minimum_mean_depth | integer | Mean depth threshold to pass consensus quality control. | Draft consensus sequences with a lower average depth of coverage after re-aligning the input reads will fail QC. | 30 | +| primary_alignments_threshold | number | Fraction of primary alignments to pass quality control. | Draft consensus sequences with a lower fraction of primary alignments after re-aligning the input reads will fail QC. | 0.7 | + + +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | +| combine_results | boolean | Whether to merge per-sample results into a single BAM / VCF file. | Per default, results are grouped per sample. 
With this option, an additional BAM and VCF file are produced which contain the alignments / variants for all samples and amplicons. | False | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| number_depth_windows | integer | Number of windows used during depth of coverage calculations. | Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained plots at the expense of run time. | 100 | +| medaka_target_depth_per_strand | integer | Downsample each amplicon to this per-strand depth before running Medaka. | Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance. | 75 | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the maximum number of CPU threads for such processes. The total CPU resources used by the workflow are constrained by the executor configuration in `nextflow.config`. | 4 | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + + + + + +## Outputs + +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 
+ +| Title | File path | Description | Per sample or aggregated | +|-------|-----------|-------------|--------------------------| +| workflow report | ./wf-amplicon-report.html | Report for all samples. | aggregated | +| Sanitized reference file | ./reference_sanitized_seqIDs.fasta | Some programs used by the workflow don't like special characters (like colons) in the sequence IDs in the reference FASTA file. The reference is thus "sanitized" by replacing these characters with underscores. This file is only generated when the workflow is run in variant calling mode. | aggregated | +| Alignments BAM file | ./{{ alias }}/alignments/aligned.sorted.bam | BAM file with alignments of input reads against the references (in variant calling mode) or the created consensus (in de-novo consensus mode). | per-sample | +| Alignments index file | ./{{ alias }}/alignments/aligned.sorted.bam.bai | Index for alignments BAM file. | per-sample | +| De-novo consensus FASTQ file | ./{{ alias }}/consensus/consensus.fastq | Consensus file generated by the de-novo consensus pipeline. | per-sample | +| Consensus FASTA file | ./{{ alias }}/consensus/consensus.fasta | Consensus file generated by the variant calling pipeline. | per-sample | +| Variants VCF file | ./{{ alias }}/variants/medaka.annotated.vcf.gz | VCF file of variants detected against the provided reference. | per-sample | +| Variants index file | ./{{ alias }}/variants/medaka.annotated.vcf.gz.csi | Index for variants VCF file. | per-sample | + + + + +## Pipeline overview + +> Note: The initial preprocessing steps are the same for the variant calling and the de-novo consensus mode. + +#### 1. Read preprocessing + +The input data can be a single FASTQ file, a directory containing FASTQ files, or a directory containing barcoded sub-directories which in turn contain FASTQ files. +A sample sheet can be included with `--sample_sheet` and a sample name for an individual sample with `--sample`.
+If a sample sheet is provided, it needs to contain at least these three columns: - `barcode`: names of the sub-directories with the FASTQ files of the different samples @@ -86,15 +213,87 @@ contain at least these three columns: - `type`: sample types (needs to be one of `test_sample`, `positive_control`, `negative_control`, `no_template_control`) -Additionally, it can also contain a column named `ref`, which can be used to -specify one or more sequences in the reference FASTA file for the individual -samples. If a sample should be aligned against multiple reference sequences, -their IDs can be included as a space-delimited list. If a specific sample should -be aligned against all references, you can leave the corresponding cell in the -`ref` column blank. As an example, the following sample sheet tells the workflow -that the first two barcodes (`sample1` and `sample2`) are expected to contain -reads from both amplicons in the reference, whereas the other two samples only -contain one amplicon each. +After parsing the sample sheet, the raw reads are filtered. Relevant options for filtering are + +- `--min_read_length` +- `--max_read_length` +- `--min_read_qual` + +Reads can optionally also be downsampled (`--reads_downsampling_size` controls the number of reads to keep per sample). +The workflow supports random downsampling as well as selecting reads based on read length. +Subsetting based on read length tends to work better for de-novo consensus generation and usually does not decrease performance of the variant calling mode. +It is thus set as default (see the FAQ section below for details and for how to disable this). +The selected reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) prior to downstream analysis. + +> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are ignored. + + +### Variant calling mode + +#### 2. 
Align reads + +Preprocessed reads are aligned against the provided reference with [Minimap2](https://github.com/lh3/minimap2) (please note that the reference should only contain the expected sequences of the individual amplicons). +[Bamstats](https://github.com/epi2me-labs/fastcat#bamstats) is used to collate alignment statistics and [mosdepth](https://github.com/brentp/mosdepth) to calculate the depth of coverage along the amplicon. + +#### 3. Call variants + +After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). +You can set the minimum coverage a variant needs to exceed in order to be included in the results with `--min_coverage`. +Variants with lower coverage will still be listed in the resulting VCF files, but with `LOW_DEPTH` instead of `PASS` in the `FILTER` column. + +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). +Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_model`. + +#### 4. Use the variants to generate a consensus + +The variants passing the depth filter are then incorporated in the reference to create the consensus sequence. + +### De-novo consensus mode + +#### 2. Create a draft consensus + +When running in de-novo mode, [miniasm](https://github.com/lh3/miniasm) or [SPOA](https://github.com/rvaser/spoa) are used to create a draft consensus after the reads were selected and trimmed. +`miniasm` tends to work better for longer amplicons and `spoa` for shorter amplicons. +The workflow attempts to create an assembly with `miniasm` first. +If this succeeds, the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and the input reads are re-aligned against it. 
+This is followed by some post-processing (trimming low-coverage regions from the ends) and quality control (QC). +During QC, the following sanity-checks are performed: + +- The mean coverage of the consensus needs to be above `--minimum_mean_depth` (default: 30X). +- The fraction of primary alignments for the re-aligned reads needs to be more than + `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / + supplementary alignments indicate that something might have gone wrong during + assembly. If your amplicons contain long repetitive regions, you can lower + this number. + +If more than one contig from the draft assembly passes these filters, the one with the highest coverage is selected. + +If assembly generation fails or none of the contigs produced by `miniasm` pass QC, `spoa` is tried. +Regardless of how the draft consensus was generated (`miniasm` or `spoa`), a final polishing step with +[Medaka](https://github.com/nanoporetech/medaka) is performed. + +> Note: Since `spoa` tends to produce a better consensus than `miniasm` for short amplicons, users can force the workflow to run `spoa` if the assembly created by `miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes QC). + + + + +## Troubleshooting + ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ Please see [here](https://labs.epi2me.io/trouble-shooting/) for how to resolve some common Nextflow issues and [here](https://labs.epi2me.io/how-to-exits/) for how to interpret command exit codes. + + + + +## FAQ's + +*Variant calling mode: How can I specify specific reference sequences for individual barcodes?* - Per default, the workflow aligns all barcodes against all sequences in the supplied reference. 
+If you want to map specific reference sequences to specific amplicons, you can add a column named `ref` containing the respective target reference IDs to the sample sheet. +If a barcode should be aligned against multiple reference sequences, their IDs can be included as a space-delimited list. +If a specific sample should be aligned against all references, you can leave the corresponding cell in the `ref` column blank. +Consider the following example with 4 barcodes and 2 sequences in the reference file. The sample sheet tells the workflow that the first two barcodes (`sample1` and `sample2`) are expected to contain +reads from both amplicons in the reference, whereas the other two samples only contain one amplicon each. ``` barcode,alias,type,ref @@ -104,122 +303,35 @@ barcode03,sample3,positive_control,katG::NC_000962.3:2154725-2155670 barcode04,sample4,test_sample,rpoB::NC_000962.3:760285-761376 ``` -Relevant options for filtering of raw reads are +*How much coverage do I need?* - We recommend >150X average coverage across the individual amplicons. +1500 reads per amplicon should thus be enough in the vast majority of cases. +You can speed up execution time by setting `--reads_downsampling_size 1500` (or to a smaller number if your amplicons are not longer than a few kb). -- `--min_read_length` -- `--max_read_length` -- `--min_read_qual` +*Why does the workflow select reads based on length rather than random downsampling per default?* - Despite the workflow dropping reads containing adapter sequences in the middle (using `porechop --discard_middle`), some reads in the input data stemming from concatemers or other PCR artifacts might still make it through preprocessing and could thus interfere with consensus generation. +A simple way to avoid this is to drop a small fraction (e.g. 5%) of longest reads. +Furthermore, `spoa` depends on the order of reads in the input and benefits from "seeing" longer reads first. 
+Therefore, the following workflow options are used per default `--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`. +This essentially drops the longest 5% of reads and then selects the next longest reads (either all of them or the number specified with `--reads_downsampling_size`). +To disable the default behaviour and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 --take_longest_remaining_reads false`. + +*Is there a difference between the consensus sequences created in variant calling and de-novo consensus mode?* - In variant calling mode, the consensus sequence is generated by applying the variants found by [Medaka](https://github.com/nanoporetech/medaka) to the provided reference. +This comes with the caveat that structural variants too large to be detected by Medaka are not going to be represented in the consensus. +For this reason we suggest to always have a look at the coverage plots in the generated HTML report. +In de-novo consensus mode, however, the consensus is generated either via assembly or [POA](https://doi.org/10.1093/bioinformatics/btg109) and no prior information besides the reads is taken into account. + +If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-amplicon/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -After initial filtering, reads can optionally be downsampled -(`--reads_downsampling_size` controls the number of reads to keep per sample). -The workflow supports random downsampling as well as selecting reads based on -read length. Subsetting based on read length tends to work better for de-novo -consensus generation and is thus set as default (see the section for the de-novo -consensus mode below for details and for how to disable this). However, it -should not detriment the performance of the variant-calling mode.
The selected -reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) -before being aligned against the reference. - -> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are -> ignored. - -After alignment, haploid variants are called with -[Medaka](https://github.com/nanoporetech/medaka). You can set the minimum -coverage a variant needs to exceed in order to be included in the results with -`--min_coverage`. Variants with lower coverage will still be included in the -resulting VCF files, but with `LOW_DEPTH` instead of `PASS` in the `FILTER` -column. - -The workflow selects the appropriate -[Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller -configuration that was used to process the signal data. You can use the -parameter `--basecaller_cfg` to provide this information (e.g. -`dna_r10.4.1_e8.2_400bps_hac`). Alternatively, you can choose the -[Medaka](https://github.com/nanoporetech/medaka) model directly with -`--medaka_model`. - -If you want to use -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) instead of -[Docker](https://www.docker.com/products/docker-desktop), add `-profile -singularity` to the nextflow invocation. - -### Key outputs - -- Interactive HTML report detailing the results. -- BAM files with reads aligned against the reference. -- VCF files with variants called by - [Medaka](https://github.com/nanoporetech/medaka). -- FASTA files with consensus sequences generated by applying the variants to the - provided reference. - -A sub-directory with these output files will be created for each sample. -Additionally, a single BAM and VCF file containing the results of all samples -will be produced if the option `--combine_results` is provided. - -### Known issues and limitations - -The consensus sequence is generated by applying the variants found by -[Medaka](https://github.com/nanoporetech/medaka) to the provided reference. 
This -comes with the caveat that deletions at either end of the amplicon are not going -to be reflected in the consensus (i.e. it will still contain the deleted -regions). For this reason we suggest to always have a look at the coverage plots -in the generated HTML report. - -### Running without a reference - -The workflow can also be run without a reference. It will then use -[miniasm](https://github.com/lh3/miniasm) or -[SPOA](https://github.com/rvaser/spoa) to create a draft consensus. `miniasm` -tends to work better for longer amplicons and `spoa` for shorter amplicons. The -workflow attempts to create an assembly with `miniasm` first. If this succeeds, -the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and -the input reads are re-aligned against it. This is followed by some post-processing -(trimming low-coverage regions from the ends) and quality control (QC). During QC, the following -sanity-checks are performed: - -- The mean coverage of the consensus is above `--minimum_mean_depth` (default: 30X). -- The fraction of primary alignments for the re-aligned reads is more than - `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / - supplementary alignments indicate that something might have gone wrong during - assembly. If your amplicons contain long repetitive regions, you can lower - this number. -If more than one contig from the draft assembly passes these filters, the one -with the highest coverage is selected. -If assembly generation fails or none of the contigs produced by `miniasm` pass -QC, `spoa` is tried. Regardless of how the draft consensus was generated -(`miniasm` or `spoa`), a final polishing step with -[Medaka](https://github.com/nanoporetech/medaka) is performed. 
-Note: Since `spoa` tends to produce a better consensus than `miniasm` for short -amplicons, users can force the workflow to run `spoa` if the assembly created by -`miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes -QC). +## Related blog posts -**Random downsampling vs selecting reads by length:** +## Related blog posts -Despite the workflow dropping reads containing adapter sequences in the middle -(with `porechop --discard_middle`), some reads in the input data stemming from -concatemers or other PCR artifacts might still make it through preprocessing and -could thus interfere with consensus generation. A simple way to avoid this is to -drop a small fraction (e.g. 5%) of longest reads. Additionally, `spoa` depends -on the order of reads in the input and benefits from "seeing" longer reads -first. Therefore, the following options are used per default -`--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`. To disable this -and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 ---take_longest_remaining_reads false`. +- [How to align your data](https://labs.epi2me.io/how-to-align/) +See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. 
-## Useful links -- [Nextflow](https://www.nextflow.io/) -- [Docker](https://www.docker.com/products/docker-desktop) -- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) -- [minimap2](https://github.com/lh3/minimap2) -- [miniasm](https://github.com/lh3/miniasm) -- [SPOA](https://github.com/rvaser/spoa) -- [Medaka](https://github.com/nanoporetech/medaka) -- [Porechop](https://github.com/rrwick/Porechop) diff --git a/bin/workflow_glue/subset_reads.py b/bin/workflow_glue/subset_reads.py index 0ca4ae1..33cff5d 100755 --- a/bin/workflow_glue/subset_reads.py +++ b/bin/workflow_glue/subset_reads.py @@ -12,9 +12,12 @@ def main(args): logger = get_named_logger("subsetReads") logger.info("Read per-read stats and sort lengths.") - sorted_lengths = pd.read_csv(args.per_read_stats, sep="\t", index_col=0)[ - "read_length" - ].sort_values(ascending=False) + sorted_lengths = pd.read_csv( + args.per_read_stats, + sep="\t", + index_col="read_id", + usecols=["read_id", "read_length"], + ).squeeze().sort_values(ascending=False) drop_longest_n = 0 if args.drop_longest_frac: diff --git a/docs/01_brief_description.md b/docs/01_brief_description.md new file mode 100644 index 0000000..e6fc789 --- /dev/null +++ b/docs/01_brief_description.md @@ -0,0 +1 @@ +Nextflow workflow for analysing Oxford Nanopore reads created by amplicon sequencing. \ No newline at end of file diff --git a/docs/intro.md b/docs/02_introduction.md similarity index 69% rename from docs/intro.md rename to docs/02_introduction.md index 6a8cefe..408d509 100644 --- a/docs/intro.md +++ b/docs/02_introduction.md @@ -1,5 +1,3 @@ -## Introduction - This [Nextflow](https://www.nextflow.io/) workflow provides a simple way to analyse Oxford Nanopore reads generated from amplicons. @@ -7,8 +5,7 @@ The workflow requires raw reads in FASTQ format and can be run in two modes: * Variant calling mode: Trigger this mode by passing a reference FASTA file. 
After initial filtering (based on read length and quality) and adapter trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the - reads to the reference (please note that the reference should only contain the - expected sequences of the individual amplicons). Variants are then called with + reads to the reference. Variants are then called with [Medaka](https://github.com/nanoporetech/medaka). This mode allows for multiple amplicons per barcode (for details on how to map specific target amplicons to individual samples / barcodes, see below). @@ -19,6 +16,6 @@ The workflow requires raw reads in FASTQ format and can be run in two modes: [Medaka](https://github.com/nanoporetech/medaka). Please note that only one amplicon per barcode is supported in de-novo consensus mode. -The results of the workflow include an interactive HTML report, FASTA files with -the consensus sequences of the amplicons, BAM files with the alignments, and VCF -files containing the variants (if run in variant calling mode). +> Note: This workflow is *not* intended for marker gene sequencing of mixtures / communities of different organisms (e.g. 16S sequencing). +> In de-novo consensus mode it expects a single amplicon per barcode. +> When running in variant calling mode, multiple amplicons per barcode can be processed, but their sequences need to be sufficiently different from each other so that most reads only align to one of the provided references. diff --git a/docs/03_compute_requirements.md b/docs/03_compute_requirements.md new file mode 100644 index 0000000..9c30fb8 --- /dev/null +++ b/docs/03_compute_requirements.md @@ -0,0 +1,13 @@ +Recommended requirements: + ++ CPUs = 12 ++ Memory = 32GB + +Minimum requirements: + ++ CPUs = 6 ++ Memory = 16GB + +Approximate run time: 0.5-5 minutes per sample (depending on number of reads, length of amplicons, and available compute). 
+
+ARM processor support: True
diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md
new file mode 100644
index 0000000..6d24a13
--- /dev/null
+++ b/docs/04_install_and_run.md
@@ -0,0 +1,35 @@
+These are instructions to install and run the workflow on command line. You can also access the workflow via the [EPI2ME application](https://labs.epi2me.io/downloads/).
+
+The workflow uses [Nextflow](https://www.nextflow.io/) to manage compute and software resources, therefore Nextflow will need to be installed before attempting to run the workflow.
+
+The workflow can currently be run using either [Docker](https://www.docker.com/products/docker-desktop) or
+[Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) to provide isolation of
+the required software. Both methods are automated out-of-the-box provided
+either Docker or Singularity is installed. This is controlled by the [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) parameter as exemplified below.
+
+It is not required to clone or download the git repository in order to run the workflow.
+More information on running EPI2ME workflows can be found on our [website](https://labs.epi2me.io/wfindex).
+
+The following command can be used to obtain the workflow. This will pull the repository in to the assets folder of Nextflow and provide a list of all parameters available for the workflow as well as an example command:
+
+```
+nextflow run epi2me-labs/wf-amplicon --help
+```
+
+A demo dataset is provided for testing of the workflow. 
It can be downloaded using: + +``` +wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-amplicon/wf-amplicon-demo.tar.gz +tar -xzvf wf-amplicon-demo.tar.gz +``` + +The workflow can be run with the demo data using: + +``` +nextflow run epi2me-labs/wf-amplicon \ + --fastq wf-amplicon-demo/fastq \ + --reference wf-amplicon-demo/reference.fa \ + -profile standard +``` + +For further information about running a workflow on the command line see https://labs.epi2me.io/wfquickstart/ diff --git a/docs/05_related_protocols.md b/docs/05_related_protocols.md new file mode 100644 index 0000000..796f0ed --- /dev/null +++ b/docs/05_related_protocols.md @@ -0,0 +1,3 @@ +This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. + +Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). diff --git a/docs/06_inputs.md b/docs/06_inputs.md new file mode 100644 index 0000000..b4e2844 --- /dev/null +++ b/docs/06_inputs.md @@ -0,0 +1,73 @@ +### Input Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| fastq | string | FASTQ files to use in the analysis. | This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | +| analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. 
| If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | +| reference | string | Path to a reference FASTA file. | The reference file should contain one sequence per amplicon. | | + + +### Sample Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| sample_sheet | string | A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`. | | +| sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | + + +### Pre-processing Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| min_read_length | integer | Shorter reads will be removed. | | 300 | +| max_read_length | integer | Longer reads will be removed. | | | +| min_read_qual | number | Reads with a lower mean quality will be removed. | | 10 | +| drop_frac_longest_reads | number | Drop fraction of longest reads. | The very longest reads might be concatemers or contain other artifacts. In many cases removing them simplifies de novo consensus generation. | 0.05 | +| take_longest_remaining_reads | boolean | Whether to use the longest (remaining) reads. | With this option, reads are not randomly selected during downsampling (potentially after the longest reads have been removed), but instead the longest remaining reads are taken. This generally improves performance on long amplicons. 
| True | +| reads_downsampling_size | integer | Downsample to this number of reads per sample. | Downsampling is performed after filtering. Set to 0 to disable downsampling. | 0 | +| min_n_reads | number | Samples / barcodes with fewer reads will not be processed. | | 40 | + + +### Variant Calling Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| min_coverage | integer | Minimum coverage for variants to keep. | Only variants covered by more than this number of reads are reported in the resulting VCF file. | 20 | +| basecaller_cfg | string | Name of the basecaller model that processed the signal data; used to select an appropriate Medaka model. | The basecaller configuration is used to automatically select the appropriate Medaka model. The automatic selection can be overridden with the 'medaka_model' parameters. Available models are: 'dna_r10.4.1_e8.2_400bps_hac@v3.5.2', 'dna_r10.4.1_e8.2_400bps_sup@v3.5.2', 'dna_r9.4.1_e8_fast@v3.4', 'dna_r9.4.1_e8_hac@v3.3', 'dna_r9.4.1_e8_sup@v3.3', 'dna_r10.4.1_e8.2_400bps_hac_prom', 'dna_r9.4.1_450bps_hac_prom', 'dna_r10.3_450bps_hac', 'dna_r10.3_450bps_hac_prom', 'dna_r10.4.1_e8.2_260bps_hac', 'dna_r10.4.1_e8.2_260bps_hac_prom', 'dna_r10.4.1_e8.2_400bps_hac', 'dna_r9.4.1_450bps_hac', 'dna_r9.4.1_e8.1_hac', 'dna_r9.4.1_e8.1_hac_prom'. | dna_r10.4.1_e8.2_400bps_sup@v4.2.0 | +| medaka_model | string | The name of the Medaka model to use. This will override the model automatically chosen based on the provided basecaller configuration. | The workflow will attempt to map the basecaller model (provided with 'basecaller_cfg') used to a suitable Medaka model. You can override this by providing a model with this option instead. 
| | + + +### De-novo Consensus Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| force_spoa_length_threshold | integer | Consensus length below which to force SPOA consensus generation. | If the consensus generated by `miniasm` is shorter than this value, force consensus generation with SPOA (regardless of whether the sequence produced by `miniasm` passed QC or not). The rationale for this parameter is that `miniasm` sometimes gives slightly truncated assemblies for short amplicons from RBK data, whereas SPOA tends to be more robust in this regard. | 2000 | +| spoa_minimum_relative_coverage | number | Minimum coverage (relative to the number of reads per sample after filtering) when constructing the consensus with SPOA. | Needs to be a number between 0.0 and 1.0. The result of multiplying this number with the number of reads for the corresponding sample (after filtering) is passed on to SPOA's `--min-coverage` option. | 0.15 | +| minimum_mean_depth | integer | Mean depth threshold to pass consensus quality control. | Draft consensus sequences with a lower average depth of coverage after re-aligning the input reads will fail QC. | 30 | +| primary_alignments_threshold | number | Fraction of primary alignments to pass quality control. | Draft consensus sequences with a lower fraction of primary alignments after re-aligning the input reads will fail QC. | 0.7 | + + +### Output Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| out_dir | string | Directory for output of all workflow results. | | output | +| combine_results | boolean | Whether to merge per-sample results into a single BAM / VCF file. | Per default, results are grouped per sample. With this option, an additional BAM and VCF file are produced which contain the alignments / variants for all samples and amplicons. 
| False | + + +### Advanced Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| number_depth_windows | integer | Number of windows used during depth of coverage calculations. | Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained plots at the expense of run time. | 100 | +| medaka_target_depth_per_strand | integer | Downsample each amplicon to this per-strand depth before running Medaka. | Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance. | 75 | + + +### Miscellaneous Options + +| Nextflow parameter name | Type | Description | Help | Default | +|--------------------------|------|-------------|------|---------| +| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the maximum number of CPU threads for such processes. The total CPU resources used by the workflow are constrained by the executor configuration in `nextflow.config`. | 4 | +| disable_ping | boolean | Enable to prevent sending a workflow ping. | | False | + + diff --git a/docs/07_outputs.md b/docs/07_outputs.md new file mode 100644 index 0000000..f851123 --- /dev/null +++ b/docs/07_outputs.md @@ -0,0 +1,12 @@ +Outputs files may be aggregated including information for all samples or provided per sample. Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 
+
+| Title | File path | Description | Per sample or aggregated |
+|-------|-----------|-------------|--------------------------|
+| workflow report | ./wf-amplicon-report.html | Report for all samples. | aggregated |
+| Sanitized reference file | ./reference_sanitized_seqIDs.fasta | Some programs used by the workflow don't like special characters (like colons) in the sequence IDs in the reference FASTA file. The reference is thus "sanitized" by replacing these characters with underscores. This file is only generated when the workflow is run in variant calling mode. | aggregated |
+| Alignments BAM file | ./{{ alias }}/alignments/aligned.sorted.bam | BAM file with alignments of input reads against the references (in variant calling mode) or the created consensus (in de-novo consensus mode). | per-sample |
+| Alignments index file | ./{{ alias }}/alignments/aligned.sorted.bam.bai | Index for alignments BAM file. | per-sample |
+| De-novo consensus FASTQ file | ./{{ alias }}/consensus/consensus.fastq | Consensus file generated by de-novo consensus pipeline. | per-sample |
+| Consensus FASTA file | ./{{ alias }}/consensus/consensus.fasta | Consensus file generated by the variant calling pipeline. | per-sample |
+| Variants VCF file | ./{{ alias }}/variants/medaka.annotated.vcf.gz | VCF file of variants detected against the provided reference. | per-sample |
+| Variants index file | ./{{ alias }}/variants/medaka.annotated.vcf.gz.csi | Index for variants VCF file. | per-sample |
diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md
new file mode 100644
index 0000000..6a771b4
--- /dev/null
+++ b/docs/08_pipeline_overview.md
@@ -0,0 +1,75 @@
+> Note: The initial preprocessing steps are the same for the variant calling and the de-novo consensus mode.
+
+#### 1. Read preprocessing
+
+The input data can be a single FASTQ file, a directory containing FASTQ files, or a directory containing barcoded sub-directories which in turn contain FASTQ files. 
+A sample sheet can be included with `--sample_sheet` and a sample name for an individual sample with `--sample`. +If a sample sheet is provided, it needs to contain at least these three columns: + +- `barcode`: names of the sub-directories with the FASTQ files of the different + samples +- `alias`: sample names +- `type`: sample types (needs to be one of `test_sample`, `positive_control`, + `negative_control`, `no_template_control`) + +After parsing the sample sheet, the raw reads are filtered. Relevant options for filtering are + +- `--min_read_length` +- `--max_read_length` +- `--min_read_qual` + +Reads can optionally also be downsampled (`--reads_downsampling_size` controls the number of reads to keep per sample). +The workflow supports random downsampling as well as selecting reads based on read length. +Subsetting based on read length tends to work better for de-novo consensus generation and usually does not decrease performance of the variant calling mode. +It is thus set as default (see the FAQ section below for details and for how to disable this). +The selected reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) prior to downstream analysis. + +> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are ignored. + + +### Variant calling mode + +#### 2. Align reads + +Preprocessed reads are aligned against the provided reference with [Minimap2](https://github.com/lh3/minimap2) (please note that the reference should only contain the expected sequences of the individual amplicons). +[Bamstats](https://github.com/epi2me-labs/fastcat#bamstats) is used to collate alignment statistics and [mosdepth](https://github.com/brentp/mosdepth) to calculate the depth of coverage along the amplicon. + +#### 3. Call variants + +After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). 
+You can set the minimum coverage a variant needs to exceed in order to be included in the results with `--min_coverage`. +Variants with lower coverage will still be listed in the resulting VCF files, but with `LOW_DEPTH` instead of `PASS` in the `FILTER` column. + +The workflow selects the appropriate [Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller configuration that was used to process the signal data. +You can use the parameter `--basecaller_cfg` to provide this information (e.g. `dna_r10.4.1_e8.2_400bps_hac`). +Alternatively, you can choose the [Medaka](https://github.com/nanoporetech/medaka) model directly with `--medaka_model`. + +#### 4. Use the variants to generate a consensus + +The variants passing the depth filter are then incorporated in the reference to create the consensus sequence. + +### De-novo consensus mode + +#### 2. Create a draft consensus + +When running in de-novo mode, [miniasm](https://github.com/lh3/miniasm) or [SPOA](https://github.com/rvaser/spoa) are used to create a draft consensus after the reads were selected and trimmed. +`miniasm` tends to work better for longer amplicons and `spoa` for shorter amplicons. +The workflow attempts to create an assembly with `miniasm` first. +If this succeeds, the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and the input reads are re-aligned against it. +This is followed by some post-processing (trimming low-coverage regions from the ends) and quality control (QC). +During QC, the following sanity-checks are performed: + +- The mean coverage of the consensus needs to be above `--minimum_mean_depth` (default: 30X). +- The fraction of primary alignments for the re-aligned reads needs to be more than + `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / + supplementary alignments indicate that something might have gone wrong during + assembly. 
If your amplicons contain long repetitive regions, you can lower + this number. + +If more than one contig from the draft assembly passes these filters, the one with the highest coverage is selected. + +If assembly generation fails or none of the contigs produced by `miniasm` pass QC, `spoa` is tried. +Regardless of how the draft consensus was generated (`miniasm` or `spoa`), a final polishing step with +[Medaka](https://github.com/nanoporetech/medaka) is performed. + +> Note: Since `spoa` tends to produce a better consensus than `miniasm` for short amplicons, users can force the workflow to run `spoa` if the assembly created by `miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes QC). diff --git a/docs/09_troubleshooting.md b/docs/09_troubleshooting.md new file mode 100644 index 0000000..b4bc407 --- /dev/null +++ b/docs/09_troubleshooting.md @@ -0,0 +1,2 @@ ++ If the workflow fails please run it with the demo data set to ensure the workflow itself is working. This will help us determine if the issue is related to the environment, input parameters or a bug. ++ Please see [here](https://labs.epi2me.io/trouble-shooting/) for how to resolve some common Nextflow issues and [here](https://labs.epi2me.io/how-to-exits/) for how to interpret command exit codes. diff --git a/docs/10_FAQ.md b/docs/10_FAQ.md new file mode 100644 index 0000000..f6360a6 --- /dev/null +++ b/docs/10_FAQ.md @@ -0,0 +1,32 @@ +*Variant calling mode: How can I specify specific reference sequences for individual barcodes?* - Per default, the workflow aligns all barcodes against all sequences in the supplied reference. +If you want to map specific reference sequences to specific amplicons, you can add a column named `ref` containing the respective target reference IDs to the sample sheet. +If a barcode should be aligned against multiple reference sequences, their IDs can be included as a space-delimited list. 
+If a specific sample should be aligned against all references, you can leave the corresponding cell in the `ref` column blank.
+Consider the following example with 4 barcodes and 2 sequences in the reference file. The sample sheet tells the workflow that the first two barcodes (`sample1` and `sample2`) are expected to contain
+reads from both amplicons in the reference, whereas the other two samples only contain one amplicon each.
+
+```
+barcode,alias,type,ref
+barcode01,sample1,test_sample,katG::NC_000962.3:2154725-2155670 rpoB::NC_000962.3:760285-761376
+barcode02,sample2,test_sample,
+barcode03,sample3,positive_control,katG::NC_000962.3:2154725-2155670
+barcode04,sample4,test_sample,rpoB::NC_000962.3:760285-761376
+```
+
+*How much coverage do I need?* - We recommend >150X average coverage across the individual amplicons.
+1500 reads per amplicon should thus be enough in the vast majority of cases.
+You can speed up execution time by setting `--reads_downsampling_size 1500` (or to a smaller number if your amplicons are not longer than a few kb).
+
+*Why does the workflow select reads based on length rather than random downsampling per default?* - Despite the workflow dropping reads containing adapter sequences in the middle (using `porechop --discard_middle`), some reads in the input data stemming from concatemers or other PCR artifacts might still make it through preprocessing and could thus interfere with consensus generation.
+A simple way to avoid this is to drop a small fraction (e.g. 5%) of longest reads.
+Furthermore, `spoa` depends on the order of reads in the input and benefits from "seeing" longer reads first.
+Therefore, the following workflow options are used per default `--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`.
+This essentially drops the longest 5% of reads and then selects the next longest reads (either all of them or the number specified with `--reads_downsampling_size`). 
+To disable the default behaviour and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 --take_longest_remaining_reads false`.
+
+*Is there a difference between the consensus sequences created in variant calling and de-novo consensus mode?* - In variant calling mode, the consensus sequence is generated by applying the variants found by [Medaka](https://github.com/nanoporetech/medaka) to the provided reference.
+This comes with the caveat that structural variants too large to be detected by Medaka are not going to be represented in the consensus.
+For this reason we suggest to always have a look at the coverage plots in the generated HTML report.
+In de-novo consensus mode, however, the consensus is generated either via assembly or [POA](https://doi.org/10.1093/bioinformatics/btg109) and no prior information besides the reads is taken into account.
+
+If your question is not answered here, please report any issues or suggestions on the [github issues](https://github.com/epi2me-labs/wf-amplicon/issues) page or start a discussion on the [community](https://community.nanoporetech.com/).
diff --git a/docs/11_other.md b/docs/11_other.md
new file mode 100644
index 0000000..53b1130
--- /dev/null
+++ b/docs/11_other.md
@@ -0,0 +1,5 @@
+## Related blog posts
+
+- [How to align your data](https://labs.epi2me.io/how-to-align/)
+
+See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. 
diff --git a/docs/header.md b/docs/header.md deleted file mode 100644 index 90d29c3..0000000 --- a/docs/header.md +++ /dev/null @@ -1,3 +0,0 @@ -# wf-amplicon - - diff --git a/docs/links.md b/docs/links.md deleted file mode 100644 index 5814961..0000000 --- a/docs/links.md +++ /dev/null @@ -1,11 +0,0 @@ - -## Useful links - -- [Nextflow](https://www.nextflow.io/) -- [Docker](https://www.docker.com/products/docker-desktop) -- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) -- [minimap2](https://github.com/lh3/minimap2) -- [miniasm](https://github.com/lh3/miniasm) -- [SPOA](https://github.com/rvaser/spoa) -- [Medaka](https://github.com/nanoporetech/medaka) -- [Porechop](https://github.com/rrwick/Porechop) diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 37be1e9..0000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,176 +0,0 @@ -## Quickstart - -The workflow relies on the following dependencies: - -- [Nextflow](https://www.nextflow.io/) for managing compute and software - resources. -- Either [Docker](https://www.docker.com/products/docker-desktop) or - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) to provide - isolation of the required software. - -It is not necessary to clone or download the git repository in order to run the -workflow. For more information on running EPI2ME Labs workflows, [visit our -website](https://labs.epi2me.io/wfindex). - -**Workflow options** - -If you have [Nextflow](https://www.nextflow.io/) installed, you can run the -following to obtain the workflow: - -``` -nextflow run epi2me-labs/wf-amplicon --help -``` - -This will show the workflow's command line options. - -### Usage - -This section covers how to use the workflow when providing a reference file with -the expected sequences of the amplicons ("variant calling mode"). For details on -how to run the workflow without a reference ("de-novo consensus mode"), see the -relevant section below. 
- -Basic usage is as follows - -```bash -nextflow run epi2me-labs/wf-amplicon \ - --fastq $input \ - --reference references.fa \ - --threads 4 -``` - -`$input` can be a single FASTQ file, a directory containing FASTQ files, or a -directory containing barcoded sub-directories which in turn contain FASTQ files. -A sample sheet can be included with `--sample_sheet` and a sample name for an -individual sample with `--sample`. If a sample sheet is provided, it needs to -contain at least these three columns: - -- `barcode`: names of the sub-directories with the FASTQ files of the different - samples -- `alias`: sample names -- `type`: sample types (needs to be one of `test_sample`, `positive_control`, - `negative_control`, `no_template_control`) - -Additionally, it can also contain a column named `ref`, which can be used to -specify one or more sequences in the reference FASTA file for the individual -samples. If a sample should be aligned against multiple reference sequences, -their IDs can be included as a space-delimited list. If a specific sample should -be aligned against all references, you can leave the corresponding cell in the -`ref` column blank. As an example, the following sample sheet tells the workflow -that the first two barcodes (`sample1` and `sample2`) are expected to contain -reads from both amplicons in the reference, whereas the other two samples only -contain one amplicon each. - -``` -barcode,alias,type,ref -barcode01,sample1,test_sample,katG::NC_000962.3:2154725-2155670 rpoB::NC_000962.3:760285-761376 -barcode02,sample2,test_sample, -barcode03,sample3,positive_control,katG::NC_000962.3:2154725-2155670 -barcode04,sample4,test_sample,rpoB::NC_000962.3:760285-761376 -``` - -Relevant options for filtering of raw reads are - -- `--min_read_length` -- `--max_read_length` -- `--min_read_qual` - -After initial filtering, reads can optionally be downsampled -(`--reads_downsampling_size` controls the number of reads to keep per sample). 
-The workflow supports random downsampling as well as selecting reads based on -read length. Subsetting based on read length tends to work better for de-novo -consensus generation and is thus set as default (see the section for the de-novo -consensus mode below for details and for how to disable this). However, it -should not detriment the performance of the variant-calling mode. The selected -reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) -before being aligned against the reference. - -> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are -> ignored. - -After alignment, haploid variants are called with -[Medaka](https://github.com/nanoporetech/medaka). You can set the minimum -coverage a variant needs to exceed in order to be included in the results with -`--min_coverage`. Variants with lower coverage will still be included in the -resulting VCF files, but with `LOW_DEPTH` instead of `PASS` in the `FILTER` -column. - -The workflow selects the appropriate -[Medaka models](https://github.com/nanoporetech/medaka#models) based on the basecaller -configuration that was used to process the signal data. You can use the -parameter `--basecaller_cfg` to provide this information (e.g. -`dna_r10.4.1_e8.2_400bps_hac`). Alternatively, you can choose the -[Medaka](https://github.com/nanoporetech/medaka) model directly with -`--medaka_model`. - -If you want to use -[Singularity](https://docs.sylabs.io/guides/latest/user-guide/) instead of -[Docker](https://www.docker.com/products/docker-desktop), add `-profile -singularity` to the nextflow invocation. - -### Key outputs - -- Interactive HTML report detailing the results. -- BAM files with reads aligned against the reference. -- VCF files with variants called by - [Medaka](https://github.com/nanoporetech/medaka). -- FASTA files with consensus sequences generated by applying the variants to the - provided reference. 
- -A sub-directory with these output files will be created for each sample. -Additionally, a single BAM and VCF file containing the results of all samples -will be produced if the option `--combine_results` is provided. - -### Known issues and limitations - -The consensus sequence is generated by applying the variants found by -[Medaka](https://github.com/nanoporetech/medaka) to the provided reference. This -comes with the caveat that deletions at either end of the amplicon are not going -to be reflected in the consensus (i.e. it will still contain the deleted -regions). For this reason we suggest to always have a look at the coverage plots -in the generated HTML report. - -### Running without a reference - -The workflow can also be run without a reference. It will then use -[miniasm](https://github.com/lh3/miniasm) or -[SPOA](https://github.com/rvaser/spoa) to create a draft consensus. `miniasm` -tends to work better for longer amplicons and `spoa` for shorter amplicons. The -workflow attempts to create an assembly with `miniasm` first. If this succeeds, -the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and -the input reads are re-aligned against it. This is followed by some post-processing -(trimming low-coverage regions from the ends) and quality control (QC). During QC, the following -sanity-checks are performed: - -- The mean coverage of the consensus is above `--minimum_mean_depth` (default: 30X). -- The fraction of primary alignments for the re-aligned reads is more than - `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / - supplementary alignments indicate that something might have gone wrong during - assembly. If your amplicons contain long repetitive regions, you can lower - this number. - -If more than one contig from the draft assembly passes these filters, the one -with the highest coverage is selected. 
- -If assembly generation fails or none of the contigs produced by `miniasm` pass -QC, `spoa` is tried. Regardless of how the draft consensus was generated -(`miniasm` or `spoa`), a final polishing step with -[Medaka](https://github.com/nanoporetech/medaka) is performed. - -Note: Since `spoa` tends to produce a better consensus than `miniasm` for short -amplicons, users can force the workflow to run `spoa` if the assembly created by -`miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes -QC). - -**Random downsampling vs selecting reads by length:** - -Despite the workflow dropping reads containing adapter sequences in the middle -(with `porechop --discard_middle`), some reads in the input data stemming from -concatemers or other PCR artifacts might still make it through preprocessing and -could thus interfere with consensus generation. A simple way to avoid this is to -drop a small fraction (e.g. 5%) of longest reads. Additionally, `spoa` depends -on the order of reads in the input and benefits from "seeing" longer reads -first. Therefore, the following options are used per default -`--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`. To disable this -and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 ---take_longest_remaining_reads false`. \ No newline at end of file diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 3b29be1..81fdc2e 100644 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -141,7 +141,7 @@ class NfcoreSchema { for (specifiedParam in params.keySet()) { // nextflow params if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'" + log.error "You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" has_error = true } // unexpected params @@ -180,7 +180,7 @@ class NfcoreSchema { schema.validate(params_json) } catch (ValidationException e) { println '' - log.error 'ERROR: Validation of pipeline parameters failed!' + log.error 'Validation of pipeline parameters failed!' JSONObject exceptionJSON = e.toJSON() HashSet observed_exceptions = [] printExceptions(exceptionJSON, params_json, log, enums, raw_schema, observed_exceptions) diff --git a/lib/ingress.nf b/lib/ingress.nf index 5839730..b5d8dfa 100644 --- a/lib/ingress.nf +++ b/lib/ingress.nf @@ -240,6 +240,7 @@ process checkBamHeaders { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: tuple val(meta), path("input_dir/reads*.bam") output: // set the two env variables by `eval`-ing the output of the python script @@ -257,6 +258,7 @@ process mergeBams { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") shell: @@ -271,6 +273,7 @@ process catSortBams { label "ingress" label "wf_common" cpus 4 + memory "4 GB" input: tuple val(meta), path("input_bams/reads*.bam") output: tuple val(meta), path("reads.bam") script: @@ -285,6 +288,7 @@ process sortBam { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: tuple val(meta), path("reads.sorted.bam") script: @@ -298,6 +302,7 @@ process bamstats { label "ingress" label "wf_common" cpus 3 + memory "4 GB" input: tuple val(meta), path("reads.bam") output: @@ -414,6 +419,7 @@ process move_or_compress_fq_file { label "ingress" label "wf_common" cpus 1 + memory "2 GB" input: // don't stage `input` with a literal because we check the file extension tuple val(meta), path(input) @@ -439,6 +445,7 @@ process fastcat { label "ingress" label "wf_common" cpus 3 + memory "2 GB" input: tuple val(meta), path("input") val extra_args @@ -737,6 +744,7 @@ process validate_sample_sheet 
{ cpus 1 label "ingress" label "wf_common" + memory "2 GB" input: path "sample_sheet.csv" val required_sample_types diff --git a/main.nf b/main.nf index 8258865..90ab7b4 100644 --- a/main.nf +++ b/main.nf @@ -16,6 +16,7 @@ OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE") process getVersions { label "wfamplicon" cpus 1 + memory "2 GB" output: path "versions.txt" script: """ @@ -41,6 +42,7 @@ process getVersions { process addMedakaToVersionsFile { label "medaka" cpus 1 + memory "2 GB" input: path "old_versions.txt" output: path "versions.txt" script: @@ -53,6 +55,7 @@ process addMedakaToVersionsFile { process getParams { label "wfamplicon" cpus 1 + memory "2 GB" output: path "params.json" script: @@ -66,6 +69,7 @@ process getParams { process downsampleReads { label "wfamplicon" cpus Math.min(params.threads, 3) + memory "2 GB" input: tuple val(meta), path("reads.fastq.gz") val n_reads @@ -88,6 +92,7 @@ Selected reads will always be sorted by length. process subsetReads { label "wfamplicon" cpus 1 + memory "2 GB" input: tuple val(meta), path("reads.fastq.gz"), path("fastcat-stats") val drop_longest_frac @@ -122,7 +127,15 @@ process subsetReads { process porechop { label "wfamplicon" cpus Math.min(params.threads, 5) - input: tuple val(meta), path("reads.fastq.gz") + memory { + // `porechop` quite annoyingly loads the whole FASTQ into memory + // (https://github.com/rrwick/Porechop/issues/77) and uses up to 4x the size of + // the `.fastq.gz` file (depending on compression ratio); we give another factor + // of 2 for extra margin to be on the safe side + def fastq_size = fastq.size() + fastq_size > 2e9 ? "32 GB" : (fastq_size > 2.5e8 ? 
"16 GB" : "2 GB") + } + input: tuple val(meta), path(fastq, stageAs: "reads.fastq.gz") output: tuple val(meta), path("porechopped.fastq.gz"), emit: seqs tuple val(meta), path("porechopped-per-file-stats.tsv"), emit: stats @@ -131,9 +144,15 @@ process porechop { // run fastcat on porechopped reads so that we can include the post-trimming stats // in the report """ - fastcat <(porechop -i reads.fastq.gz -t $porechop_threads --discard_middle) \ - -s $meta.alias \ - -f porechopped-per-file-stats.tsv \ + ( + # only run porechop if the FASTQ isn't empty + if [[ -n \$(zcat reads.fastq.gz | head -c1) ]]; then + porechop -i reads.fastq.gz -t $porechop_threads --discard_middle + else + echo -n + fi + ) \ + | fastcat /dev/stdin -s $meta.alias -f porechopped-per-file-stats.tsv \ | bgzip > porechopped.fastq.gz """ } @@ -141,6 +160,7 @@ process porechop { process collectFilesInDir { label "wfamplicon" cpus 1 + memory "2 GB" input: tuple path("staging_dir/*"), val(dirname) output: path(dirname) script: @@ -152,6 +172,7 @@ process collectFilesInDir { process concatTSVs { label "wfamplicon" cpus 1 + memory "2 GB" input: tuple val(meta), path("input/f*.tsv") val fname @@ -166,6 +187,7 @@ process concatTSVs { process makeReport { label "wfamplicon" cpus 1 + memory "8 GB" input: path "data/*" path ref, stageAs: "ref/*" @@ -200,6 +222,7 @@ process output { // publish inputs to output directory label "wfamplicon" cpus 1 + memory "2 GB" publishDir ( params.out_dir, mode: "copy", diff --git a/modules/local/common.nf b/modules/local/common.nf index 2bb0e7a..f444aa6 100644 --- a/modules/local/common.nf +++ b/modules/local/common.nf @@ -1,6 +1,7 @@ process alignReads { label "wfamplicon" cpus params.threads + memory "4 GB" input: tuple val(meta), path("reads.fastq.gz"), path("reference.fasta") output: tuple val(meta), path("*.bam"), path("*.bai") script: @@ -16,6 +17,7 @@ process alignReads { process bamstats { label "wfamplicon" cpus Math.min(params.threads, 2) + memory "2 GB" input: tuple 
val(meta), path("input.bam"), path("input.bam.bai") output: tuple val(meta), path("bamstats.tsv"), path("bamstats-flagstat.tsv") script: @@ -28,6 +30,7 @@ process bamstats { process mosdepth { label "wfamplicon" cpus Math.min(params.threads, 3) + memory "4 GB" input: tuple val(meta), path("input.bam"), path("input.bam.bai"), val(ref_id) val n_windows @@ -69,6 +72,7 @@ process mosdepth { process concatMosdepthResultFiles { label "wfamplicon" cpus 1 + memory "2 GB" input: tuple val(meta), path("depth.*.bed.gz") output: tuple val(meta), path("per-window-depth.tsv.gz") script: @@ -82,6 +86,8 @@ process concatMosdepthResultFiles { process lookupMedakaModel { label "wfamplicon" + cpus 1 + memory "2 GB" input: path("lookup_table") val basecall_model diff --git a/modules/local/de-novo.nf b/modules/local/de-novo.nf index 54ef2bf..c108b5f 100644 --- a/modules/local/de-novo.nf +++ b/modules/local/de-novo.nf @@ -15,6 +15,7 @@ SPOA. process spoa { label = "medaka" cpus 1 + memory "8 GB" input: tuple val(meta), path("reads.fastq.gz") output: tuple val(meta), path("reads.fastq.gz"), path("asm.fasta") script: @@ -38,6 +39,7 @@ the draft asssembly. If none of the assembled contigs is longer than process miniasm { label = "wfamplicon" cpus params.threads + memory "8 GB" input: tuple val(meta), path("reads.fastq.gz") output: tuple val(meta), path("reads.fastq.gz"), path("asm.fasta"), env(STATUS) script: @@ -71,6 +73,7 @@ too aggressively. We trim the sequences downstream instead. process racon { label = "wfamplicon" cpus params.threads + memory "8 GB" input: tuple val(meta), path("reads.fastq.gz"), path("draft.fasta") output: tuple val(meta), path("reads.fastq.gz"), path("polished.fasta") script: @@ -96,6 +99,7 @@ Polish draft consensus with `medaka`. 
Also, emit the polished sequences as FASTQ process medakaConsensus { label "medaka" cpus Math.min(params.threads, 3) + memory "8 GB" input: tuple val(meta), path("input.bam"), path("input.bam.bai"), path("draft.fasta") val medaka_model @@ -117,6 +121,7 @@ not for the plots in the report. process mosdepthPerBase { label "wfamplicon" cpus Math.min(params.threads, 3) + memory "4 GB" input: tuple val(meta), path("input.bam"), path("input.bam.bai") output: tuple val(meta), path("depth.per-base.bed.gz") script: @@ -136,6 +141,7 @@ this is expected by the report code. process mosdepthWindows { label "wfamplicon" cpus Math.min(params.threads, 3) + memory "4 GB" input: tuple val(meta), path("input.bam"), path("input.bam.bai") val n_windows @@ -193,6 +199,7 @@ process trimAndQC { label "wfamplicon" cpus params.threads cpus 1 + memory "2 GB" input: tuple val(meta), path("consensus.fastq"), diff --git a/modules/local/variant-calling.nf b/modules/local/variant-calling.nf index 1e91060..2c1d7fd 100644 --- a/modules/local/variant-calling.nf +++ b/modules/local/variant-calling.nf @@ -13,6 +13,7 @@ process sanitizeRefFile { // them (and whitespace) with underscores label "wfamplicon" cpus 1 + memory "2 GB" input: path "reference.fasta" output: path "reference_sanitized_seqIDs.fasta" script: @@ -24,6 +25,7 @@ process sanitizeRefFile { process subsetRefFile { label "wfamplicon" cpus 1 + memory "2 GB" input: tuple val(metas), path("reference.fasta"), val(target_seqs) output: tuple val(metas), path("reference_subset.fasta"), val(target_seqs) script: @@ -38,6 +40,7 @@ process subsetRefFile { process downsampleBAMforMedaka { label "wfamplicon" cpus 1 + memory "8 GB" input: tuple val(meta), path("input.bam"), path("input.bam.bai"), path("bamstats.tsv") output: tuple val(meta), path("downsampled.bam"), path("downsampled.bam.bai") @@ -56,6 +59,7 @@ process downsampleBAMforMedaka { process medakaConsensus { label "medaka" cpus Math.min(params.threads, 2) + memory "8 GB" input: tuple 
val(meta), path("input.bam"), path("input.bam.bai"), val(reg) val medaka_model @@ -70,6 +74,7 @@ process medakaConsensus { process medakaVariant { label "medaka" cpus 1 + memory "8 GB" input: tuple val(meta), path("consensus_probs*.hdf"), @@ -109,6 +114,7 @@ process medakaVariant { process mergeVCFs { label "medaka" cpus 1 + memory "4 GB" input: path "VCFs/file*.vcf.gz" output: tuple path("combined.vcf.gz"), path("combined.vcf.gz.csi") script: @@ -125,6 +131,7 @@ process mergeVCFs { process mergeBAMs { label "wfamplicon" cpus 1 + memory "16 GB" input: path "BAMs/file*.bam" path "indices/file*.bam.bai" diff --git a/nextflow.config b/nextflow.config index 3371657..c2b228a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -76,7 +76,7 @@ manifest { description = 'Amplicon workflow' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v0.6.2' + version = 'v1.0.0' } epi2melabs { @@ -132,7 +132,7 @@ profiles { process { executor = 'awsbatch' queue = "${params.aws_queue}" - memory = '8G' + memory = '32G' withLabel:wf_common { container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" } diff --git a/nextflow_schema.json b/nextflow_schema.json index b043ea0..cb97d3c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,6 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": "epi2me-labs/wf-amplicon", + "workflow_title": "Amplicon workflow", "description": "Nextflow workflow for analysing Oxford Nanopore reads created by amplicon sequencing.", "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-amplicon/wf-amplicon-demo.tar.gz", "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-amplicon/wf-amplicon-demo/aws.nextflow.config", @@ -314,8 +315,16 @@ "type": "boolean" } }, - "docs": { - "intro": "## Introduction\n\nThis [Nextflow](https://www.nextflow.io/) workflow provides a simple way 
to\nanalyse Oxford Nanopore reads generated from amplicons.\n\nThe workflow requires raw reads in FASTQ format and can be run in two modes:\n* Variant calling mode: Trigger this mode by passing a reference FASTA file.\n After initial filtering (based on read length and quality) and adapter\n trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the\n reads to the reference (please note that the reference should only contain the\n expected sequences of the individual amplicons). Variants are then called with\n [Medaka](https://github.com/nanoporetech/medaka). This mode allows for\n multiple amplicons per barcode (for details on how to map specific target\n amplicons to individual samples / barcodes, see below).\n* De-novo consensus mode: This mode is run when no reference file is passed.\n Like for the \"variant calling mode\", reads are first filtered and trimmed.\n Then, a consensus sequence is generated _de novo_ from the reads of each\n sample. Reads are re-aligned against the draft consensus for polishing with\n [Medaka](https://github.com/nanoporetech/medaka). 
Please note that only one\n amplicon per barcode is supported in de-novo consensus mode.\n\nThe results of the workflow include an interactive HTML report, FASTA files with\nthe consensus sequences of the amplicons, BAM files with the alignments, and VCF\nfiles containing the variants (if run in variant calling mode).\n", - "links": "\n## Useful links\n\n- [Nextflow](https://www.nextflow.io/)\n- [Docker](https://www.docker.com/products/docker-desktop)\n- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n- [minimap2](https://github.com/lh3/minimap2)\n- [miniasm](https://github.com/lh3/miniasm)\n- [SPOA](https://github.com/rvaser/spoa)\n- [Medaka](https://github.com/nanoporetech/medaka)\n- [Porechop](https://github.com/rrwick/Porechop)\n" + "resources": { + "recommended": { + "cpus": 12, + "memory": "32GB" + }, + "minimum": { + "cpus": 6, + "memory": "16GB" + }, + "run_time": "0.5-5 minutes per sample (depending on number of reads, length of amplicons, and available compute).", + "arm_support": true } } \ No newline at end of file diff --git a/output_definition.json b/output_definition.json new file mode 100644 index 0000000..c7570b2 --- /dev/null +++ b/output_definition.json @@ -0,0 +1,68 @@ +{ + "files": { + "workflow-report": { + "filepath": "./wf-amplicon-report.html", + "title": "workflow report", + "description": "Report for all samples.", + "mime-type": "text/html", + "optional": false, + "type": "aggregated" + }, + "sanitized-ref": { + "filepath": "./reference_sanitized_seqIDs.fasta", + "title": "Sanitized reference file", + "description": "Some programs used by the workflow don't like special characters (like colons) in the sequence IDs in the reference FASTA file. The reference is thus \"sanitized\" by replacing these characters with underscores. 
This file is only generated when the workflow is run in variant calling mode.", + "mime-type": "text/txt", + "optional": true, + "type": "aggregated" + }, + "alignments": { + "filepath": "./{{ alias }}/alignments/aligned.sorted.bam", + "title": "Alignments BAM file", + "description": "BAM file with alignments of input reads against the references (in variant calling mode) or the created consensus (in de-novo consensus mode).", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "alignments-index": { + "filepath": "./{{ alias }}/alignments/aligned.sorted.bam.bai", + "title": "Alignments index file", + "description": "Index for alignments BAM file.", + "mime-type": "application/octet-stream", + "optional": false, + "type": "per-sample" + }, + "consensus-fastq": { + "filepath": "./{{ alias }}/consensus/consensus.fastq", + "title": "De-novo consensus FASTQ file", + "description": "Consensus file generated by de-novo consensus pipeline.", + "mime-type": "text/txt", + "optional": true, + "type": "per-sample" + }, + "consensus-fasta": { + "filepath": "./{{ alias }}/consensus/consensus.fasta", + "title": "Consensus FASTA file", + "description": "Consensus file generated by the variant calling pipeline.", + "mime-type": "text/txt", + "optional": true, + "type": "per-sample" + }, + "variants": { + "filepath": "./{{ alias }}/variants/medaka.annotated.vcf.gz", + "title": "Variants VCF file", + "description": "VCF file of variants detected against the provided reference.", + "mime-type": "application/gzip", + "optional": true, + "type": "per-sample" + }, + "variants-index": { + "filepath": "./{{ alias }}/variants/medaka.annotated.vcf.gz.csi", + "title": "Variants index file", + "description": "Index for variants VCF file.", + "mime-type": "application/gzip", + "optional": true, + "type": "per-sample" + } + } +}