diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e66a6de..17bff99 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,7 +24,7 @@ docker-run: # the variables, but it is broken when using long strings! See CW-756 parallel: matrix: - - MATRIX_NAME: ["ref", "ref-sample_sheet", "filter-all", "no-ref"] + - MATRIX_NAME: ["ref", "ref-sample_sheet", "filter-all", "de-novo"] rules: # NOTE As we're overriding the rules block for the included docker-run # we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run @@ -36,8 +36,8 @@ docker-run: # use default workflow opts defined above NF_PROCESS_FILES: > main.nf - modules/local/reference-based.nf - NF_IGNORE_PROCESSES: downsampleReads,subsetRefFile + modules/local/variant-calling.nf + NF_IGNORE_PROCESSES: downsampleReads,subsetReads,subsetRefFile,concatTSVs - if: $MATRIX_NAME == "ref-sample_sheet" variables: NF_WORKFLOW_OPTS: > @@ -49,7 +49,8 @@ docker-run: --combine_results NF_PROCESS_FILES: > main.nf - modules/local/reference-based.nf + modules/local/variant-calling.nf + NF_IGNORE_PROCESSES: downsampleReads,concatTSVs - if: $MATRIX_NAME == "filter-all" variables: NF_WORKFLOW_OPTS: > @@ -59,17 +60,18 @@ docker-run: --combine_results NF_PROCESS_FILES: > main.nf - modules/local/reference-based.nf - NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth,subsetRefFile + modules/local/variant-calling.nf + NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,subsetReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth,subsetRefFile,concatTSVs AFTER_NEXTFLOW_CMD: | grep 'No reads left after pre-processing' .nextflow.log && grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html - - if: $MATRIX_NAME == "no-ref" + - if: $MATRIX_NAME == "de-novo" variables: NF_WORKFLOW_OPTS: > - --fastq test_data/fastq/barcode04 + --fastq test_data/fastq-denovo + --drop_frac_longest_reads 0.05 NF_PROCESS_FILES: > main.nf - modules/local/smolecule.nf - NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads + modules/local/de-novo.nf + NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,racon diff --git a/CHANGELOG.md b/CHANGELOG.md index ec8a90f..5389bd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [unreleased] +## [v0.6.0] +### Added +- Assembly step to de-novo consensus mode to handle longer amplicons. README was updated accordingly. + ### Removed - Default local executor CPU and RAM limits diff --git a/README.md b/README.md index 1699a89..9427387 100644 --- a/README.md +++ b/README.md @@ -11,20 +11,24 @@ This [Nextflow](https://www.nextflow.io/) workflow provides a simple way to analyse Oxford Nanopore reads generated from amplicons. The workflow requires raw reads in FASTQ format and can be run in two modes: -* With a reference: After initial filtering (based on read length and quality) - and adapter trimming, [minimap2](https://github.com/lh3/minimap2) is used to - align the reads to a reference FASTA file (please note that the reference - should only contain the expected sequences of the individual amplicons). - Variants are then called with - [Medaka](https://github.com/nanoporetech/medaka). 
-* Without a reference: Like for the "reference mode", reads are first filtered - and trimmed. Then, `medaka smolecule` is used to generate the consensus of the - reads of each sample _de novo_. Reads are re-aligned against the consensus to - produce coverage plots for the report. +* Variant calling mode: Trigger this mode by passing a reference FASTA file. + After initial filtering (based on read length and quality) and adapter + trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the + reads to the reference (please note that the reference should only contain the + expected sequences of the individual amplicons). Variants are then called with + [Medaka](https://github.com/nanoporetech/medaka). This mode allows for + multiple amplicons per barcode (for details on how to map specific target + amplicons to individual samples / barcodes, see below). +* De-novo consensus mode: This mode is run when no reference file is passed. + Like for the "variant calling mode", reads are first filtered and trimmed. + Then, a consensus sequence is generated _de novo_ from the reads of each + sample. Reads are re-aligned against the draft consensus for polishing with + [Medaka](https://github.com/nanoporetech/medaka). Please note that only one + amplicon per barcode is supported in de-novo consensus mode. The results of the workflow include an interactive HTML report, FASTA files with the consensus sequences of the amplicons, BAM files with the alignments, and VCF -files containing the variants (if run in "reference mode"). +files containing the variants (if run in variant calling mode). @@ -57,9 +61,9 @@ This will show the workflow's command line options. ### Usage This section covers how to use the workflow when providing a reference file with -the expected sequences of the amplicons. For details on how to run the workflow -without a reference (i.e. for _de novo_ construction of the amplicon consensus -sequence), see the relevant section below. +the expected sequences of the amplicons ("variant calling mode"). For details on +how to run the workflow without a reference ("de-novo consensus mode"), see the +relevant section below. Basic usage is as follows @@ -106,11 +110,18 @@ Relevant options for filtering of raw reads are - `--max_read_length` - `--min_read_qual` -After filtering and trimming with -[Porechop](https://github.com/rrwick/Porechop), reads can optionally be -downsampled. You can control the number of reads to keep per sample with -`--reads_downsampling_size`. Samples with fewer than `--min_n_reads` are -ignored. +After initial filtering, reads can optionally be downsampled +(`--reads_downsampling_size` controls the number of reads to keep per sample). +The workflow supports random downsampling as well as selecting reads based on +read length. Subsetting based on read length tends to work better for de-novo +consensus generation and is thus set as default (see the section for the de-novo +consensus mode below for details and for how to disable this). However, it +should not detriment the performance of the variant-calling mode. The selected +reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) +before being aligned against the reference. + +> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are +> ignored. After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). You can set the minimum @@ -157,21 +168,47 @@ in the generated HTML report. 
### Running without a reference The workflow can also be run without a reference. It will then use -`medaka smolecule` to construct a consensus sequence for each barcode. -`smolecule` relies on [SPOA](https://github.com/rvaser/spoa) and the workflow -takes advantage of the recently added option in SPOA to prune low-coverage nodes -from the partial order graph. The threshold for this is relative to the number -of reads per sample (after filtering) and can be modified with -`--spoa_minimum_relative_coverage` (e.g. set to `0.2` to require a coverage of at least -20% of reads along the whole consensus). - -[SPOA](https://github.com/rvaser/spoa) works best for LSK data. The performance -of the consensus generation may be poor for RBK data in some cases, especially -if no long reads spanning most of the amplicon were found. We therefore -recommend to only use "no reference mode" for short amplicons (4 kb or shorter). -To improve robustness on RBK, an assembly-based approach might be implemented in -the future. - +[miniasm](https://github.com/lh3/miniasm) or +[SPOA](https://github.com/rvaser/spoa) to create a draft consensus. `miniasm` +tends to work better for longer amplicons and `spoa` for shorter amplicons. The +workflow attempts to create an assembly with `miniasm` first. If this succeeds, +the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and +the input reads are re-aligned against it. This is followed by some post-processing +(trimming low-coverage regions from the ends) and quality control (QC). During QC, the following +sanity-checks are performed: + +- The mean coverage of the consensus is above `--minimum_mean_depth` (default: 30X). +- The fraction of primary alignments for the re-aligned reads is more than + `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / + supplementary alignments indicate that something might have gone wrong during + assembly. If your amplicons contain long repetitive regions, you can lower + this number. + +If more than one contig from the draft assembly passes these filters, the one +with the highest coverage is selected. + +If assembly generation fails or none of the contigs produced by `miniasm` pass +QC, `spoa` is tried. Regardless of how the draft consensus was generated +(`miniasm` or `spoa`), a final polishing step with +[Medaka](https://github.com/nanoporetech/medaka) is performed. + +Note: Since `spoa` tends to produce a better consensus than `miniasm` for short +amplicons, users can force the workflow to run `spoa` if the assembly created by +`miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes +QC). + +**Random downsampling vs selecting reads by length:** + +Despite the workflow dropping reads containing adapter sequences in the middle +(with `porechop --discard_middle`), some reads in the input data stemming from +concatemers or other PCR artifacts might still make it through preprocessing and +could thus interfere with consensus generation. A simple way to avoid this is to +drop a small fraction (e.g. 5%) of longest reads. Additionally, `spoa` depends +on the order of reads in the input and benefits from "seeing" longer reads +first. Therefore, the following options are used per default +`--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`. To disable this +and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 +--take_longest_remaining_reads false`. @@ -182,5 +219,7 @@ the future. 
- [Docker](https://www.docker.com/products/docker-desktop) - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) - [minimap2](https://github.com/lh3/minimap2) +- [miniasm](https://github.com/lh3/miniasm) +- [SPOA](https://github.com/rvaser/spoa) - [Medaka](https://github.com/nanoporetech/medaka) - [Porechop](https://github.com/rrwick/Porechop) diff --git a/bin/workflow_glue/report.py b/bin/workflow_glue/report.py index 75a2570..60de49f 100755 --- a/bin/workflow_glue/report.py +++ b/bin/workflow_glue/report.py @@ -17,9 +17,6 @@ from .util import get_named_logger, wf_parser # noqa: ABS101 -# number of points in the depth-across-amplicon line plots -N_DEPTH_WINDOWS = 100 - def argparser(): """Argument parser for entrypoint.""" @@ -43,7 +40,7 @@ def argparser(): parser.add_argument( "--reference", type=Path, - help="FASTA file with reference sequences for the individual amplicon.", + help="FASTA file with reference sequences for the individual amplicons.", ) parser.add_argument( "--sample-sheet", @@ -110,8 +107,8 @@ def main(args): def populate_report(report, metadata, datasets, ref_fasta): """Fill the report with content.""" # put whether we got a ref file into a variable - ref_mode = ref_fasta is not None - analysis_type_str = "Variant calling" if ref_mode else "Consensus generation" + de_novo = ref_fasta is None + analysis_type_str = "De-novo consensus generation" if de_novo else "Variant calling" samples = [d.sample_alias for d in datasets] # check if any samples are missing any of the required inputs and filter them out # (we will mention them in the report below); this could only happen if there were @@ -119,11 +116,19 @@ def populate_report(report, metadata, datasets, ref_fasta): bad_datasets = [d for d in datasets if not d.all_inputs_valid] datasets = [d for d in datasets if d.all_inputs_valid] ref_seqs = [] - if ref_mode: + if de_novo: + # make sure that there was only one amplicon per sample in de-novo mode + for d in datasets: + if len(d.detected_amplicons) != 1: + raise ValueError( + "Found unexpected number of amplicons " + f"in de-novo consensus mode for sample {d.sample_alias}." + ) + else: + # in variant calling mode we can have many amplicons with pysam.FastxFile(ref_fasta) as f: for entry in f: ref_seqs.append(entry.name) - # in case there are no "good" datasets, print a short notice to the report and exit # early if not datasets: @@ -143,13 +148,18 @@ def populate_report(report, metadata, datasets, ref_fasta): Therefore, only a limited report is available. Consider relaxing the filtering criteria (e.g. using a lower value for """, - html_tags.kbd("--min_read_qual"), + html_tags.code("--min_read_qual"), """) as this is sometimes caused by filtering out all reads from the input data. """, ) - # show the pre-processing stats and omit all other sections + # show the pre-processing stats and consensus QC summaries (if in de-novo mode); + # omit all other sections preprocessing_section(report, bad_datasets) + + if de_novo: + de_novo_qc_section(report, datasets + bad_datasets) + return # intro and "at a glance" stats @@ -158,11 +168,11 @@ def populate_report(report, metadata, datasets, ref_fasta): "This report contains tables and plots to help interpret the results " "of wf-amplicon. The workflow was run in" ) - if not ref_mode: - # we're in no-ref mode + if de_novo: + # we're in de-novo mode html_tags.p( intro_str, - html_tags.b("no-reference mode"), + html_tags.b("De-novo consensus mode"), """. 
The individual sections of the report summarize the outcomes of the different steps of the workflow (read filtering, consensus generation, re-mapping against the consensus for depth of coverage calculation). @@ -172,7 +182,7 @@ def populate_report(report, metadata, datasets, ref_fasta): # we're in ref mode html_tags.p( intro_str, - html_tags.b("reference mode"), + html_tags.b("Variant calling mode"), """. The individual sections of the report summarize the outcomes of the different steps of the workflow (read filtering, mapping against the reference file containing the amplicon sequences, variant @@ -182,7 +192,7 @@ def populate_report(report, metadata, datasets, ref_fasta): html_tags.p("The input data contained:") # brief summary of how many samples / refs there were for name, items in zip(["sample", "amplicon"], [samples, ref_seqs]): - if name == "amplicon" and not ref_mode: + if name == "amplicon" and de_novo: continue if len(items) > 1: name += "s" @@ -212,18 +222,18 @@ def populate_report(report, metadata, datasets, ref_fasta): tabs = Tabs() with tabs.add_dropdown_menu(): for d in datasets: - basic_summary = d.get_basic_summary_stats(ref_mode) + basic_summary = d.get_basic_summary_stats(de_novo) with tabs.add_dropdown_tab(d.sample_alias): - # add values + titles of cards for stats reported in either case - # (no-ref mode and with reference) first + # add values + titles of cards for stats reported in both modes + # (de-novo and variant calling) first stats_card_items = [ (f'{basic_summary["reads"]:,g}', "Reads"), (f'{basic_summary["bases"]:,g}', "Bases"), (basic_summary["mean_length"].round(1), "Mean length"), (basic_summary["mean_quality"].round(1), "Mean quality"), ] - if not ref_mode: - # add the no-ref-only cards + if de_novo: + # add the de-novo-only cards stats_card_items += [ (f'{basic_summary["unmapped_reads"]:,g}', "Unmapped reads"), ( @@ -268,28 +278,40 @@ def populate_report(report, metadata, datasets, ref_fasta): # show preprocessing stats preprocessing_section(report, datasets) + # brief section for QC summaries (de-novo mode only) + if de_novo: + de_novo_qc_section(report, datasets + bad_datasets) + # summarize bamstats results of all samples for the following report sections bamstats_summary = util.summarize_bamstats(datasets) - # section for alignment stats and (if in reference mode) detected variants summary - with report.add_section("Summary", "Summary"): - if not ref_mode: + # if in de-novo mode, only keep the sample amplicon combinations that had alignments + bamstats_summary.query("reads > 0", inplace=True) + + # section for alignment stats (per sample and per-amplicon if variant calling mode; + # otherwise only per-sample) + if de_novo: + with report.add_section("Re-alignment summary", "Re-align"): # only show re-alignment stats: make a single table with two rows per # sample; one with stats of successfully re-aligned reads and one with stats # of unaligned reads html_tags.p( """ The table below summarizes the main results of re-mapping the reads of - each barcode against the generated consensus. + each barcode against the generated consensus. Percentages of unmapped + reads are relative to the number of reads for that particular sample. + Other percentages are relative to the total number of reads / bases + including all samples. 
""" ) with dom_util.container() as c: - no_ref_summary_table = util.format_no_ref_summary_table( + de_novo_summary_table = util.format_de_novo_summary_table( bamstats_summary, datasets ) c.clear() - DataTable.from_pandas(no_ref_summary_table) - else: + DataTable.from_pandas(de_novo_summary_table) + else: + with report.add_section("Summary", "Summary"): with dom_util.container() as c: # create summary table with one row per sample per_sample_summary_table = util.format_per_sample_summary_table( @@ -322,7 +344,9 @@ def populate_report(report, metadata, datasets, ref_fasta): """ The two tables below (one per tab) briefly summarize the main results of mapping the reads to the provided amplicon references and subsequent - variant calling. + variant calling. Percentages of unmapped reads are relative to the + number of reads for that particular sample. Other percentages are + relative to the total number of reads / bases including all samples. """ ) tabs = Tabs() @@ -358,15 +382,16 @@ def populate_report(report, metadata, datasets, ref_fasta): with tabs.add_dropdown_menu(): # get a table in long format with the amplicon ID, centre position of the # depth windows, depth values, and sample alias - per_amplicon_depths = pd.concat( - [ - d.depth.assign( - pos=lambda df: df[["start", "end"]].mean(axis=1), - sample=d.sample_alias, - )[["ref", "pos", "depth", "sample"]] - for d in datasets - ] - ) + depth_dfs = [ + d.depth.assign( + pos=lambda df: df[["start", "end"]].mean(axis=1), + sample=d.sample_alias, + )[["ref", "pos", "depth", "sample"]] + for d in datasets + ] + if de_novo: + depth_dfs = [df.eval("ref = sample") for df in depth_dfs] + per_amplicon_depths = pd.concat(depth_dfs) for amplicon, depth_df in per_amplicon_depths.groupby("ref"): # drop samples with zero depth along the whole amplicon total_depths = depth_df.groupby("sample")["depth"].sum() @@ -391,13 +416,13 @@ def populate_report(report, metadata, datasets, ref_fasta): EZChart(plt, "epi2melabs") # add variant tables (skip if there were no VCFs) - if not ref_mode: + if de_novo: return with report.add_section("Variants", "Variants"): html_tags.p( "Haploid variant calling was performed with Medaka. Variants with low ", "depth (i.e. smaller than ", - html_tags.kbd("--min_coverage"), + html_tags.code("--min_coverage"), ') are shown under the "Low depth" tab. The numbers in the ', '"depth" column relate to the sequencing depth used to perform ' "variant calling.", @@ -538,3 +563,47 @@ def preprocessing_section(report, datasets): """ ) fastcat.SeqSummary(comb_per_read_stats) + + +def de_novo_qc_section(report, datasets): + """Add a section with tables of post-assembly consensus QC stats.""" + # make sure datasets are in the right order + datasets = sorted(datasets, key=lambda d: d.sample_alias) + with report.add_section("Quality Control", "QC"): + html_tags.p( + "After creating a draft assembly (either with ", + html_tags.a("Miniasm", href="https://github.com/lh3/miniasm"), + " or with ", + html_tags.a("SPOA", href="https://github.com/rvaser/spoa"), + """) basic quality control is performed on the consensus candidates (i.e. + the contigs in the assembly). The QC stats for the contigs produced by the + assembly pipeline are listed in the table below. If there are no contigs for + the "miniasm" method in the table, the Miniasm step either + failed or produced contigs that were shorter than the """, + html_tags.code("--force_spoa_length_threshold"), + """ parameter (and thus SPOA was run regardless of assembly quality). 
If + there are no contigs for "spoa", SPOA was not run as a contig produced by + Miniasm passed all filters and was selected. + """, + ) + tabs = Tabs() + with tabs.add_dropdown_menu(): + for d in datasets: + qc_summary = d.qc_summary + qc_summary["fail_reason"] = qc_summary["fail_reason"].fillna("-") + qc_summary["mean_depth"] = qc_summary["mean_depth"].round(1) + qc_summary["length"] = [f"{x:,d}" for x in qc_summary["length"]] + # give the contigs more meaningful names + method_counts = {"miniasm": 0, "spoa": 0} + for idx, row in qc_summary.iterrows(): + method = row["method"] + method_counts[method] += 1 + qc_summary.loc[idx, "contig"] = f"{method}_{method_counts[method]}" + + qc_summary.set_index("contig", inplace=True) + qc_summary.index.name = "Contig" + qc_summary.rename( + columns=lambda col: col.replace("_", " ").capitalize(), inplace=True + ) + with tabs.add_dropdown_tab(d.sample_alias): + DataTable.from_pandas(qc_summary) diff --git a/bin/workflow_glue/report_util.py b/bin/workflow_glue/report_util.py index f452d8c..5b6a040 100755 --- a/bin/workflow_glue/report_util.py +++ b/bin/workflow_glue/report_util.py @@ -77,6 +77,11 @@ def __init__(self, data_dir): entry.info["DP"], ] + # read the QC summaries if they exist + qc_summary = data_dir / "qc-summary.tsv" + if qc_summary.exists(): + self.qc_summary = pd.read_csv(qc_summary, sep="\t") + def __repr__(self): """Return human-readable string (simply the sample alias).""" return f"ReportDataSet('{self.sample_alias}')" @@ -96,13 +101,13 @@ def read_csv(self, file, **kwargs): self.all_inputs_valid = False return df - def get_basic_summary_stats(self, ref_mode): + def get_basic_summary_stats(self, de_novo): """Collect basic summary stats. This is used in the "at a glance" section at the beginning of the report. """ # create series to hold the summary stats and add stats that are needed in - # no-ref and with-ref mode + # de-novo and variant calling mode basic_summary = pd.Series( 0, index=["reads", "bases", "mean_length", "mean_quality"], @@ -116,8 +121,8 @@ def get_basic_summary_stats(self, ref_mode): basic_summary["mean_length"] = self.post_trim_per_file_stats.eval( "n_bases / n_seqs" ) - # if in no-ref mode, add the no-ref-only stats and return - if not ref_mode: + # if in de-novo mode, add the de-novo-only stats and return + if de_novo: basic_summary["unmapped_reads"] = self.bamstats.eval('ref == "*"').sum() basic_summary["consensus_length"] = self.depth["end"].iloc[-1] return basic_summary @@ -181,11 +186,11 @@ def summarize_bamstats(datasets): for d in datasets: # order refs so that the row for unmapped is at the end for each sample - refs = d.bamstats['ref'].unique() - if '*' in refs: + refs = d.bamstats["ref"].unique() + if "*" in refs: refs = [ref for ref in refs if ref != "*"] + ["*"] for amplicon in refs: - df = d.bamstats.query('ref == @amplicon') + df = d.bamstats.query("ref == @amplicon") n_reads = df.shape[0] n_bases = df["read_length"].sum() med_read_length = df["read_length"].median() @@ -412,18 +417,18 @@ def format_per_amplicon_summary_table(summary_stats, datasets): return format_stats_table(per_amplicon_summary_stats) -def format_no_ref_summary_table(summary_stats, datasets): - """Summarize and format per-sample bamstats results in no-ref mode. +def format_de_novo_summary_table(summary_stats, datasets): + """Summarize and format per-sample bamstats results in de-novo mode. `summary_stats` contains summary stats for each sample--amplicon combination. In the - no-ref case this means there are two rows for each sample. 
One for the reads that + de-novo case this means there are two rows for each sample. One for the reads that were successfully re-aligned against the consensus (`summary_stats.loc[sample_name, sample_name]`) and one for the reads that did not re-align against the consensus (`summary_stats[sample_name, "*"]`). This function combines both entries into one row containing all the relevant information for the summary table. """ # initialise results dataframe with zeros - no_ref_summary_stats = pd.DataFrame( + de_novo_summary_stats = pd.DataFrame( 0, index=summary_stats.index.unique("sample"), columns=[ @@ -437,26 +442,25 @@ def format_no_ref_summary_table(summary_stats, datasets): ) # group by sample and aggregate the stats for sample, df in summary_stats.groupby("sample"): - no_ref_summary_stats.loc[sample, ["reads", "bases"]] = df[ + de_novo_summary_stats.loc[sample, ["reads", "bases"]] = df[ ["reads", "bases"] ].sum() try: - no_ref_summary_stats.loc[sample, "unmapped"] = df.loc[ + de_novo_summary_stats.loc[sample, "unmapped"] = df.loc[ (sample, "*"), "reads" ] except KeyError: - no_ref_summary_stats.loc[sample, "unmapped"] = 0 + de_novo_summary_stats.loc[sample, "unmapped"] = 0 # for median read length we have a look at the whole bamstats DataFrame # belonging to that sample (dataset,) = [x for x in datasets if x.sample_alias == sample] - no_ref_summary_stats.loc[sample, "median_read_length"] = dataset.bamstats[ + de_novo_summary_stats.loc[sample, "median_read_length"] = dataset.bamstats[ "read_length" ].median() - # multiply the mean cov + acc. per sample--amplicon combination by the number of - # reads for that combo and divide by total number of reads for this amplicon to - # get the overall per-amplicon average - no_ref_summary_stats.loc[sample, ["mean_cov", "mean_acc"]] = df.loc[ - (sample, sample), ["mean_cov", "mean_acc"] + # there should only be one amplicon per sample + (amplicon,) = set(df.reset_index()["amplicon"]) - set("*") + de_novo_summary_stats.loc[sample, ["mean_cov", "mean_acc"]] = df.loc[ + (sample, amplicon), ["mean_cov", "mean_acc"] ] - no_ref_summary_stats.fillna(0, inplace=True) - return format_stats_table(no_ref_summary_stats) + de_novo_summary_stats.fillna(0, inplace=True) + return format_stats_table(de_novo_summary_stats) diff --git a/bin/workflow_glue/run_spoa.py b/bin/workflow_glue/run_spoa.py new file mode 100755 index 0000000..0079547 --- /dev/null +++ b/bin/workflow_glue/run_spoa.py @@ -0,0 +1,114 @@ +"""Interleave forward and reverse reads to improve SPOA performance.""" + +from pathlib import Path +import sys + +from medaka.common import reverse_complement +import parasail +import pysam +import spoa + +from .util import get_named_logger, wf_parser # noqa: ABS101 + + +def align(s1, s2): + """Use parasail to align two sequences. + + :param s1: string with first DNA sequence + :param s2: string with second DNA sequence + :return: parasail alignment Result object + """ + return parasail.sw_trace_striped_16( + s1, s2, open=8, extend=4, matrix=parasail.dnafull + ) + + +def interleave_lists(l1, l2): + """Uniformly interleave two lists. 
+ + For example: + ``` + >>> interleave_lists([1, 2, 3, 4], list('ABCDEF')) + [1, 'A', 'B', 2, 'C', 3, 'D', 'E', 4, 'F'] + ``` + + :param l1: first list + :param l2: second list + :return: list containing items of `l1` and `l2` uniformly interleaved + """ + interleaved = [] + rate_1 = 1 / len(l1) + rate_2 = 1 / len(l2) + + c_1, c_2 = 0, 0 + + itr_1 = iter(l1) + itr_2 = iter(l2) + + while True: + try: + if c_1 > c_2: + interleaved.append(next(itr_2)) + c_2 += rate_2 + else: + interleaved.append(next(itr_1)) + c_1 += rate_1 + except StopIteration: + break + + return interleaved + + +def main(args): + """Run the entry point.""" + logger = get_named_logger("runSPOA") + + logger.info("Get read orientations...") + # read input file and determine the orientation of the reads + fwd = [] + rev = [] + with pysam.FastxFile(args.fastq, "r") as f: + first_seq = next(f).sequence + fwd.append(first_seq) + for entry in f: + seq = entry.sequence + rc = reverse_complement(seq) + fwd_score = align(seq, first_seq).score + rev_score = align(rc, first_seq).score + if fwd_score > rev_score: + fwd.append(seq) + else: + rev.append(rc) + + # uniformly interleave the reads + interleaved_reads = interleave_lists(fwd, rev) + logger.info("Finished interleaving reads.") + + # determine minimum coverage if param was provided + min_cov = None + if args.relative_min_coverage is not None: + min_cov = int(round(len(interleaved_reads) * args.relative_min_coverage)) + logger.info(f"SPOA min coverage: {min_cov}.") + + # run spoa + cons, _ = spoa.poa(interleaved_reads, genmsa=False, min_coverage=min_cov) + + # run spoa a second time with the previous result as first read + cons, _ = spoa.poa([cons, *interleaved_reads], genmsa=False, min_coverage=min_cov) + + # write out the result + sys.stdout.write(f">consensus\n{cons}\n") + + logger.info("Wrote consensus to STDOUT.") + + +def argparser(): + """Argument parser for entrypoint.""" + parser = wf_parser("run_spoa") + parser.add_argument("fastq", type=Path, help="Path to input FASTQ file") + parser.add_argument( + "--relative-min-coverage", + type=float, + help="Minimum relative coverage of POA graph", + ) + return parser diff --git a/bin/workflow_glue/subset_reads.py b/bin/workflow_glue/subset_reads.py new file mode 100755 index 0000000..0ca4ae1 --- /dev/null +++ b/bin/workflow_glue/subset_reads.py @@ -0,0 +1,70 @@ +"""Subset reads based on length (from fastcat per-read stats).""" +from pathlib import Path +import sys + +import pandas as pd + +from .util import get_named_logger, wf_parser # noqa: ABS101 + + +def main(args): + """Run the entry point.""" + logger = get_named_logger("subsetReads") + + logger.info("Read per-read stats and sort lengths.") + sorted_lengths = pd.read_csv(args.per_read_stats, sep="\t", index_col=0)[ + "read_length" + ].sort_values(ascending=False) + + drop_longest_n = 0 + if args.drop_longest_frac: + drop_longest_n = int(round(len(sorted_lengths) * args.drop_longest_frac)) + logger.info(f"Drop {drop_longest_n} longest reads.") + sorted_lengths = sorted_lengths.iloc[drop_longest_n:] + + if args.take_longest_remaining: + if args.n_reads == 0: + logger.info("Take all remaining reads (sorted by length).") + target_ids = sorted_lengths.index + else: + logger.info(f"Take {args.n_reads} longest remaining reads.") + target_ids = sorted_lengths.iloc[: args.n_reads].index + else: + logger.info("Randomly select reads and write out IDs.") + # select a subset and sort by length again (but only if we still got enough + # reads) + if args.n_reads > len(sorted_lengths) or 
args.n_reads == 0: + target_ids = sorted_lengths.index + else: + target_ids = ( + sorted_lengths.sample(args.n_reads).sort_values(ascending=False).index + ) + + sys.stdout.write("\n".join(target_ids) + "\n") + + logger.info( + f"Finished extracting {len(target_ids)} reads from '{args.per_read_stats}'." + ) + + +def argparser(): + """Argument parser for entrypoint.""" + parser = wf_parser("subset_reads") + parser.add_argument( + "per_read_stats", type=Path, help="Path to per-read stats TSV file" + ) + parser.add_argument( + "drop_longest_frac", type=float, help="Fraction of longest reads to drop" + ) + parser.add_argument( + "n_reads", type=int, help="Number of reads to select. If 0, take all." + ) + parser.add_argument( + "--take-longest-remaining", + action="store_true", + help=( + "Whether to select the longest remaining reads or just randomly " + "sample after dropping the longest reads" + ), + ) + return parser diff --git a/bin/workflow_glue/trim_and_qc.py b/bin/workflow_glue/trim_and_qc.py new file mode 100755 index 0000000..1abd580 --- /dev/null +++ b/bin/workflow_glue/trim_and_qc.py @@ -0,0 +1,196 @@ +"""Select a contig if multiple were created, do some QC, modify depth / stats files.""" +from pathlib import Path +import sys + +import pandas as pd +import pysam + +from .util import get_named_logger, wf_parser # noqa: ABS101 + + +def main(args): + """Run the entry point.""" + logger = get_named_logger("mostCovRef") + + logger.info("Read input files.") + + flagstat = pd.read_csv(args.flagstat, sep="\t", index_col=0) + + depths = pd.read_csv( + args.depth, sep="\t", header=None, names=["ref", "start", "end", "depth"] + ) + + # calculate the mean depths of the candidate consensus contigs + mean_depths = depths.groupby("ref").apply( + lambda df: df.eval("(end - start) * depth").sum() / df["end"].iloc[-1] + ) + + # perform QC checks on each contig + qc_stats = pd.DataFrame( + columns=[ + "method", + "length", + "mean_depth", + "primary", + "secondary", + "supplementary", + "primary_ratio", + "status", + "fail_reason", + ] + ) + qc_stats.index.name = "contig" + for contig, mean_depth in mean_depths.items(): + status = "passed" + contig_len = depths.query("ref == @contig")["end"].iloc[-1] + fail_reasons = [] + if mean_depth < args.minimum_depth: + status = "failed" + fail_reasons.append("low depth") + primary_ratio = flagstat.loc[contig, "primary"] / flagstat.loc[contig, "total"] + if primary_ratio < args.primary_threshold: + status = "failed" + fail_reasons.append("low primary ratio") + + qc_stats.loc[contig] = [ + args.asm_method, + contig_len, + round(mean_depth, 3), + *flagstat.loc[contig, ["primary", "secondary", "supplementary"]], + round(primary_ratio, 3), + status, + ", ".join(fail_reasons), + ] + + qc_passed = qc_stats.query("status == 'passed'") + + failed = qc_passed.empty + + # create the output directory + outdir = Path(args.outdir_fail if failed else args.outdir_pass) + outdir.mkdir() + + qc_stats.to_csv(outdir / args.qc_summary_tsv, sep="\t") + + if failed: + logger.warning( + "Consensus generation failed for this sample. No consensus candidate " + "passed the quality checks." + ) + sys.exit() + + # select the contig with the largest mean depth + selected = qc_passed["mean_depth"].sort_values(ascending=False).index[0] + logger.info( + f"Selected contig '{selected}'. " + f"Write modified depth and stats files to {outdir}." 
+ ) + + # only keep the depths of the selected contig + depths.query("ref == @selected", inplace=True) + + # determine the limits for trimming off low-coverage regions at the ends + first_and_last_idx = depths.query( + "depth > depth.max() * @args.relative_depth_trim_threshold" + ).index[[0, -1]] + depths = depths.loc[slice(*first_and_last_idx)] + trim_start = depths.iloc[0]["start"] + trim_end = depths.iloc[-1]["end"] + + # extract the selected consensus sequence and trim low-coverage regions from the + # ends + logger.info("Extract and write selected consensus sequence.") + with pysam.FastxFile(args.consensus) as f: + for entry in f: + if entry.name == selected: + break + else: + raise ValueError( + f"Selected consensus contig {selected} not found in" + f"consensus FASTx file '{args.consensus}'" + ) + entry.name = args.alias + entry.sequence = entry.sequence[trim_start:trim_end] + entry.quality = entry.quality[trim_start:trim_end] + with open(outdir / args.consensus.name, "w") as f: + f.write(str(entry) + "\n") + + logger.info("Done") + + +def argparser(): + """Argument parser for entrypoint.""" + parser = wf_parser("get_ref_with_largest_mean_depth") + parser.add_argument( + "--alias", + type=str, + help="Sample alias.", + required=True, + ) + parser.add_argument( + "--asm-method", + type=str, + help="Assembly method (is added to QC summary TSV).", + required=True, + ) + parser.add_argument( + "--depth", + type=Path, + help="Path to mosdepth per-base BED file.", + required=True, + ) + parser.add_argument( + "--flagstat", + type=Path, + help="Path to bamstats flagstat TSV file.", + required=True, + ) + parser.add_argument( + "--consensus", + type=Path, + help="Path to consensus FASTQ file.", + required=True, + ) + parser.add_argument( + "--outdir-pass", + type=Path, + help="Name of output directory if passing QC.", + required=True, + ) + parser.add_argument( + "--outdir-fail", + type=Path, + help="Name of output directory if failing QC.", + required=True, + ) + parser.add_argument( + "--relative-depth-trim-threshold", + type=float, + help=( + "Trim ends of consensus sequence if the relative coverage drops below " + "this value." + ), + required=True, + ) + parser.add_argument( + "--minimum-depth", + type=int, + help="Minimum required depth to trust a consensus sequence.", + required=True, + ) + parser.add_argument( + "--primary-threshold", + type=float, + help=( + "Threshold for proportion of primary alignments (will write a " + "warning for samples with lower fraction)." + ), + required=True, + ) + parser.add_argument( + "--qc-summary-tsv", + type=str, + help="Name of TSV file into which to write QC summary.", + required=True, + ) + return parser diff --git a/docs/intro.md b/docs/intro.md index b1edecb..6a8cefe 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -4,17 +4,21 @@ This [Nextflow](https://www.nextflow.io/) workflow provides a simple way to analyse Oxford Nanopore reads generated from amplicons. The workflow requires raw reads in FASTQ format and can be run in two modes: -* With a reference: After initial filtering (based on read length and quality) - and adapter trimming, [minimap2](https://github.com/lh3/minimap2) is used to - align the reads to a reference FASTA file (please note that the reference - should only contain the expected sequences of the individual amplicons). - Variants are then called with - [Medaka](https://github.com/nanoporetech/medaka). -* Without a reference: Like for the "reference mode", reads are first filtered - and trimmed. 
Then, `medaka smolecule` is used to generate the consensus of the - reads of each sample _de novo_. Reads are re-aligned against the consensus to - produce coverage plots for the report. +* Variant calling mode: Trigger this mode by passing a reference FASTA file. + After initial filtering (based on read length and quality) and adapter + trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the + reads to the reference (please note that the reference should only contain the + expected sequences of the individual amplicons). Variants are then called with + [Medaka](https://github.com/nanoporetech/medaka). This mode allows for + multiple amplicons per barcode (for details on how to map specific target + amplicons to individual samples / barcodes, see below). +* De-novo consensus mode: This mode is run when no reference file is passed. + Like for the "variant calling mode", reads are first filtered and trimmed. + Then, a consensus sequence is generated _de novo_ from the reads of each + sample. Reads are re-aligned against the draft consensus for polishing with + [Medaka](https://github.com/nanoporetech/medaka). Please note that only one + amplicon per barcode is supported in de-novo consensus mode. The results of the workflow include an interactive HTML report, FASTA files with the consensus sequences of the amplicons, BAM files with the alignments, and VCF -files containing the variants (if run in "reference mode"). +files containing the variants (if run in variant calling mode). diff --git a/docs/links.md b/docs/links.md index 5182185..5814961 100644 --- a/docs/links.md +++ b/docs/links.md @@ -5,5 +5,7 @@ - [Docker](https://www.docker.com/products/docker-desktop) - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) - [minimap2](https://github.com/lh3/minimap2) +- [miniasm](https://github.com/lh3/miniasm) +- [SPOA](https://github.com/rvaser/spoa) - [Medaka](https://github.com/nanoporetech/medaka) - [Porechop](https://github.com/rrwick/Porechop) diff --git a/docs/quickstart.md b/docs/quickstart.md index 9327921..37be1e9 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -26,9 +26,9 @@ This will show the workflow's command line options. ### Usage This section covers how to use the workflow when providing a reference file with -the expected sequences of the amplicons. For details on how to run the workflow -without a reference (i.e. for _de novo_ construction of the amplicon consensus -sequence), see the relevant section below. +the expected sequences of the amplicons ("variant calling mode"). For details on +how to run the workflow without a reference ("de-novo consensus mode"), see the +relevant section below. Basic usage is as follows @@ -75,11 +75,18 @@ Relevant options for filtering of raw reads are - `--max_read_length` - `--min_read_qual` -After filtering and trimming with -[Porechop](https://github.com/rrwick/Porechop), reads can optionally be -downsampled. You can control the number of reads to keep per sample with -`--reads_downsampling_size`. Samples with fewer than `--min_n_reads` are -ignored. +After initial filtering, reads can optionally be downsampled +(`--reads_downsampling_size` controls the number of reads to keep per sample). +The workflow supports random downsampling as well as selecting reads based on +read length. Subsetting based on read length tends to work better for de-novo +consensus generation and is thus set as default (see the section for the de-novo +consensus mode below for details and for how to disable this). 
However, it +should not detriment the performance of the variant-calling mode. The selected +reads are then trimmed with [Porechop](https://github.com/rrwick/Porechop) +before being aligned against the reference. + +> Note: Samples with fewer reads than `--min_n_reads` after preprocessing are +> ignored. After alignment, haploid variants are called with [Medaka](https://github.com/nanoporetech/medaka). You can set the minimum @@ -126,17 +133,44 @@ in the generated HTML report. ### Running without a reference The workflow can also be run without a reference. It will then use -`medaka smolecule` to construct a consensus sequence for each barcode. -`smolecule` relies on [SPOA](https://github.com/rvaser/spoa) and the workflow -takes advantage of the recently added option in SPOA to prune low-coverage nodes -from the partial order graph. The threshold for this is relative to the number -of reads per sample (after filtering) and can be modified with -`--spoa_minimum_relative_coverage` (e.g. set to `0.2` to require a coverage of at least -20% of reads along the whole consensus). - -[SPOA](https://github.com/rvaser/spoa) works best for LSK data. The performance -of the consensus generation may be poor for RBK data in some cases, especially -if no long reads spanning most of the amplicon were found. We therefore -recommend to only use "no reference mode" for short amplicons (4 kb or shorter). -To improve robustness on RBK, an assembly-based approach might be implemented in -the future. +[miniasm](https://github.com/lh3/miniasm) or +[SPOA](https://github.com/rvaser/spoa) to create a draft consensus. `miniasm` +tends to work better for longer amplicons and `spoa` for shorter amplicons. The +workflow attempts to create an assembly with `miniasm` first. If this succeeds, +the draft assembly is polished with [Racon](https://github.com/lbcb-sci/racon) and +the input reads are re-aligned against it. This is followed by some post-processing +(trimming low-coverage regions from the ends) and quality control (QC). During QC, the following +sanity-checks are performed: + +- The mean coverage of the consensus is above `--minimum_mean_depth` (default: 30X). +- The fraction of primary alignments for the re-aligned reads is more than + `--primary_alignment_threshold` (default: 70%). Large numbers of secondary / + supplementary alignments indicate that something might have gone wrong during + assembly. If your amplicons contain long repetitive regions, you can lower + this number. + +If more than one contig from the draft assembly passes these filters, the one +with the highest coverage is selected. + +If assembly generation fails or none of the contigs produced by `miniasm` pass +QC, `spoa` is tried. Regardless of how the draft consensus was generated +(`miniasm` or `spoa`), a final polishing step with +[Medaka](https://github.com/nanoporetech/medaka) is performed. + +Note: Since `spoa` tends to produce a better consensus than `miniasm` for short +amplicons, users can force the workflow to run `spoa` if the assembly created by +`miniasm` is shorter than `--force_spoa_length_threshold` (even if it passes +QC). + +**Random downsampling vs selecting reads by length:** + +Despite the workflow dropping reads containing adapter sequences in the middle +(with `porechop --discard_middle`), some reads in the input data stemming from +concatemers or other PCR artifacts might still make it through preprocessing and +could thus interfere with consensus generation. A simple way to avoid this is to +drop a small fraction (e.g. 
5%) of longest reads. Additionally, `spoa` depends +on the order of reads in the input and benefits from "seeing" longer reads +first. Therefore, the following options are used per default +`--drop_frac_longest_reads 0.05 --take_longest_remaining_reads`. To disable this +and enforce random downsampling of input reads, use `--drop_frac_longest_reads 0 +--take_longest_remaining_reads false`. \ No newline at end of file diff --git a/main.nf b/main.nf index 56b46ea..8258865 100644 --- a/main.nf +++ b/main.nf @@ -4,8 +4,10 @@ import groovy.json.JsonBuilder nextflow.enable.dsl = 2 include { fastq_ingress } from "./lib/ingress" -include { pipeline as variantCallingPipeline } from "./modules/local/reference-based" -include { pipeline as smoleculePipeline } from "./modules/local/smolecule" +include { pipeline as variantCallingPipeline } from "./modules/local/variant-calling" +include { + pipeline as deNovoPipeline_asm; pipeline as deNovoPipeline_spoa; +} from "./modules/local/de-novo" OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE") @@ -29,6 +31,10 @@ process getVersions { printf "porechop,%s\\n" \$(porechop --version) >> versions.txt samtools --version | head -n1 | tr ' ' ',' >> versions.txt printf "minimap2,%s\\n" \$(minimap2 --version) >> versions.txt + mosdepth --version | tr ' ' ',' >> versions.txt + printf "miniasm,%s\\n" \$(miniasm -V) >> versions.txt + printf "racon,%s\\n" \$(racon --version) >> versions.txt + printf "csvtk,%s\\n" \$(csvtk version | grep -oP '\\d+\\.\\d+.\\d+') >> versions.txt """ } @@ -40,7 +46,7 @@ process addMedakaToVersionsFile { script: """ cp old_versions.txt versions.txt - medaka --version | tr ' ' ',' > versions.txt + medaka --version | tr ' ' ',' >> versions.txt """ } @@ -72,6 +78,47 @@ process downsampleReads { """ } +/* +Instead of downsampling randomly, this process subsets input reads based on length. +Depending on parameters, it drops a fraction (e.g. `0.05` for 5%) of longest reads and +then selects a number of the next-longest reads (if `take_longest_remaining_reads` is +`true`; if `false`, it will randomly downsample the reads after dropping the longest). +Selected reads will always be sorted by length. 
+*/ +process subsetReads { + label "wfamplicon" + cpus 1 + input: + tuple val(meta), path("reads.fastq.gz"), path("fastcat-stats") + val drop_longest_frac + val take_longest_remaining_reads + val n_reads + output: tuple val(meta), path("subset.fastq.gz") + script: + String take_longest_remaining_arg = "" + if (take_longest_remaining_reads) { + take_longest_remaining_arg = "--take-longest-remaining" + } + """ + echo $meta.alias # sometimes useful for debugging + + workflow-glue subset_reads \ + fastcat-stats/per-read-stats.tsv.gz \ + $drop_longest_frac \ + $n_reads \ + $take_longest_remaining_arg \ + > target_ids.txt + + # only attempt to subset reads if neither the FASTQ or the file with target IDs is + # empty (`samtools fqidx` would fail otherwise) + if [[ ( -n \$(zcat reads.fastq.gz | head -c1) ) && ( -s target_ids.txt ) ]]; then + samtools fqidx reads.fastq.gz -r target_ids.txt | bgzip > subset.fastq.gz + else + echo -n | bgzip > subset.fastq.gz + fi + """ +} + process porechop { label "wfamplicon" cpus Math.min(params.threads, 5) @@ -102,6 +149,20 @@ process collectFilesInDir { """ } +process concatTSVs { + label "wfamplicon" + cpus 1 + input: + tuple val(meta), path("input/f*.tsv") + val fname + output: tuple val(meta), path(fname) + script: + """ + csvtk concat -t input/* > $fname + """ + +} + process makeReport { label "wfamplicon" cpus 1 @@ -182,12 +243,25 @@ workflow pipeline { ch_results_for_report = ch_reads | map { meta, reads, stats_dir -> [meta, *file(stats_dir.resolve("*"))] } - // remove fastcat stats from reads channel - ch_reads = ch_reads.map { meta, reads, stats_dir -> [meta, reads] } - - // downsample - if (params.reads_downsampling_size) { - ch_reads = downsampleReads(ch_reads, params.reads_downsampling_size) + // either subset reads (i.e. 
drop the longest and then take the next longest) or + // downsample naively or take all reads + if (params.drop_frac_longest_reads || params.take_longest_remaining_reads) { + // subset reads + ch_reads = subsetReads( + ch_reads, + params.drop_frac_longest_reads, + params.take_longest_remaining_reads, + params.reads_downsampling_size, + ) + } else if (params.reads_downsampling_size) { + // downsample reads + ch_reads = downsampleReads( + ch_reads | map { meta, reads, stats -> [meta, reads] }, + params.reads_downsampling_size + ) + } else { + // take all reads + ch_reads = ch_reads.map { meta, reads, stats_dir -> [meta, reads] } } // trim adapters @@ -220,17 +294,15 @@ workflow pipeline { ch_reads = ch_reads | join(ch_post_filter_n_reads) | filter { meta, reads, n_seqs -> n_seqs >= params.min_n_reads } + | map { meta, reads, n_seqs -> [meta, reads] } Path ref = OPTIONAL_FILE if (params.reference) { - // ref-based mode; make sure the ref file exists and run the variant calling - // pipeline + // variant calling mode; make sure the ref file exists and run the variant + // calling pipeline ref = file(params.reference, checkIfExists: true) - variantCallingPipeline( - ch_reads | map { meta, reads, n_seqs -> [meta, reads] }, - ref - ) + variantCallingPipeline(ch_reads, ref) // add variant calling results to channels for the report + to publish ch_results_for_report = ch_results_for_report @@ -263,22 +335,52 @@ workflow pipeline { ) } } else { - // no reference; run medaka smolecule on each sample - smoleculePipeline(ch_reads) + // de-novo consensus mode: run pipeline for de-novo consensus; we try + // `miniasm` first and then run `spoa` on all samples that failed + deNovoPipeline_asm(ch_reads, "miniasm") + + deNovoPipeline_spoa( + deNovoPipeline_asm.out.metas_failed + // use `combine` instead of `join` here because `join` with `remainder: + // true` does not work as expected if one channel contains only the key + // (as is the case with the `metas_failed` channel here) + | combine(ch_reads, by: 0), + "spoa" + ) + + // combine the results for `miniasm` and `spoa` + ch_no_ref_results = deNovoPipeline_asm.out.passed + | mix(deNovoPipeline_spoa.out.passed) + | multiMap { meta, cons, bam, bai, bamstats, flagstat, depth -> + consensus: [meta, cons] + mapped: [meta, bam, bai] + for_report: [meta, bamstats, flagstat, depth] + } + + // get the QC summary TSVs and concat them for each sample (if the `miniasm` + // and `spoa` pipelines were run for a sample, it will have two QC summary + // TSVs) + ch_concat_qc_summaries = concatTSVs( + deNovoPipeline_asm.out.qc_summaries + | mix(deNovoPipeline_spoa.out.qc_summaries) + | groupTuple, + "qc-summary.tsv" + ) - // add `smolecule` results to main results channel + // add de-novo results to main results channel ch_results_for_report = ch_results_for_report - | join(smoleculePipeline.out.mapping_stats, remainder: true) - | join(smoleculePipeline.out.depth, remainder: true) + | join(ch_no_ref_results.for_report, remainder: true) + | join(ch_concat_qc_summaries, remainder: true) // drop any `null`s from the joined list | map { it -> it.findAll { it } } - // collect files for report in directories (one dict per sample) + // add the consensus and alignments (re-aligned against the consensus) to + // the output channel for publishing ch_to_publish = ch_to_publish | mix( - smoleculePipeline.out.consensus + ch_no_ref_results.consensus | map { meta, cons -> [cons, "$meta.alias/consensus"] }, - smoleculePipeline.out.mapped + ch_no_ref_results.mapped | map { meta, bam, bai 
-> [[bam, bai], "$meta.alias/alignments"] } | transpose, ) diff --git a/modules/local/de-novo.nf b/modules/local/de-novo.nf new file mode 100644 index 0000000..54ef2bf --- /dev/null +++ b/modules/local/de-novo.nf @@ -0,0 +1,340 @@ +include { + lookupMedakaModel; + alignReads as alignDraft; + alignReads as alignPolished; + bamstats as bamstatsDraft; + bamstats as bamstatsPolished; +} from "./common" + + +/* +Get draft consensus with SPOA. Like `medaka smolecule`, the python script interleaves +reads so that forward and reverse reads are uniformly distributed before being passed to +SPOA. +*/ +process spoa { + label = "medaka" + cpus 1 + input: tuple val(meta), path("reads.fastq.gz") + output: tuple val(meta), path("reads.fastq.gz"), path("asm.fasta") + script: + String min_cov_args = "" + if (params.spoa_minimum_relative_coverage) { + min_cov_args = "--relative-min-coverage $params.spoa_minimum_relative_coverage" + } + """ + echo $meta.alias # makes some debugging easier + + workflow-glue run_spoa reads.fastq.gz $min_cov_args > asm.fasta + """ +} + +/* +Assemble draft consensus with `miniasm`. `miniasm` params are taken from pomoxis +`mini_assemble`. Emits the env variable `STATUS` (can be 'failed' or 'passed') alongside +the draft asssembly. If none of the assembled contigs is longer than +`params.force_spoa_length_threshold`, `STATUS` is set to 'failed'. +*/ +process miniasm { + label = "wfamplicon" + cpus params.threads + input: tuple val(meta), path("reads.fastq.gz") + output: tuple val(meta), path("reads.fastq.gz"), path("asm.fasta"), env(STATUS) + script: + int mapping_threads = Math.max(1, task.cpus - 1) + """ + STATUS=failed + echo $meta.alias # makes some debugging easier + + # overlap reads and assemble draft consensus (note that `miniasm` outputs the + # assembly in GFA format) + minimap2 -L -x ava-ont -t $mapping_threads reads.fastq.gz reads.fastq.gz \ + | miniasm -s 100 -e 3 -f reads.fastq.gz - \ + | awk '/^S/{print ">"\$2"\\n"\$3}' > asm.fasta # extract header + seq from GFA + + if [[ -s asm.fasta ]]; then + # check if at least one of the contigs is longer than the force-SPOA-threshold + samtools faidx asm.fasta + longest=\$(sort -k2rn asm.fasta.fai | head -n1 | cut -f2) + if [[ \$longest -gt $params.force_spoa_length_threshold ]]; then + STATUS=passed + fi + fi + """ +} + +/* +Polish draft consensus with `racon` (one round only). `racon` params are taken from +pomoxis `mini_assemble`. `--no-trimming` is added as it sometimes trims the consensus +too aggressively. We trim the sequences downstream instead. +*/ +process racon { + label = "wfamplicon" + cpus params.threads + input: tuple val(meta), path("reads.fastq.gz"), path("draft.fasta") + output: tuple val(meta), path("reads.fastq.gz"), path("polished.fasta") + script: + int mapping_threads = Math.max(1, task.cpus - 1) + """ + echo $meta.alias # makes some debugging easier + + # align against draft + minimap2 -L -x ava-ont -t $mapping_threads draft.fasta reads.fastq.gz \ + | bgzip > pre-racon.paf.gz + + # run racon + racon -m 8 -x -6 -g -8 -w 500 -t $task.cpus -q -1 --no-trimming \ + reads.fastq.gz pre-racon.paf.gz draft.fasta \ + > polished.fasta + """ +} + +/* +Polish draft consensus with `medaka`. Also, emit the polished sequences as FASTQ +(with consensus quality scores calculated by medaka) and not FASTA. 
+*/ +process medakaConsensus { + label "medaka" + cpus Math.min(params.threads, 3) + input: + tuple val(meta), path("input.bam"), path("input.bam.bai"), path("draft.fasta") + val medaka_model + output: tuple val(meta), path("consensus.fastq") + script: + """ + medaka consensus input.bam consensus_probs.hdf \ + --threads $task.cpus --model $medaka_model + + medaka stitch consensus_probs.hdf draft.fasta consensus.fastq \ + --threads $task.cpus --qualities + """ +} + +/* +Use `mosdepth` to calculate per-base depths. This is used for trimming the consensus and +not for the plots in the report. +*/ +process mosdepthPerBase { + label "wfamplicon" + cpus Math.min(params.threads, 3) + input: tuple val(meta), path("input.bam"), path("input.bam.bai") + output: tuple val(meta), path("depth.per-base.bed.gz") + script: + int mosdepth_extra_threads = task.cpus - 1 + """ + echo $meta.alias # makes some debugging easier + + mosdepth -t $mosdepth_extra_threads depth input.bam + """ +} + +/* +Get `mosdepth` depths for a given number of windows. Expects only a single reference to +be present in the input BAM. Emits a TSV with header instead of the original BED file as +this is expected by the report code. +*/ +process mosdepthWindows { + label "wfamplicon" + cpus Math.min(params.threads, 3) + input: + tuple val(meta), path("input.bam"), path("input.bam.bai") + val n_windows + output: tuple val(meta), path("per-window-depth.tsv.gz") + script: + int mosdepth_extra_threads = task.cpus - 1 + """ + echo $meta.alias # makes some debugging easier + + # get ref IDs and lengths with `samtools idxstats` + samtools idxstats input.bam | grep -v '^*' > idxstats + + # we expect only a single ref seq + if [[ \$(wc -l < idxstats) -ne 1 ]]; then + echo "Found unexpected number of references in input BAM." + exit 1 + fi + + # get the length of the reference + REF_LENGTH=\$(cut -f2 idxstats) + + # calculate the corresponding window length (check `REF_LENGTH` first because + # `expr a / b` returns non-zero exit code when `a < b`) + window_length=1 + if [ "\$REF_LENGTH" -gt "$n_windows" ]; then + window_length=\$(expr \$REF_LENGTH / $n_windows) + fi + + # get the depths (we could add `-x`, but this loses a lot of detail from the depth + # curves) + mosdepth -t $mosdepth_extra_threads -b \$window_length -n depth input.bam + + # add a header to the BED file + cat <(printf "ref\\tstart\\tend\\tdepth\\n" | gzip) depth.regions.bed.gz \ + > per-window-depth.tsv.gz + + # clean up BED file to save some space + rm depth.regions.bed.gz + """ +} + +/* +Performs QC on the draft consensus. The script filters out contigs that have either too +little mean depth or a too large fraction of secondary + supplementary alignments. If +more than one contig passes both filters, the one with higher mean depth is chosen. It +is then trimmed based on `mosdepth` per-base depths. The process outputs two optional +channels (`passed` and `failed`), depending on whether QC was passed. It makes sure that +it emits in only one of the channels. + +`qc-summary.tsv` contains info like mean depth, the number of primary alignments, etc. +for each contig. It also contains the assembly method so that files from the SPOA and +`miniasm` pipelines can be concatenated downstream. 
+*/
+process trimAndQC {
+    label "wfamplicon"
+    cpus 1
+    input:
+        tuple val(meta),
+            path("consensus.fastq"),
+            path("flagstat.tsv"),
+            path("depth.per-base.bed.gz")
+        val asm_method
+    output:
+        tuple val(meta), path("passed/consensus.fastq"), path("passed/qc-summary.tsv"),
+            optional: true, emit: passed
+        tuple val(meta), path("failed/qc-summary.tsv"), optional: true, emit: failed
+    script:
+    """
+    echo $meta.alias # makes some debugging easier
+
+    workflow-glue trim_and_qc \
+        --alias $meta.alias \
+        --asm-method $asm_method \
+        --depth depth.per-base.bed.gz \
+        --flagstat flagstat.tsv \
+        --consensus consensus.fastq \
+        --outdir-pass passed \
+        --outdir-fail failed \
+        --minimum-depth $params.minimum_mean_depth \
+        --primary-threshold $params.primary_alignments_threshold \
+        --relative-depth-trim-threshold $params.spoa_minimum_relative_coverage \
+        --qc-summary-tsv qc-summary.tsv
+
+    # sanity check: make sure that `qc-summary.tsv` was only written for either passed
+    # or failed (and not both)
+    if [[ ( -f passed/qc-summary.tsv ) && ( -f failed/qc-summary.tsv ) ]]; then
+        echo "Found 'qc-summary.tsv' in both 'passed' and 'failed' directories."
+        exit 1
+    elif [[ ( ! -f passed/qc-summary.tsv ) && ( ! -f failed/qc-summary.tsv ) ]]; then
+        echo "Found 'qc-summary.tsv' in neither 'passed' nor 'failed' directories."
+        exit 1
+    fi
+    """
+}
+
+
+// workflow module
+workflow pipeline {
+    take:
+        // expected shape: `[meta, reads]` (reads have already been downsampled and
+        // trimmed)
+        ch_reads
+        method
+    main:
+        // get medaka model (look up or override)
+        def medaka_model
+        if (params.medaka_model) {
+            log.warn "Overriding Medaka model with ${params.medaka_model}."
+            medaka_model = params.medaka_model
+        } else {
+            Path lookup_table = file(
+                "${projectDir}/data/medaka_models.tsv", checkIfExists: true)
+            medaka_model = lookupMedakaModel(
+                lookup_table, params.basecaller_cfg, "medaka_consensus"
+            )
+        }
+
+        if (method !in ["miniasm", "spoa"]) {
+            error "Invalid de novo method '$method' (needs to be either 'miniasm' " +
+                "or 'spoa')."
+        }
+
+        // run miniasm -> racon or SPOA depending on `method`
+        if (method == "miniasm") {
+            ch_branched = miniasm(ch_reads)
+            // branch off samples for which the assembly has failed (or was shorter than
+            // `--force_spoa_length_threshold`)
+            | branch { meta, reads, asm, status ->
+                passed: status == "passed"
+                failed: status == "failed"
+                err: error "Post-assembly status is neither 'passed' nor 'failed' " +
+                    "for sample '$meta.alias'."
+ } + + ch_draft = ch_branched.passed + | map { meta, reads, asm, status -> [meta, reads, asm] } + | racon + // failed samples will be re-tried with SPOA in `main.nf` + ch_failed = ch_branched.failed + } else { + ch_draft = spoa(ch_reads) + ch_failed = Channel.empty() + } + + // re-align reads against the draft consensus + alignDraft(ch_draft) + + // polish with medaka + medakaConsensus( + alignDraft.out + | join(ch_draft) + | map { meta, bam, bai, reads, cons -> [meta, bam, bai, cons] }, + medaka_model + ) + + // get mapping stats and mosdepth per-base results for trimming the consensus + bamstatsDraft(alignDraft.out) + mosdepthPerBase(alignDraft.out) + + // select and trim the consensus sequence with the largest mean coverage + trimAndQC( + medakaConsensus.out + | join( + bamstatsDraft.out | map { meta, bamstats, flagstat -> [meta, flagstat] } + ) + | join(mosdepthPerBase.out), + method + ) + + // if QC passed, re-align again against the selected polished contig and get + // stats + depths for report + ch_reads + | join(trimAndQC.out.passed, remainder: true) + | filter { null !in it } + | map { meta, reads, cons, qc_summary -> + [meta, reads, cons] + } + | alignPolished + + bamstatsPolished(alignPolished.out) + mosdepthWindows(alignPolished.out, params.number_depth_windows) + emit: + // emit three channels: + // * passed: consensus, stats, and depths for successful samples + // * metas_failed: meta maps of samples that failed (either the `miniasm` step + // or QC) + // * qc_summaries: all QC summaries (regardless of fail or pass) + passed = trimAndQC.out.passed + | map { meta, cons, qc_summary -> [meta, cons] } + | join(alignPolished.out) + | join(bamstatsPolished.out) + | join(mosdepthWindows.out) + + metas_failed = ch_failed + | map { meta, reads, asm, status -> meta } + | mix(trimAndQC.out.failed | map { meta, qc_summary -> meta } ) + + qc_summaries = trimAndQC.out.passed + | map { meta, cons, qc_summary -> [meta, qc_summary] } + | mix(trimAndQC.out.failed) +} diff --git a/modules/local/smolecule.nf b/modules/local/smolecule.nf deleted file mode 100644 index 6484d88..0000000 --- a/modules/local/smolecule.nf +++ /dev/null @@ -1,100 +0,0 @@ -include { - lookupMedakaModel; - alignReads; - bamstats; - mosdepth; - concatMosdepthResultFiles; -} from "./common" - - -process medakaSmolecule { - label = "medaka" - cpus Math.min(params.threads, 2) - errorStrategy { task.exitStatus == 65 ? "retry" : "finish" } - maxRetries 3 - input: - tuple val(meta), path("reads.fastq.gz"), val(n_reads) - val medaka_model - output: tuple val(meta), - path("reads.fastq.gz"), - path("consensus.fasta"), - optional: true - script: - // POA is dependent on the order of input reads. Therefore, in case of failure due - // to producing multiple output sequences, shuffling the input reads might lead to a - // single consensus. If no single consensus is generated on the third try, no output - // will be produced. - String cat_cmd = task.attempt == 1 ? "zcat" : "seqkit shuffle -s $task.attempt" - int spoa_min_cov = Math.round(n_reads * params.spoa_minimum_relative_coverage) - // When passing a single FASTx file to `medaka smolecule`, it expects multiple - // amplicons in that file (with the amplicon to which a read belongs being denoted - // by the part of the id line before the first underscore; e.g. amp1_r1, amp1_r2, - // ..., ampX_rY). We need to format the ID lines accordingly. 
- """ - medaka smolecule out \ - <($cat_cmd reads.fastq.gz | awk '{ - if (NR % 4 == 1) {printf ">read_%d\\n", ++c} else if (NR % 4==2) print; - }') \ - --length 0 \ - --model $medaka_model \ - --threads $task.cpus \ - --spoa_min_coverage $spoa_min_cov - - # If `smolecule` doesn't produce a single sequence, fail the process with exit code - # `65` (it will shuffle the reads upon retry). If already in the third attempt, exit - # with code `0` (no retry and no output). - if [[ \$(grep -c '^>' out/consensus.fasta) -ne 1 ]]; then - echo "QUITTING: Found zero or more than one sequences in 'out/consensus.fasta'." - [[ $task.attempt -ge 3 ]] && exit_code=0 || exit_code=65 - exit \$exit_code - fi - - # give consensus sequence a more meaningful ID before emitting - sed -E '1s/^>.*/>${meta["alias"]}/' out/consensus.fasta > consensus.fasta - """ -} - - -// workflow module -workflow pipeline { - take: - // reads have already been downsampled and trimmed - // expects shape: `[meta, reads, n_reads]` - ch_reads - main: - // get medaka model (look up or override) - def medaka_model - if (params.medaka_model) { - log.warn "Overriding Medaka model with ${params.medaka_model}." - medaka_model = params.medaka_model - } else { - Path lookup_table = file( - "${projectDir}/data/medaka_models.tsv", checkIfExists: true) - medaka_model = lookupMedakaModel( - lookup_table, params.basecaller_cfg, "medaka_consensus" - ) - } - - // run `medaka smolecule` to get the consensus - medakaSmolecule(ch_reads, medaka_model) - // realign the reads against the consensus (for coverage stats etc.) - | alignReads - - // get mapping stats for report - bamstats(alignReads.out) - - // run mosdepth on each sample - mosdepth( - alignReads.out | combine(Channel.of(null)), - params.number_depth_windows - ) - // concat the depth files for each sample - mosdepth.out - | groupTuple - | concatMosdepthResultFiles - emit: - consensus = medakaSmolecule.out.map { meta, reads, cons -> [meta, cons] } - mapped = alignReads.out - mapping_stats = bamstats.out - depth = concatMosdepthResultFiles.out -} diff --git a/modules/local/reference-based.nf b/modules/local/variant-calling.nf similarity index 100% rename from modules/local/reference-based.nf rename to modules/local/variant-calling.nf diff --git a/nextflow.config b/nextflow.config index 0a19024..a0cdc16 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,7 +24,9 @@ params { min_read_length = 300 max_read_length = null min_read_qual = 10 - reads_downsampling_size = null + drop_frac_longest_reads = 0.05 + take_longest_remaining_reads = true + reads_downsampling_size = 0 min_n_reads = 40 // mapping + variant calling args @@ -34,8 +36,11 @@ params { medaka_model = null medaka_target_depth_per_strand = 75 - // no-ref mode options + // de-novo mode options + force_spoa_length_threshold = 2000 spoa_minimum_relative_coverage = 0.15 + minimum_mean_depth = 30 + primary_alignments_threshold = 0.7 // misc help = false @@ -58,8 +63,8 @@ params { "--reference 'wf-amplicon-demo/reference.fa'" ] common_sha = "sha91452ece4f647f62b32dac3a614635a6f0d7f8b5" - container_sha = "sha2699195046454bce4411b7b781457eabe3d5ee52" - container_sha_medaka = "sha0f266f34e7b789358fdb290cfde090b844a318d2" + container_sha = "sha3b6a08c2ed47efd85b17b98762951fffdeee5ba9" + container_sha_medaka = "sha61a9438d745a78030738352e084445a2db3daa2a" agent = null } } @@ -71,7 +76,7 @@ manifest { description = 'Amplicon workflow' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v0.5.0' + version = 'v0.6.0' } epi2melabs { @@ 
-132,7 +137,7 @@ profiles { container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}-root" } withLabel:wfamplicon { - container = "${params.aws_image_prefix}-wf-amplicon:${params.wf.container_sha}-root" + container = "${params.aws_image_prefix}-wf-amplicon:${params.wf.container_sha}" } withLabel:medaka { container = "${params.aws_image_prefix}-medaka:${params.wf.container_sha_medaka}-root" diff --git a/nextflow_schema.json b/nextflow_schema.json index a7c3acc..b043ea0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -80,8 +80,21 @@ "title": "Minimum mean read quality", "description": "Reads with a lower mean quality will be removed." }, + "drop_frac_longest_reads": { + "type": "number", + "default": 0.05, + "description": "Drop fraction of longest reads.", + "help_text": "The very longest reads might be concatemers or contain other artifacts. In many cases removing them simplifies de novo consensus generation." + }, + "take_longest_remaining_reads": { + "type": "boolean", + "default": true, + "description": "Whether to use the longest (remaining) reads.", + "help_text": "With this option, reads are not randomly selected during downsampling (potentially after the longest reads have been removed), but instead the longest remaining reads are taken. This generally improves performance on long amplicons." + }, "reads_downsampling_size": { "type": "integer", + "default": 0, "title": "Downsampling size", "description": "Downsample to this number of reads per sample.", "help_text": "Downsampling is performed after filtering. Set to 0 to disable downsampling." @@ -95,7 +108,7 @@ } }, "variant_calling_options": { - "title": "Variant calling options", + "title": "Variant Calling Options", "type": "object", "fa_icon": "fas fa-arrow-left", "description": "Parameters affecting the variant calling process.", @@ -145,6 +158,40 @@ } } }, + "de_novo_options": { + "title": "De-novo Consensus Options", + "type": "object", + "fa_icon": "far fa-question-circle", + "description": "Parameters for creating a de-novo consensus (without a reference).", + "properties": { + "force_spoa_length_threshold": { + "type": "integer", + "default": 2000, + "description": "Consensus length below which to force SPOA consensus generation.", + "help_text": "If the consensus generated by `miniasm` is shorter than this value, force consensus generation with SPOA (regardless of whether the sequence produced by `miniasm` passed QC or not). The rationale for this parameter is that `miniasm` sometimes gives slightly truncated assemblies for short amplicons from RBK data, whereas SPOA tends to be more robust in this regard." + }, + "spoa_minimum_relative_coverage": { + "type": "number", + "default": 0.15, + "minimum": 0, + "maximum": 1, + "description": "Minimum coverage (relative to the number of reads per sample after filtering) when constructing the consensus with SPOA.", + "help_text": "Needs to be a number between 0.0 and 1.0. The result of multiplying this number with the number of reads for the corresponding sample (after filtering) is passed on to SPOA's `--min-coverage` option." + }, + "minimum_mean_depth": { + "type": "integer", + "default": 30, + "description": "Mean depth threshold to pass consensus quality control.", + "help_text": "Draft consensus sequences with a lower average depth of coverage after re-aligning the input reads will fail QC." 
+ }, + "primary_alignments_threshold": { + "type": "number", + "default": 0.7, + "description": "Fraction of primary alignments to pass quality control.", + "help_text": "Draft consensus sequences with a lower fraction of primary alignments after re-aligning the input reads will fail QC." + } + } + }, "output": { "title": "Output Options", "type": "object", @@ -175,7 +222,7 @@ "number_depth_windows": { "type": "integer", "description": "Number of windows used during depth of coverage calculations.", - "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.", + "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained plots at the expense of run time.", "minimum": 10, "default": 100 }, @@ -184,14 +231,6 @@ "default": 75, "description": "Downsample each amplicon to this per-strand depth before running Medaka.", "help_text": "Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance." - }, - "spoa_minimum_relative_coverage": { - "type": "number", - "default": 0.15, - "minimum": 0, - "maximum": 1, - "description": "Minimum coverage (relative to the number of reads per sample after filtering) when constructing the consensus with SPOA.", - "help_text": "Needs to be a number between 0.0 and 1.0. The result of multiplying this number with the number of reads for the corresponding sample (after filtering) is passed on to SPOA's `--min-coverage` option." } } }, @@ -242,6 +281,9 @@ { "$ref": "#/definitions/variant_calling_options" }, + { + "$ref": "#/definitions/de_novo_options" + }, { "$ref": "#/definitions/output" }, @@ -273,7 +315,7 @@ } }, "docs": { - "intro": "## Introduction\n\nThis [Nextflow](https://www.nextflow.io/) workflow provides a simple way to\nanalyse Oxford Nanopore reads generated from amplicons.\n\nThe workflow requires raw reads in FASTQ format and can be run in two modes:\n* With a reference: After initial filtering (based on read length and quality)\n and adapter trimming, [minimap2](https://github.com/lh3/minimap2) is used to\n align the reads to a reference FASTA file (please note that the reference\n should only contain the expected sequences of the individual amplicons).\n Variants are then called with\n [Medaka](https://github.com/nanoporetech/medaka).\n* Without a reference: Like for the \"reference mode\", reads are first filtered\n and trimmed. Then, `medaka smolecule` is used to generate the consensus of the\n reads of each sample _de novo_. 
Reads are re-aligned against the consensus to\n produce coverage plots for the report.\n\nThe results of the workflow include an interactive HTML report, FASTA files with\nthe consensus sequences of the amplicons, BAM files with the alignments, and VCF\nfiles containing the variants (if run in \"reference mode\").\n", - "links": "\n## Useful links\n\n- [Nextflow](https://www.nextflow.io/)\n- [Docker](https://www.docker.com/products/docker-desktop)\n- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n- [minimap2](https://github.com/lh3/minimap2)\n- [Medaka](https://github.com/nanoporetech/medaka)\n- [Porechop](https://github.com/rrwick/Porechop)\n" + "intro": "## Introduction\n\nThis [Nextflow](https://www.nextflow.io/) workflow provides a simple way to\nanalyse Oxford Nanopore reads generated from amplicons.\n\nThe workflow requires raw reads in FASTQ format and can be run in two modes:\n* Variant calling mode: Trigger this mode by passing a reference FASTA file.\n After initial filtering (based on read length and quality) and adapter\n trimming, [minimap2](https://github.com/lh3/minimap2) is used to align the\n reads to the reference (please note that the reference should only contain the\n expected sequences of the individual amplicons). Variants are then called with\n [Medaka](https://github.com/nanoporetech/medaka). This mode allows for\n multiple amplicons per barcode (for details on how to map specific target\n amplicons to individual samples / barcodes, see below).\n* De-novo consensus mode: This mode is run when no reference file is passed.\n Like for the \"variant calling mode\", reads are first filtered and trimmed.\n Then, a consensus sequence is generated _de novo_ from the reads of each\n sample. Reads are re-aligned against the draft consensus for polishing with\n [Medaka](https://github.com/nanoporetech/medaka). Please note that only one\n amplicon per barcode is supported in de-novo consensus mode.\n\nThe results of the workflow include an interactive HTML report, FASTA files with\nthe consensus sequences of the amplicons, BAM files with the alignments, and VCF\nfiles containing the variants (if run in variant calling mode).\n", + "links": "\n## Useful links\n\n- [Nextflow](https://www.nextflow.io/)\n- [Docker](https://www.docker.com/products/docker-desktop)\n- [Singularity](https://docs.sylabs.io/guides/latest/user-guide/)\n- [minimap2](https://github.com/lh3/minimap2)\n- [miniasm](https://github.com/lh3/miniasm)\n- [SPOA](https://github.com/rvaser/spoa)\n- [Medaka](https://github.com/nanoporetech/medaka)\n- [Porechop](https://github.com/rrwick/Porechop)\n" } } \ No newline at end of file diff --git a/test_data/fastq/barcode04/reads.fastq.gz b/test_data/fastq-denovo/barcode01/reads.fastq.gz similarity index 100% rename from test_data/fastq/barcode04/reads.fastq.gz rename to test_data/fastq-denovo/barcode01/reads.fastq.gz diff --git a/test_data/fastq-denovo/barcode02/reads.fastq.gz b/test_data/fastq-denovo/barcode02/reads.fastq.gz new file mode 100644 index 0000000..f5873d6 Binary files /dev/null and b/test_data/fastq-denovo/barcode02/reads.fastq.gz differ
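For a concrete sense of the preprocessing defaults introduced above, the following is a hypothetical invocation of the de-novo consensus mode on the test data added in this change. It is illustrative only: it assumes a local checkout of the workflow, uses only parameter names defined in nextflow_schema.json above, and additional options (e.g. --basecaller_cfg) may be required in practice.

# hypothetical example: turn off the new length-based read selection and fall back
# to random downsampling of 1000 reads per sample
nextflow run . \
    --fastq test_data/fastq-denovo \
    --drop_frac_longest_reads 0 \
    --take_longest_remaining_reads false \
    --reads_downsampling_size 1000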