From e1fe03b97c1cec3e47e48213aa11d5e152f484ad Mon Sep 17 00:00:00 2001
From: Julian Libiseller-Egger
Date: Thu, 7 Sep 2023 08:36:12 +0000
Subject: [PATCH] downsample reads prior to Medaka [CW-2496]

---
 .gitlab-ci.yml                                 |  2 +-
 CHANGELOG.md                                   |  4 ++
 .../get_reads_with_longest_aligned_ref_len.py  | 57 +++++++++++++++++++
 modules/local/pipeline.nf                      | 34 +++++++++--
 nextflow.config                                |  3 +-
 nextflow_schema.json                           |  6 ++
 6 files changed, 100 insertions(+), 6 deletions(-)
 create mode 100755 bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3b31606..ab8d83b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -48,7 +48,7 @@ docker-run:
               NF_PROCESS_FILES: >
                 main.nf
                 modules/local/pipeline.nf
-              NF_IGNORE_PROCESSES: downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
+              NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
               AFTER_NEXTFLOW_CMD: |
                 grep 'No reads left after pre-processing' .nextflow.log
                 && grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fa4b59..48d405b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.4]
+### Changed
+- The workflow now downsamples the reads for each amplicon to a depth range suitable for Medaka.
+
 ## [v0.3.3]
 ### Added
 - MacOS ARM64 support.
diff --git a/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py b/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
new file mode 100755
index 0000000..857a74e
--- /dev/null
+++ b/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
@@ -0,0 +1,57 @@
+"""Get IDs of reads with largest `aligned_ref_len` from `bamstats` results TSV file."""
+import sys
+
+import pandas as pd
+
+from .util import get_named_logger, wf_parser  # noqa: ABS101
+
+
+def argparser():
+    """Argument parser for entrypoint."""
+    parser = wf_parser("get_reads_with_longest_aligned_ref")
+    parser.add_argument(
+        "bamstats_results",
+        help="`bamstats` results TSV file.",
+    )
+    parser.add_argument(
+        "num_reads_per_strand",
+        help="Number of reads to select.",
+        type=int,
+    )
+    return parser
+
+
+def main(args):
+    """Run the entry point."""
+    logger = get_named_logger("longAlnRds")
+    logger.info(
+        f"Extract IDs of {args.num_reads_per_strand} reads with "
+        "longest `aligned_ref_len`."
+    )
+
+    # we only load the relevant columns and use categorical dtype where possible to
+    # reduce the memory footprint
+    dtypes = {
+        "name": str,
+        "ref": "category",
+        "direction": "category",
+        "aligned_ref_len": int,
+    }
+    df = pd.read_csv(
+        args.bamstats_results, sep="\t", usecols=dtypes.keys(), dtype=dtypes
+    ).query('ref != "*"')
+
+    # for each ref--direction combo, write out the read IDs of the longest
+    # `args.num_reads_per_strand` reads
+    df.groupby(["ref", "direction"]).apply(
+        lambda grp_df: sys.stdout.write(
+            "\n".join(
+                grp_df.sort_values("aligned_ref_len", ascending=False)["name"].head(
+                    args.num_reads_per_strand
+                )
+            )
+            + "\n"
+        )
+    )
+
+    logger.info("Finished writing read IDs with longest `aligned_ref_len` to STDOUT.")
diff --git a/modules/local/pipeline.nf b/modules/local/pipeline.nf
index 2c41d01..adb523e 100644
--- a/modules/local/pipeline.nf
+++ b/modules/local/pipeline.nf
@@ -28,6 +28,24 @@ process bamstats {
     """
 }
 
+process downsampleBAMforMedaka {
+    label "wfamplicon"
+    cpus 1
+    input:
+        tuple val(meta), path("input.bam"), path("input.bam.bai"), path("bamstats.tsv")
+    output: tuple val(meta), path("downsampled.bam"), path("downsampled.bam.bai")
+    script:
+    """
+    workflow-glue get_reads_with_longest_aligned_ref_len \
+        bamstats.tsv \
+        $params.medaka_target_depth_per_strand \
+    > downsampled.read_IDs
+
+    samtools view input.bam -N downsampled.read_IDs -o downsampled.bam
+    samtools index downsampled.bam
+    """
+}
+
 process mosdepth {
     label "wfamplicon"
     cpus Math.min(params.threads, 3)
@@ -198,13 +216,24 @@ workflow pipeline {
        // align to reference
        alignReads(ch_reads, ref)
 
+       // get mapping stats for report and pre-Medaka downsampling
+       bamstats(alignReads.out)
+
+       // downsample to target depth before running Medaka
+       alignReads.out
+       | join(bamstats.out)
+       | map { meta, bam, bai, bamstats, flagstat ->
+           [meta, bam, bai, bamstats]
+       }
+       | downsampleBAMforMedaka
+
        // get the seq IDs of the amplicons from the ref file
        def ch_amplicon_seqIDs = ref | splitFasta(record: [id: true]) | map { it.id }
 
        // run medaka consensus (the process will run once for each sample--amplicon
        // combination)
        def ch_medaka_consensus_probs = medakaConsensus(
-           alignReads.out | combine(ch_amplicon_seqIDs),
+           downsampleBAMforMedaka.out | combine(ch_amplicon_seqIDs),
            medaka_model
        ) | groupTuple
 
@@ -215,9 +244,6 @@
            params.min_coverage
        )
 
-       // get mapping stats for report
-       bamstats(alignReads.out)
-
        // run mosdepth on each sample--amplicon combination
        mosdepth(
            alignReads.out | combine(ch_amplicon_seqIDs),
diff --git a/nextflow.config b/nextflow.config
index c23b403..382fc27 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -31,6 +31,7 @@ params {
     min_coverage = 20
     basecaller_cfg = "dna_r10.4.1_e8.2_400bps_sup@v4.2.0"
     medaka_model = null
+    medaka_target_depth_per_strand = 75
 
     // misc
     help = false
@@ -53,7 +54,7 @@ params {
            "--fastq 'wf-amplicon-demo/fastq'",
            "--reference 'wf-amplicon-demo/reference.fa'"
        ]
-        common_sha = "sha5fc720674a26bc63a6f31ed186344209175b54b1"
+        common_sha = "sha0a6dc21fac17291f4acb2e0f67bcdec7bf63e6b7"
         container_sha = "sha2699195046454bce4411b7b781457eabe3d5ee52"
         container_sha_medaka = "sha492176b6093aa922aae5610f6ad899aacd2c3c7f"
         agent = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d5eee09..320f5de 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -173,6 +173,12 @@
                     "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.",
                     "min": 10,
                     "default": 100
+                },
+                "medaka_target_depth_per_strand": {
+                    "type": "integer",
+                    "default": 75,
+                    "description": "Downsample each amplicon to this per-strand depth before running Medaka.",
+                    "help_text": "Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid excessive coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might degrade performance."
                 }
             }
         },
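
Note (illustration, not part of the patch): the sketch below mimics the per-strand selection that `get_reads_with_longest_aligned_ref_len.py` performs, on a toy `bamstats`-style table, so the effect of `medaka_target_depth_per_strand` is easy to see. The read and amplicon names and `num_reads_per_strand = 1` are invented for the example, and an equivalent sort-then-head formulation is used instead of the script's groupby-apply.

# Illustrative sketch only -- not part of the patch above.
import io

import pandas as pd

# toy `bamstats`-like table with the columns the script uses
toy_tsv = io.StringIO(
    "name\tref\tdirection\taligned_ref_len\n"
    "r1\tampliconA\t+\t900\n"
    "r2\tampliconA\t+\t950\n"
    "r3\tampliconA\t-\t800\n"
    "r4\tampliconA\t-\t990\n"
    "r5\t*\t+\t0\n"  # unmapped record; dropped by the `ref != "*"` filter
)
num_reads_per_strand = 1  # workflow default is 75 (`--medaka_target_depth_per_strand`)

df = pd.read_csv(toy_tsv, sep="\t").query('ref != "*"')

# keep the `num_reads_per_strand` longest-aligning reads per (ref, direction) combination
selected = (
    df.sort_values("aligned_ref_len", ascending=False)
    .groupby(["ref", "direction"])
    .head(num_reads_per_strand)
)

# these IDs would end up in `downsampled.read_IDs` and be handed to
# `samtools view -N` by the downsampleBAMforMedaka process
print("\n".join(selected["name"]))  # prints r4 and r2 (one read per strand)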