Merge branch 'CW-2496' into 'dev'

downsample reads prior to Medaka [CW-2496] See merge request epi2melabs/workflows/wf-amplicon!17
epi2me-labs · Sep 7, 2023 · a5fdf87 · a5fdf87
2 parents 082c719 + e1fe03b
commit a5fdf87
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 6 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -48,7 +48,7 @@ docker-run:
         NF_PROCESS_FILES: >
           main.nf
           modules/local/pipeline.nf
-        NF_IGNORE_PROCESSES: downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
+        NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
         AFTER_NEXTFLOW_CMD: |
           grep 'No reads left after pre-processing' .nextflow.log &&
           grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.4]
+### Changed
+- The workflow now downsamples reads for each amplicon to be in the suitable depth range for Medaka.
+
 ## [v0.3.3]
 ### Added
 - MacOS ARM64 support.

diff --git a/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py b/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
@@ -0,0 +1,57 @@
+"""Get IDs of reads with largest `aligned_ref_len` from `bamstats` results TSV file."""
+import sys
+
+import pandas as pd
+
+from .util import get_named_logger, wf_parser  # noqa: ABS101
+
+
+def argparser():
+    """Argument parser for entrypoint."""
+    parser = wf_parser("get_reads_with_longest_aligned_ref")
+    parser.add_argument(
+        "bamstats_results",
+        help="`bamstats` results TSV file.",
+    )
+    parser.add_argument(
+        "num_reads_per_strand",
+        help="Number of reads to select.",
+        type=int,
+    )
+    return parser
+
+
+def main(args):
+    """Run the entry point."""
+    logger = get_named_logger("longAlnRds")
+    logger.info(
+        f"Extract IDs of {args.num_reads_per_strand} reads with "
+        "longest `aligned_ref_len`."
+    )
+
+    # we only load the relevant columns and use categorical dtype where possible to
+    # reduce the memory footprint
+    dtypes = {
+        "name": str,
+        "ref": "category",
+        "direction": "category",
+        "aligned_ref_len": int,
+    }
+    df = pd.read_csv(
+        args.bamstats_results, sep="\t", usecols=dtypes.keys(), dtype=dtypes
+    ).query('ref != "*"')
+
+    # for each ref--direction combo, write out the read IDs of the longest
+    # `args.num_reads_per_strand` reads
+    df.groupby(["ref", "direction"]).apply(
+        lambda grp_df: sys.stdout.write(
+            "\n".join(
+                grp_df.sort_values("aligned_ref_len", ascending=False)["name"].head(
+                    args.num_reads_per_strand
+                )
+            )
+            + "\n"
+        )
+    )
+
+    logger.info("Finished writing read IDs with longest `aligned_ref_len` to STDOUT.")
diff --git a/modules/local/pipeline.nf b/modules/local/pipeline.nf
@@ -28,6 +28,24 @@ process bamstats {
     """
 }
 
+process downsampleBAMforMedaka {
+    label "wfamplicon"
+    cpus 1
+    input:
+        tuple val(meta), path("input.bam"), path("input.bam.bai"), path("bamstats.tsv")
+    output: tuple val(meta), path("downsampled.bam"), path("downsampled.bam.bai")
+    script:
+    """
+    workflow-glue get_reads_with_longest_aligned_ref_len \
+        bamstats.tsv \
+        $params.medaka_target_depth_per_strand\
+    > downsampled.read_IDs
+
+    samtools view input.bam -N downsampled.read_IDs -o downsampled.bam
+    samtools index downsampled.bam
+    """
+}
+
 process mosdepth {
     label "wfamplicon"
     cpus Math.min(params.threads, 3)
@@ -198,13 +216,24 @@ workflow pipeline {
         // align to reference
         alignReads(ch_reads, ref)
 
+        // get mapping stats for report and pre-Medaka downsampling
+        bamstats(alignReads.out)
+
+        // downsample to target depth before running Medaka
+        alignReads.out
+        | join(bamstats.out)
+        | map { meta, bam, bai, bamstats, flagstat ->
+            [meta, bam, bai, bamstats]
+        }
+        | downsampleBAMforMedaka
+
         // get the seq IDs of the amplicons from the ref file
         def ch_amplicon_seqIDs = ref | splitFasta(record: [id: true]) | map { it.id }
 
         // run medaka consensus (the process will run once for each sample--amplicon
         // combination)
         def ch_medaka_consensus_probs = medakaConsensus(
-            alignReads.out | combine(ch_amplicon_seqIDs),
+            downsampleBAMforMedaka.out | combine(ch_amplicon_seqIDs),
             medaka_model
         ) | groupTuple
 
@@ -215,9 +244,6 @@ workflow pipeline {
             params.min_coverage
         )
 
-        // get mapping stats for report
-        bamstats(alignReads.out)
-
         // run mosdepth on each sample--amplicon combination
         mosdepth(
             alignReads.out | combine(ch_amplicon_seqIDs),

diff --git a/nextflow.config b/nextflow.config
@@ -31,6 +31,7 @@ params {
     min_coverage = 20
     basecaller_cfg = "[email protected]"
     medaka_model = null
+    medaka_target_depth_per_strand = 75
 
     // misc
     help = false
@@ -53,7 +54,7 @@ params {
             "--fastq 'wf-amplicon-demo/fastq'",
             "--reference 'wf-amplicon-demo/reference.fa'"
         ]
-        common_sha = "sha5fc720674a26bc63a6f31ed186344209175b54b1"
+        common_sha = "sha0a6dc21fac17291f4acb2e0f67bcdec7bf63e6b7"
         container_sha = "sha2699195046454bce4411b7b781457eabe3d5ee52"
         container_sha_medaka = "sha492176b6093aa922aae5610f6ad899aacd2c3c7f"
         agent = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -173,6 +173,12 @@
                     "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.",
                     "min": 10,
                     "default": 100
+                },
+                "medaka_target_depth_per_strand": {
+                    "type": "integer",
+                    "default": 75,
+                    "description": "Downsample each amplicon to this per-strand depth before running Medaka.",
+                    "help_text": "Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance."
                 }
             }
         },