From e1fe03b97c1cec3e47e48213aa11d5e152f484ad Mon Sep 17 00:00:00 2001
From: Julian Libiseller-Egger
Date: Thu, 7 Sep 2023 08:36:12 +0000
Subject: [PATCH] downsample reads prior to Medaka [CW-2496]

---
 .gitlab-ci.yml                                 |  2 +-
 CHANGELOG.md                                   |  4 ++
 .../get_reads_with_longest_aligned_ref_len.py  | 57 +++++++++++++++++++
 modules/local/pipeline.nf                      | 34 +++++++++--
 nextflow.config                                |  3 +-
 nextflow_schema.json                           |  6 ++
 6 files changed, 100 insertions(+), 6 deletions(-)
 create mode 100755 bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3b31606..ab8d83b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -48,7 +48,7 @@ docker-run:
               NF_PROCESS_FILES: >
                 main.nf
                 modules/local/pipeline.nf
-              NF_IGNORE_PROCESSES: downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
+              NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
               AFTER_NEXTFLOW_CMD: |
                 grep 'No reads left after pre-processing' .nextflow.log
                 && grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4fa4b59..48d405b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.4]
+### Changed
+- The workflow now downsamples the reads for each amplicon to a depth range suitable for Medaka.
+
 ## [v0.3.3]
 ### Added
 - MacOS ARM64 support.
diff --git a/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py b/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
new file mode 100755
index 0000000..857a74e
--- /dev/null
+++ b/bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
@@ -0,0 +1,57 @@
+"""Get IDs of reads with largest `aligned_ref_len` from `bamstats` results TSV file."""
+import sys
+
+import pandas as pd
+
+from .util import get_named_logger, wf_parser  # noqa: ABS101
+
+
+def argparser():
+    """Argument parser for entrypoint."""
+    parser = wf_parser("get_reads_with_longest_aligned_ref")
+    parser.add_argument(
+        "bamstats_results",
+        help="`bamstats` results TSV file.",
+    )
+    parser.add_argument(
+        "num_reads_per_strand",
+        help="Number of reads to select.",
+        type=int,
+    )
+    return parser
+
+
+def main(args):
+    """Run the entry point."""
+    logger = get_named_logger("longAlnRds")
+    logger.info(
+        f"Extract IDs of {args.num_reads_per_strand} reads with "
+        "longest `aligned_ref_len`."
+    )
+
+    # we only load the relevant columns and use categorical dtype where possible to
+    # reduce the memory footprint
+    dtypes = {
+        "name": str,
+        "ref": "category",
+        "direction": "category",
+        "aligned_ref_len": int,
+    }
+    df = pd.read_csv(
+        args.bamstats_results, sep="\t", usecols=dtypes.keys(), dtype=dtypes
+    ).query('ref != "*"')
+
+    # for each ref--direction combo, write out the read IDs of the longest
+    # `args.num_reads_per_strand` reads
+    df.groupby(["ref", "direction"]).apply(
+        lambda grp_df: sys.stdout.write(
+            "\n".join(
+                grp_df.sort_values("aligned_ref_len", ascending=False)["name"].head(
+                    args.num_reads_per_strand
+                )
+            )
+            + "\n"
+        )
+    )
+
+    logger.info("Finished writing read IDs with longest `aligned_ref_len` to STDOUT.")
diff --git a/modules/local/pipeline.nf b/modules/local/pipeline.nf
index 2c41d01..adb523e 100644
--- a/modules/local/pipeline.nf
+++ b/modules/local/pipeline.nf
@@ -28,6 +28,24 @@ process bamstats {
     """
 }
 
+process downsampleBAMforMedaka {
+    label "wfamplicon"
+    cpus 1
+    input:
+        tuple val(meta), path("input.bam"), path("input.bam.bai"), path("bamstats.tsv")
+    output: tuple val(meta), path("downsampled.bam"), path("downsampled.bam.bai")
+    script:
+    """
+    workflow-glue get_reads_with_longest_aligned_ref_len \
+        bamstats.tsv \
+        $params.medaka_target_depth_per_strand \
+    > downsampled.read_IDs
+
+    samtools view input.bam -N downsampled.read_IDs -o downsampled.bam
+    samtools index downsampled.bam
+    """
+}
+
 process mosdepth {
     label "wfamplicon"
     cpus Math.min(params.threads, 3)
@@ -198,13 +216,24 @@ workflow pipeline {
        // align to reference
        alignReads(ch_reads, ref)
 
+       // get mapping stats for report and pre-Medaka downsampling
+       bamstats(alignReads.out)
+
+       // downsample to target depth before running Medaka
+       alignReads.out
+       | join(bamstats.out)
+       | map { meta, bam, bai, bamstats, flagstat ->
+           [meta, bam, bai, bamstats]
+       }
+       | downsampleBAMforMedaka
+
        // get the seq IDs of the amplicons from the ref file
        def ch_amplicon_seqIDs = ref | splitFasta(record: [id: true]) | map { it.id }
 
        // run medaka consensus (the process will run once for each sample--amplicon
        // combination)
        def ch_medaka_consensus_probs = medakaConsensus(
-           alignReads.out | combine(ch_amplicon_seqIDs),
+           downsampleBAMforMedaka.out | combine(ch_amplicon_seqIDs),
            medaka_model
        ) | groupTuple
 
@@ -215,9 +244,6 @@
            params.min_coverage
        )
 
-       // get mapping stats for report
-       bamstats(alignReads.out)
-
        // run mosdepth on each sample--amplicon combination
        mosdepth(
            alignReads.out | combine(ch_amplicon_seqIDs),
diff --git a/nextflow.config b/nextflow.config
index c23b403..382fc27 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -31,6 +31,7 @@ params {
     min_coverage = 20
     basecaller_cfg = "dna_r10.4.1_e8.2_400bps_sup@v4.2.0"
     medaka_model = null
+    medaka_target_depth_per_strand = 75
 
     // misc
     help = false
@@ -53,7 +54,7 @@ params {
            "--fastq 'wf-amplicon-demo/fastq'",
            "--reference 'wf-amplicon-demo/reference.fa'"
        ]
-        common_sha = "sha5fc720674a26bc63a6f31ed186344209175b54b1"
+        common_sha = "sha0a6dc21fac17291f4acb2e0f67bcdec7bf63e6b7"
         container_sha = "sha2699195046454bce4411b7b781457eabe3d5ee52"
         container_sha_medaka = "sha492176b6093aa922aae5610f6ad899aacd2c3c7f"
         agent = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index d5eee09..320f5de 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -173,6 +173,12 @@
                     "help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.",
                     "min": 10,
                     "default": 100
+                },
+                "medaka_target_depth_per_strand": {
+                    "type": "integer",
+                    "default": 75,
+                    "description": "Downsample each amplicon to this per-strand depth before running Medaka.",
+                    "help_text": "Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid excessive coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might degrade performance."
                 }
             }
         },
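
Note (illustration, not part of the patch): the sketch below mimics the per-strand selection that `get_reads_with_longest_aligned_ref_len.py` performs, on a toy `bamstats`-style table, so the effect of `medaka_target_depth_per_strand` is easy to see. The read and amplicon names and `num_reads_per_strand = 1` are invented for the example, and an equivalent sort-then-head formulation is used instead of the script's groupby-apply.

# Illustrative sketch only -- not part of the patch above.
import io

import pandas as pd

# toy `bamstats`-like table with the columns the script uses
toy_tsv = io.StringIO(
    "name\tref\tdirection\taligned_ref_len\n"
    "r1\tampliconA\t+\t900\n"
    "r2\tampliconA\t+\t950\n"
    "r3\tampliconA\t-\t800\n"
    "r4\tampliconA\t-\t990\n"
    "r5\t*\t+\t0\n"  # unmapped record; dropped by the `ref != "*"` filter
)
num_reads_per_strand = 1  # workflow default is 75 (`--medaka_target_depth_per_strand`)

df = pd.read_csv(toy_tsv, sep="\t").query('ref != "*"')

# keep the `num_reads_per_strand` longest-aligning reads per (ref, direction) combination
selected = (
    df.sort_values("aligned_ref_len", ascending=False)
    .groupby(["ref", "direction"])
    .head(num_reads_per_strand)
)

# these IDs would end up in `downsampled.read_IDs` and be handed to
# `samtools view -N` by the downsampleBAMforMedaka process
print("\n".join(selected["name"]))  # prints r4 and r2 (one read per strand)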