Skip to content

Commit

Permalink
Merge branch 'CW-2496' into 'dev'
Browse files Browse the repository at this point in the history
downsample reads prior to Medaka [CW-2496]

See merge request epi2melabs/workflows/wf-amplicon!17
  • Loading branch information
julibeg committed Sep 7, 2023
2 parents 082c719 + e1fe03b commit a5fdf87
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ docker-run:
NF_PROCESS_FILES: >
main.nf
modules/local/pipeline.nf
NF_IGNORE_PROCESSES: downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
NF_IGNORE_PROCESSES: downsampleBAMforMedaka,downsampleReads,alignReads,bamstats,concatMosdepthResultFiles,medakaConsensus,medakaVariant,mergeBAMs,mergeVCFs,mosdepth
AFTER_NEXTFLOW_CMD: |
grep 'No reads left after pre-processing' .nextflow.log &&
grep 'only a limited report is available' $$PWD/$$CI_PROJECT_NAME/*html
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.3.4]
### Changed
- The workflow now downsamples reads for each amplicon to be in the suitable depth range for Medaka.

## [v0.3.3]
### Added
- MacOS ARM64 support.
Expand Down
57 changes: 57 additions & 0 deletions bin/workflow_glue/get_reads_with_longest_aligned_ref_len.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Get IDs of reads with largest `aligned_ref_len` from `bamstats` results TSV file."""
import sys

import pandas as pd

from .util import get_named_logger, wf_parser # noqa: ABS101


def argparser():
"""Argument parser for entrypoint."""
parser = wf_parser("get_reads_with_longest_aligned_ref")
parser.add_argument(
"bamstats_results",
help="`bamstats` results TSV file.",
)
parser.add_argument(
"num_reads_per_strand",
help="Number of reads to select.",
type=int,
)
return parser


def main(args):
"""Run the entry point."""
logger = get_named_logger("longAlnRds")
logger.info(
f"Extract IDs of {args.num_reads_per_strand} reads with "
"longest `aligned_ref_len`."
)

# we only load the relevant columns and use categorical dtype where possible to
# reduce the memory footprint
dtypes = {
"name": str,
"ref": "category",
"direction": "category",
"aligned_ref_len": int,
}
df = pd.read_csv(
args.bamstats_results, sep="\t", usecols=dtypes.keys(), dtype=dtypes
).query('ref != "*"')

# for each ref--direction combo, write out the read IDs of the longest
# `args.num_reads_per_strand` reads
df.groupby(["ref", "direction"]).apply(
lambda grp_df: sys.stdout.write(
"\n".join(
grp_df.sort_values("aligned_ref_len", ascending=False)["name"].head(
args.num_reads_per_strand
)
)
+ "\n"
)
)

logger.info("Finished writing read IDs with longest `aligned_ref_len` to STDOUT.")
34 changes: 30 additions & 4 deletions modules/local/pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,24 @@ process bamstats {
"""
}

process downsampleBAMforMedaka {
label "wfamplicon"
cpus 1
input:
tuple val(meta), path("input.bam"), path("input.bam.bai"), path("bamstats.tsv")
output: tuple val(meta), path("downsampled.bam"), path("downsampled.bam.bai")
script:
"""
workflow-glue get_reads_with_longest_aligned_ref_len \
bamstats.tsv \
$params.medaka_target_depth_per_strand\
> downsampled.read_IDs
samtools view input.bam -N downsampled.read_IDs -o downsampled.bam
samtools index downsampled.bam
"""
}

process mosdepth {
label "wfamplicon"
cpus Math.min(params.threads, 3)
Expand Down Expand Up @@ -198,13 +216,24 @@ workflow pipeline {
// align to reference
alignReads(ch_reads, ref)

// get mapping stats for report and pre-Medaka downsampling
bamstats(alignReads.out)

// downsample to target depth before running Medaka
alignReads.out
| join(bamstats.out)
| map { meta, bam, bai, bamstats, flagstat ->
[meta, bam, bai, bamstats]
}
| downsampleBAMforMedaka

// get the seq IDs of the amplicons from the ref file
def ch_amplicon_seqIDs = ref | splitFasta(record: [id: true]) | map { it.id }

// run medaka consensus (the process will run once for each sample--amplicon
// combination)
def ch_medaka_consensus_probs = medakaConsensus(
alignReads.out | combine(ch_amplicon_seqIDs),
downsampleBAMforMedaka.out | combine(ch_amplicon_seqIDs),
medaka_model
) | groupTuple

Expand All @@ -215,9 +244,6 @@ workflow pipeline {
params.min_coverage
)

// get mapping stats for report
bamstats(alignReads.out)

// run mosdepth on each sample--amplicon combination
mosdepth(
alignReads.out | combine(ch_amplicon_seqIDs),
Expand Down
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ params {
min_coverage = 20
basecaller_cfg = "[email protected]"
medaka_model = null
medaka_target_depth_per_strand = 75

// misc
help = false
Expand All @@ -53,7 +54,7 @@ params {
"--fastq 'wf-amplicon-demo/fastq'",
"--reference 'wf-amplicon-demo/reference.fa'"
]
common_sha = "sha5fc720674a26bc63a6f31ed186344209175b54b1"
common_sha = "sha0a6dc21fac17291f4acb2e0f67bcdec7bf63e6b7"
container_sha = "sha2699195046454bce4411b7b781457eabe3d5ee52"
container_sha_medaka = "sha492176b6093aa922aae5610f6ad899aacd2c3c7f"
agent = null
Expand Down
6 changes: 6 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@
"help_text": "Depth of coverage is calculated for each sample across each amplicon split into this number of windows. A higher number will produce more fine-grained results at the expense of run time. Values of 100-200 should be suitable in the vast majority of cases.",
"min": 10,
"default": 100
},
"medaka_target_depth_per_strand": {
"type": "integer",
"default": 75,
"description": "Downsample each amplicon to this per-strand depth before running Medaka.",
"help_text": "Medaka performs best with even strand coverage and depths between 80X and 400X. To avoid too high coverage, the workflow downsamples the reads for each amplicon to this per-strand depth before running Medaka. Changing this value is discouraged as it might cause decreased performance."
}
}
},
Expand Down

0 comments on commit a5fdf87

Please sign in to comment.