From 869e4fe928817e37c099ca39e739da0874212909 Mon Sep 17 00:00:00 2001 From: Julian Libiseller-Egger Date: Fri, 16 Feb 2024 09:42:02 +0000 Subject: [PATCH] better sanitisation of ref seq IDs [CW-3411] --- CHANGELOG.md | 7 +++++++ modules/local/variant-calling.nf | 11 ++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75fa884..10ae7ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [unreleased] +### Fixed +- The workflow failing when there were tab characters in the FASTA header lines of reference sequences. + +### Changed +- The way the reference sequence IDs are sanitised to prevent issues with special characters. + ## [v1.0.3] ### Fixed - The workflow failing when there was a whitespace in the name of the reference file. diff --git a/modules/local/variant-calling.nf b/modules/local/variant-calling.nf index bb2e603..c4648e7 100644 --- a/modules/local/variant-calling.nf +++ b/modules/local/variant-calling.nf @@ -18,7 +18,9 @@ process sanitizeRefFile { output: path "reference_sanitized_seqIDs.fasta" script: """ - sed '/^>/s/:\\|\\*\\| /_/g' reference.fasta > reference_sanitized_seqIDs.fasta + # use `sed` to replace all non-alphanumerical characters with underscores (`/2g` + # skips the first match which will be `>`) + sed -E '/^>/s/[^[:alnum:]]+/_/2g' reference.fasta > reference_sanitized_seqIDs.fasta """ } @@ -175,8 +177,11 @@ workflow pipeline { // subset the sanitized ref file ref_id_map = Channel.empty() | concat( - Channel.of(ref).splitFasta(record: [id: true]).map{ it.id }.collect(), - san_ref.splitFasta(record: [id: true]).map{ it.id }.collect() + // `splitFasta(record: [id: true])` does not split the header line at tab + // characters. 
We thus split again here to make sure that we only got the + // seq ID + Channel.of(ref).splitFasta(record: [id: true]).map{ it.id.split()[0] }.collect(), + san_ref.splitFasta(record: [id: true]).map{ it.id.split()[0] }.collect() ) | toList | map { it.transpose().collectEntries() as LinkedHashMap }