From 869e4fe928817e37c099ca39e739da0874212909 Mon Sep 17 00:00:00 2001
From: Julian Libiseller-Egger <julian.libiseller-egger@nanoporetech.com>
Date: Fri, 16 Feb 2024 09:42:02 +0000
Subject: [PATCH] better sanitisation of ref seq IDs [CW-3411]

---
 CHANGELOG.md                     |  7 +++++++
 modules/local/variant-calling.nf | 11 ++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 75fa884..10ae7ca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [unreleased]
+### Fixed
+- The workflow failing when there were tab characters in the FASTA header lines of reference sequences.
+
+### Changed
+- The way the reference sequence IDs are sanitised to prevent issues with special characters.
+
 ## [v1.0.3]
 ### Fixed
 - The workflow failing when there was a whitespace in the name of the reference file.
diff --git a/modules/local/variant-calling.nf b/modules/local/variant-calling.nf
index bb2e603..c4648e7 100644
--- a/modules/local/variant-calling.nf
+++ b/modules/local/variant-calling.nf
@@ -18,7 +18,9 @@ process sanitizeRefFile {
     output: path "reference_sanitized_seqIDs.fasta"
     script:
     """
-    sed '/^>/s/:\\|\\*\\| /_/g' reference.fasta > reference_sanitized_seqIDs.fasta
+    # use `sed` to replace each run of non-alphanumeric characters in header lines
+    # with a single underscore (`/2g` skips the first match, which starts with `>`)
+    sed -E '/^>/s/[^[:alnum:]]+/_/2g' reference.fasta > reference_sanitized_seqIDs.fasta
     """
 }
 
@@ -175,8 +177,11 @@ workflow pipeline {
         // subset the sanitized ref file
         ref_id_map = Channel.empty()
         | concat(
-            Channel.of(ref).splitFasta(record: [id: true]).map{ it.id }.collect(),
-            san_ref.splitFasta(record: [id: true]).map{ it.id }.collect()
+            // `splitFasta(record: [id: true])` does not split the header line at tab
+            // characters. We thus split again here to make sure that we only get the
+            // seq ID
+            Channel.of(ref).splitFasta(record: [id: true]).map{ it.id.split()[0] }.collect(),
+            san_ref.splitFasta(record: [id: true]).map{ it.id.split()[0] }.collect()
         )
         | toList
         | map { it.transpose().collectEntries() as LinkedHashMap }