From c19d224525af1447e233cf635c35315f73c9f0d5 Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Fri, 12 Jan 2024 16:02:54 +0100
Subject: [PATCH 1/7] feat: tfkmers. If strand undefined, extract both
 extremities

---
 bin/extract_tss.py | 48 ++++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/bin/extract_tss.py b/bin/extract_tss.py
index 529e7ba..8d2b974 100755
--- a/bin/extract_tss.py
+++ b/bin/extract_tss.py
@@ -15,26 +15,42 @@ def get_tss_interval(transcript, length):
     return start, end
 
 
+def get_interval_record(tx, length):
+    start, end = get_tss_interval(tx, length)
+    if start > 0:
+        return (
+            tx.seqname,
+            start - 1,
+            end,
+            f"{tx['gene_id']}::{tx['transcript_id']}::{tx.strand}",
+            tx.score,
+            tx.strand,
+        )
+    else:
+        print(
+            f"{tx['transcript_id']} skipped: Start Coordinate detected < 0.",
+            file=sys.stderr,
+        )
+    return None
+
+
 def get_intervals(transcripts, length):
     intervals = []
     for transcript in transcripts:
-        start, end = get_tss_interval(transcript, length)
-        if start > 0:
-            intervals.append(
-                (
-                    transcript.seqname,
-                    start - 1,
-                    end,
-                    f"{transcript['gene_id']}::{transcript['transcript_id']}",
-                    transcript.score,
-                    transcript.strand,
-                )
-            )
+        # If defined strand, test only TSS
+        if transcript.strand in ("+", "-"):
+            interval_record = get_interval_record(transcript, length)
+            if interval_record is not None:
+                intervals.append(interval_record)
+
+        # If undefined strand, simulate and test both extremities
         else:
-            print(
-                f"{transcript['transcript_id']} skipped: Start Coordinate detected < 0.",
-                file=sys.stderr,
-            )
+            for strand in ("+", "-"):
+                transcript.strand = strand
+                interval_record = get_interval_record(transcript, length)
+                if interval_record is not None:
+                    intervals.append(interval_record)
+
     return intervals
 
 

From 3a39e48fe04a3a87fb135b19e0f7b10cb979a8cc Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Fri, 12 Jan 2024 16:03:33 +0100
Subject: [PATCH 2/7] feat: if strand undefined, restrand correct strand based
 on tfkmers prob

---
 bin/filter_gtf_ndr.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
index e6bba39..7811daa 100755
--- a/bin/filter_gtf_ndr.py
+++ b/bin/filter_gtf_ndr.py
@@ -1,5 +1,6 @@
-#! /usr/bin/env python3 
-from typing import Set
+#! /usr/bin/env python3
+from typing import Dict, Set, Tuple
+
 from GTF import GTF
 
 
@@ -9,37 +10,47 @@ def parse_bambu(line):
 
 def parse_tfkmers(line):
     ids = line[0].split("::")
-    return ids[1], ids[0], line[1]
+    return ids[1], ids[0], line[1], ids[2]
 
 
-def parse_ndr(csv, origin, th) -> Set[str]:
+def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]:
     s = set()
+    strand_dict = dict()
 
     # Skip header
     next(csv)
 
+    strand = None
     for line in csv:
         line = line.split(",")
 
         if origin == "bambu":
-            line = parse_bambu(line)
+            tx_id, _, ndr = parse_bambu(line)
         elif origin == "tfkmers":
-            line = parse_tfkmers(line)
+            tx_id, _, ndr, strand = parse_tfkmers(line)
+        else:
+            exit("Unknown method")
 
-        tx_id, _, ndr = line
         ndr = float(ndr)
 
         if ndr < th:
             s.add(tx_id.lower())
 
-    return s
+            # Extract strand from sequence name to restrand GTF records
+            if origin == "tfkmers":
+                strand_dict[tx_id] = strand
+
+    return s, strand_dict
 
 
 def filter_count_matrix(file, transcripts, wr):
     print(next(file), file=wr)
     for line in file:
         line_splitted = line.split("\t")
-        if line_splitted[0].startswith("BambuTx") and line_splitted[0].lower() not in transcripts:
+        if (
+            line_splitted[0].startswith("BambuTx")
+            and line_splitted[0].lower() not in transcripts
+        ):
             continue
         print(line.rstrip(), file=wr)
 
@@ -99,8 +110,10 @@ def filter_count_matrix(file, transcripts, wr):
     args = parser.parse_args()
 
     ###################################################
-    filter_bambu = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
-    filter_tfkmers = parse_ndr(args.tfkmers, "tfkmers", args.tfkmers_threshold)
+    filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold)
+    filter_tfkmers, strand_dict = parse_ndr(
+        args.tfkmers, "tfkmers", args.tfkmers_threshold
+    )
 
     if args.operation == "union":
         filter = filter_bambu | filter_tfkmers
@@ -110,6 +123,7 @@ def filter_count_matrix(file, transcripts, wr):
     with open("unformat.novel.filter.gtf", "w") as wr:
         for record in GTF.parse_by_line(args.gtf):
             if "transcript_id" in record and record["transcript_id"].lower() in filter:
+                record.strand = strand_dict[record["transcript_id"]]
                 print(record, file=wr)
 
     with open("counts_transcript.filter.txt", "w") as wr:

From 925f1b7de9d22ff74dfc31d9ceafd1ea3e8703ae Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Sat, 13 Jan 2024 16:03:21 +0100
Subject: [PATCH 3/7] fix: forget to lowercase the record tx_id when
 restranding

---
 bin/filter_gtf_ndr.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
index 7811daa..673e749 100755
--- a/bin/filter_gtf_ndr.py
+++ b/bin/filter_gtf_ndr.py
@@ -122,8 +122,9 @@ def filter_count_matrix(file, transcripts, wr):
 
     with open("unformat.novel.filter.gtf", "w") as wr:
         for record in GTF.parse_by_line(args.gtf):
-            if "transcript_id" in record and record["transcript_id"].lower() in filter:
-                record.strand = strand_dict[record["transcript_id"]]
+            tx_id = record["transcript_id"].lower()
+            if "transcript_id" in record and tx_id in filter:
+                record.strand = strand_dict[tx_id]
                 print(record, file=wr)
 
     with open("counts_transcript.filter.txt", "w") as wr:

From c595707b7e35b54ca8c2dad4e36431ab7b60743e Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Sat, 13 Jan 2024 16:10:53 +0100
Subject: [PATCH 4/7] fix: handle case when gtf record does not have tx_id

---
 bin/filter_gtf_ndr.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
index 673e749..98cd885 100755
--- a/bin/filter_gtf_ndr.py
+++ b/bin/filter_gtf_ndr.py
@@ -122,10 +122,11 @@ def filter_count_matrix(file, transcripts, wr):
 
     with open("unformat.novel.filter.gtf", "w") as wr:
         for record in GTF.parse_by_line(args.gtf):
-            tx_id = record["transcript_id"].lower()
-            if "transcript_id" in record and tx_id in filter:
-                record.strand = strand_dict[tx_id]
-                print(record, file=wr)
+            if "transcript_id" in record:
+                tx_id = record["transcript_id"].lower()
+                if tx_id in filter:
+                    record.strand = strand_dict[tx_id]
+                    print(record, file=wr)
 
     with open("counts_transcript.filter.txt", "w") as wr:
         filter_count_matrix(args.counts_tx, filter, wr)

From 022ba578b11a668fefc80d7c47c609d6093e6444 Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Mon, 15 Jan 2024 10:51:12 +0100
Subject: [PATCH 5/7] fix: problem with tx_id in lowercase

---
 bin/filter_gtf_ndr.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
index 98cd885..a901b8c 100755
--- a/bin/filter_gtf_ndr.py
+++ b/bin/filter_gtf_ndr.py
@@ -1,16 +1,19 @@
 #! /usr/bin/env python3
 from typing import Dict, Set, Tuple
+from collections import namedtuple
 
 from GTF import GTF
 
+TranscriptProb = namedtuple("TranscriptProb", ["gene_id", "tx_id", "ndr"])
 
-def parse_bambu(line):
-    return tuple(line)
 
+def parse_bambu(line) -> TranscriptProb:
+    return TranscriptProb(line[1], line[0].lower(), float(line[2]))
 
-def parse_tfkmers(line):
+
+def parse_tfkmers(line) -> Tuple[TranscriptProb, str]:
     ids = line[0].split("::")
-    return ids[1], ids[0], line[1], ids[2]
+    return TranscriptProb(ids[0], ids[1].lower(), float(line[1])), ids[2]
 
 
 def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]:
@@ -25,20 +28,18 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]:
         line = line.split(",")
 
         if origin == "bambu":
-            tx_id, _, ndr = parse_bambu(line)
+            tx_prob = parse_bambu(line)
         elif origin == "tfkmers":
-            tx_id, _, ndr, strand = parse_tfkmers(line)
+            tx_prob, strand = parse_tfkmers(line)
         else:
             exit("Unknown method")
 
-        ndr = float(ndr)
-
-        if ndr < th:
-            s.add(tx_id.lower())
+        if tx_prob.ndr < th:
+            s.add(tx_prob.tx_id)
 
             # Extract strand from sequence name to restrand GTF records
             if origin == "tfkmers":
-                strand_dict[tx_id] = strand
+                strand_dict[tx_prob.tx_id] = strand
 
     return s, strand_dict
 
@@ -124,8 +125,13 @@ def filter_count_matrix(file, transcripts, wr):
         for record in GTF.parse_by_line(args.gtf):
             if "transcript_id" in record:
                 tx_id = record["transcript_id"].lower()
+
                 if tx_id in filter:
-                    record.strand = strand_dict[tx_id]
+                    # If operation == "union", tx_id can be OK in bambu
+                    # but not in TFKmers. So strand not defined
+                    if tx_id in strand_dict:
+                        record.strand = strand_dict[tx_id]
+
                     print(record, file=wr)
 
     with open("counts_transcript.filter.txt", "w") as wr:

From c18ed06fadd11fbbcc4b436c5a9201d99d7d48d6 Mon Sep 17 00:00:00 2001
From: mlorthiois <matthias.lorthiois@gmail.com>
Date: Sat, 20 Jan 2024 20:11:19 +0100
Subject: [PATCH 6/7] feat: add lower prob strand if multiple extremities
 tested

---
 bin/filter_gtf_ndr.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
index a901b8c..038ec7a 100755
--- a/bin/filter_gtf_ndr.py
+++ b/bin/filter_gtf_ndr.py
@@ -16,7 +16,10 @@ def parse_tfkmers(line) -> Tuple[TranscriptProb, str]:
     return TranscriptProb(ids[0], ids[1].lower(), float(line[1])), ids[2]
 
 
-def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]:
+StrandRecord = namedtuple("StrandRecord", ["ndr", "strand"])
+
+
+def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, StrandRecord]]:
     s = set()
     strand_dict = dict()
 
@@ -39,7 +42,12 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]:
 
             # Extract strand from sequence name to restrand GTF records
             if origin == "tfkmers":
-                strand_dict[tx_prob.tx_id] = strand
+                # If both extremities are tested, keep only lower extremity prob
+                if (
+                    tx_prob.tx_id not in strand_dict
+                    or tx_prob.ndr < strand_dict[tx_prob.tx_id].ndr
+                ):
+                    strand_dict[tx_prob.tx_id] = StrandRecord(tx_prob.ndr, strand)
 
     return s, strand_dict
 
@@ -130,7 +138,7 @@ def filter_count_matrix(file, transcripts, wr):
                     # If operation == "union", tx_id can be OK in bambu
                     # but not in TFKmers. So strand not defined
                     if tx_id in strand_dict:
-                        record.strand = strand_dict[tx_id]
+                        record.strand = strand_dict[tx_id].strand
 
                     print(record, file=wr)
 

From 5fcab0d9520b47c65866d1b11422ae6b1f3d8699 Mon Sep 17 00:00:00 2001
From: vlebars <104759956+vlebars@users.noreply.github.com>
Date: Wed, 13 Mar 2024 15:21:45 +0100
Subject: [PATCH 7/7] Add 'stranded' information in header.nf to display it in
 output

---
 modules/header.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/header.nf b/modules/header.nf
index 15b5272..1077b00 100755
--- a/modules/header.nf
+++ b/modules/header.nf
@@ -24,6 +24,7 @@ Tfkmers Tokenizer   : ${params.tfkmers_tokenizer}
 Tfkmers Threshold   : ${params.tfkmers_threshold}
 Bambu Threshold     : ${params.bambu_threshold}
 Filtering operation : ${params.operation}
+Stranded            : ${params.bambu_strand}
 -${c_dim}-------------------------------------${c_reset}-
 """.stripIndent()
 }