From c19d224525af1447e233cf635c35315f73c9f0d5 Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Fri, 12 Jan 2024 16:02:54 +0100 Subject: [PATCH 1/7] feat: tfkmers. If strand undefined, extract both extremities --- bin/extract_tss.py | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/bin/extract_tss.py b/bin/extract_tss.py index 529e7ba..8d2b974 100755 --- a/bin/extract_tss.py +++ b/bin/extract_tss.py @@ -15,26 +15,42 @@ def get_tss_interval(transcript, length): return start, end +def get_interval_record(tx, length): + start, end = get_tss_interval(tx, length) + if start > 0: + return ( + tx.seqname, + start - 1, + end, + f"{tx['gene_id']}::{tx['transcript_id']}::{tx.strand}", + tx.score, + tx.strand, + ) + else: + print( + f"{tx['transcript_id']} skipped: Start Coordinate detected < 0.", + file=sys.stderr, + ) + return None + + def get_intervals(transcripts, length): intervals = [] for transcript in transcripts: - start, end = get_tss_interval(transcript, length) - if start > 0: - intervals.append( - ( - transcript.seqname, - start - 1, - end, - f"{transcript['gene_id']}::{transcript['transcript_id']}", - transcript.score, - transcript.strand, - ) - ) + # If defined strand, test only TSS + if transcript.strand in ("+", "-"): + interval_record = get_interval_record(transcript, length) + if interval_record is not None: + intervals.append(interval_record) + + # If undefined strand, simulate and test both extremities else: - print( - f"{transcript['transcript_id']} skipped: Start Coordinate detected < 0.", - file=sys.stderr, - ) + for strand in ("+", "-"): + transcript.strand = strand + interval_record = get_interval_record(transcript, length) + if interval_record is not None: + intervals.append(interval_record) + return intervals From 3a39e48fe04a3a87fb135b19e0f7b10cb979a8cc Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Fri, 12 Jan 2024 16:03:33 +0100 Subject: [PATCH 2/7] feat: if strand undefined, restrand correct strand based on tfkmers prob --- bin/filter_gtf_ndr.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index e6bba39..7811daa 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -1,5 +1,6 @@ -#! /usr/bin/env python3 -from typing import Set +#! /usr/bin/env python3 +from typing import Dict, Set, Tuple + from GTF import GTF @@ -9,37 +10,47 @@ def parse_bambu(line): def parse_tfkmers(line): ids = line[0].split("::") - return ids[1], ids[0], line[1] + return ids[1], ids[0], line[1], ids[2] -def parse_ndr(csv, origin, th) -> Set[str]: +def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]: s = set() + strand_dict = dict() # Skip header next(csv) + strand = None for line in csv: line = line.split(",") if origin == "bambu": - line = parse_bambu(line) + tx_id, _, ndr = parse_bambu(line) elif origin == "tfkmers": - line = parse_tfkmers(line) + tx_id, _, ndr, strand = parse_tfkmers(line) + else: + exit("Unknown method") - tx_id, _, ndr = line ndr = float(ndr) if ndr < th: s.add(tx_id.lower()) - return s + # Extract strand from sequence name to restrand GTF records + if origin == "tfkmers": + strand_dict[tx_id] = strand + + return s, strand_dict def filter_count_matrix(file, transcripts, wr): print(next(file), file=wr) for line in file: line_splitted = line.split("\t") - if line_splitted[0].startswith("BambuTx") and line_splitted[0].lower() not in transcripts: + if ( + line_splitted[0].startswith("BambuTx") + and line_splitted[0].lower() not in transcripts + ): continue print(line.rstrip(), file=wr) @@ -99,8 +110,10 @@ def filter_count_matrix(file, transcripts, wr): args = parser.parse_args() ################################################### - filter_bambu = parse_ndr(args.bambu, "bambu", args.bambu_threshold) - filter_tfkmers = parse_ndr(args.tfkmers, "tfkmers", args.tfkmers_threshold) + filter_bambu, _ = parse_ndr(args.bambu, "bambu", args.bambu_threshold) + filter_tfkmers, strand_dict = parse_ndr( + args.tfkmers, "tfkmers", args.tfkmers_threshold + ) if args.operation == "union": filter = filter_bambu | filter_tfkmers @@ -110,6 +123,7 @@ def filter_count_matrix(file, transcripts, wr): with open("unformat.novel.filter.gtf", "w") as wr: for record in GTF.parse_by_line(args.gtf): if "transcript_id" in record and record["transcript_id"].lower() in filter: + record.strand = strand_dict[record["transcript_id"]] print(record, file=wr) with open("counts_transcript.filter.txt", "w") as wr: From 925f1b7de9d22ff74dfc31d9ceafd1ea3e8703ae Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Sat, 13 Jan 2024 16:03:21 +0100 Subject: [PATCH 3/7] fix: forget to lowercase the record tx_id when restranding --- bin/filter_gtf_ndr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index 7811daa..673e749 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -122,8 +122,9 @@ def filter_count_matrix(file, transcripts, wr): with open("unformat.novel.filter.gtf", "w") as wr: for record in GTF.parse_by_line(args.gtf): - if "transcript_id" in record and record["transcript_id"].lower() in filter: - record.strand = strand_dict[record["transcript_id"]] + tx_id = record["transcript_id"].lower() + if "transcript_id" in record and tx_id in filter: + record.strand = strand_dict[tx_id] print(record, file=wr) with open("counts_transcript.filter.txt", "w") as wr: From c595707b7e35b54ca8c2dad4e36431ab7b60743e Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Sat, 13 Jan 2024 16:10:53 +0100 Subject: [PATCH 4/7] fix: handle case when gtf record does not have tx_id --- bin/filter_gtf_ndr.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index 673e749..98cd885 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -122,10 +122,11 @@ def filter_count_matrix(file, transcripts, wr): with open("unformat.novel.filter.gtf", "w") as wr: for record in GTF.parse_by_line(args.gtf): - tx_id = record["transcript_id"].lower() - if "transcript_id" in record and tx_id in filter: - record.strand = strand_dict[tx_id] - print(record, file=wr) + if "transcript_id" in record: + tx_id = record["transcript_id"].lower() + if tx_id in filter: + record.strand = strand_dict[tx_id] + print(record, file=wr) with open("counts_transcript.filter.txt", "w") as wr: filter_count_matrix(args.counts_tx, filter, wr) From 022ba578b11a668fefc80d7c47c609d6093e6444 Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Mon, 15 Jan 2024 10:51:12 +0100 Subject: [PATCH 5/7] fix: problem with tx_id in lowercase --- bin/filter_gtf_ndr.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index 98cd885..a901b8c 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -1,16 +1,19 @@ #! /usr/bin/env python3 from typing import Dict, Set, Tuple +from collections import namedtuple from GTF import GTF +TranscriptProb = namedtuple("TranscriptProb", ["gene_id", "tx_id", "ndr"]) -def parse_bambu(line): - return tuple(line) +def parse_bambu(line) -> TranscriptProb: + return TranscriptProb(line[1], line[0].lower(), float(line[2])) -def parse_tfkmers(line): + +def parse_tfkmers(line) -> Tuple[TranscriptProb, str]: ids = line[0].split("::") - return ids[1], ids[0], line[1], ids[2] + return TranscriptProb(ids[0], ids[1].lower(), float(line[1])), ids[2] def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]: @@ -25,20 +28,18 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]: line = line.split(",") if origin == "bambu": - tx_id, _, ndr = parse_bambu(line) + tx_prob = parse_bambu(line) elif origin == "tfkmers": - tx_id, _, ndr, strand = parse_tfkmers(line) + tx_prob, strand = parse_tfkmers(line) else: exit("Unknown method") - ndr = float(ndr) - - if ndr < th: - s.add(tx_id.lower()) + if tx_prob.ndr < th: + s.add(tx_prob.tx_id) # Extract strand from sequence name to restrand GTF records if origin == "tfkmers": - strand_dict[tx_id] = strand + strand_dict[tx_prob.tx_id] = strand return s, strand_dict @@ -124,8 +125,13 @@ def filter_count_matrix(file, transcripts, wr): for record in GTF.parse_by_line(args.gtf): if "transcript_id" in record: tx_id = record["transcript_id"].lower() + if tx_id in filter: - record.strand = strand_dict[tx_id] + # If operation == "union", tx_id can be OK in bambu + # but not in TFKmers. So strand not defined + if tx_id in strand_dict: + record.strand = strand_dict[tx_id] + print(record, file=wr) with open("counts_transcript.filter.txt", "w") as wr: From c18ed06fadd11fbbcc4b436c5a9201d99d7d48d6 Mon Sep 17 00:00:00 2001 From: mlorthiois Date: Sat, 20 Jan 2024 20:11:19 +0100 Subject: [PATCH 6/7] feat: add lower prob strand if multiple extremities tested --- bin/filter_gtf_ndr.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index a901b8c..038ec7a 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -16,7 +16,10 @@ def parse_tfkmers(line) -> Tuple[TranscriptProb, str]: return TranscriptProb(ids[0], ids[1].lower(), float(line[1])), ids[2] -def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]: +StrandRecord = namedtuple("StrandRecord", ["ndr", "strand"]) + + +def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, StrandRecord]]: s = set() strand_dict = dict() @@ -39,7 +42,12 @@ def parse_ndr(csv, origin, th) -> Tuple[Set[str], Dict[str, str]]: # Extract strand from sequence name to restrand GTF records if origin == "tfkmers": - strand_dict[tx_prob.tx_id] = strand + # If both extremities are tested, keep only lower extremity prob + if ( + tx_prob.tx_id not in strand_dict + or tx_prob.ndr < strand_dict[tx_prob.tx_id].ndr + ): + strand_dict[tx_prob.tx_id] = StrandRecord(tx_prob.ndr, strand) return s, strand_dict @@ -130,7 +138,7 @@ def filter_count_matrix(file, transcripts, wr): # If operation == "union", tx_id can be OK in bambu # but not in TFKmers. So strand not defined if tx_id in strand_dict: - record.strand = strand_dict[tx_id] + record.strand = strand_dict[tx_id].strand print(record, file=wr) From 5fcab0d9520b47c65866d1b11422ae6b1f3d8699 Mon Sep 17 00:00:00 2001 From: vlebars <104759956+vlebars@users.noreply.github.com> Date: Wed, 13 Mar 2024 15:21:45 +0100 Subject: [PATCH 7/7] Add 'stranded' information in header.nf to display it in output --- modules/header.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/header.nf b/modules/header.nf index 15b5272..1077b00 100755 --- a/modules/header.nf +++ b/modules/header.nf @@ -24,6 +24,7 @@ Tfkmers Tokenizer : ${params.tfkmers_tokenizer} Tfkmers Threshold : ${params.tfkmers_threshold} Bambu Threshold : ${params.bambu_threshold} Filtering operation : ${params.operation} +Stranded : ${params.bambu_strand} -${c_dim}-------------------------------------${c_reset}- """.stripIndent() }