diff --git a/assets/metro_map.drawio b/assets/metro_map.drawio new file mode 100644 index 0000000..6df431f --- /dev/null +++ b/assets/metro_map.drawio @@ -0,0 +1,400 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/bin/class_code.R b/bin/class_code.R index d803774..4e63fd4 100755 --- a/bin/class_code.R +++ b/bin/class_code.R @@ -1,5 +1,6 @@ #! /usr/bin/env Rscript library(rtracklayer) +library(dplyr) ############################################################################# # CLI Args @@ -15,6 +16,10 @@ output = args[3] cc_gtf <- readGFF(class_code_gtf) ext_anno <- readGFF(extended_annotation) +# Add class_code ext_anno$class_code <- cc_gtf$class_code[match(paste(ext_anno$transcript_id, ext_anno$type), paste(cc_gtf$transcript_id, cc_gtf$type))] +# Change order of transcript_id column +ext_anno <- ext_anno %>% relocate(transcript_id, .after = gene_id) + export(ext_anno, output) \ No newline at end of file diff --git a/bin/qc.R b/bin/qc.R index 9a684a8..30c7824 100755 --- a/bin/qc.R +++ b/bin/qc.R @@ -8,6 +8,7 @@ library(grid) library(gridExtra) library(viridis) library(ggpattern) +library(stringr) brew = c("#008387", "#a0d3db", "#ad8500", "#d9cb98") palette = c("#00AFBB", "#FC4E07", "#E7B800") @@ -395,6 +396,14 @@ cover <- textGrob("ANNEXA report", gp = gpar(fontsize = 40, col = "black")) +sub_cover_text <- paste( + str_wrap(paste("Reference genome:", genome_ref), width = 40), + str_wrap(paste("Reference annotation:", gtf_ref), width = 40), + "\n", + print(Sys.Date()), + paste("ANNEXA version:", ANNEXA_version), + sep = "\n") + sub_cover_text <- paste( print(Sys.Date()), paste("ANNEXA version:", ANNEXA_version), sep = "\n") diff --git a/bin/restrand_isoforms.R b/bin/restrand_isoforms.R new file mode 100755 index 0000000..7ddbadc --- /dev/null +++ b/bin/restrand_isoforms.R @@ -0,0 +1,72 @@ +#! /usr/bin/env Rscript + +library(rtracklayer) +library(dplyr) +library(stringr) + +############################################# +# CLI Args +args = commandArgs(trailingOnly=TRUE) +gtf_file = args[1] +tx_tool = args[2] +output = args[3] + +# Define prefix to identify novel transcripts +if (tx_tool == "stringtie2") { + prefix = "MSTRG" +} else{ + prefix = "Bambu" +} + +gtf <- rtracklayer::readGFF(gtf_file) + +# Restrand isoforms of known genes ---------------------------------------- + +# Check if gtf already has ref_gene_id feature (Stringtie), if not (Bambu), create it based on gene_id column +if (!"ref_gene_id" %in% colnames(gtf)){ + gtf$ref_gene_id <- ifelse(!grepl("Bambu|MSTRG", gtf$gene_id), + gtf$gene_id, + NA) +} + +# Get strand of reference genes +ref_strand <- unique(gtf %>% filter(!is.na(ref_gene_id)) %>% select(ref_gene_id, strand)) +# Find novel isoforms +to_change <- gtf %>% filter(str_detect(transcript_id, prefix) & !str_detect(gene_id, prefix)) +# Match and apply new strand (if different) to novel isoforms +matches <- match(to_change$gene_id, ref_strand$ref_gene_id) +to_change$strand <- ref_strand$strand[matches] + +# Change strand of novel isoforms in gtf dataframe +gtf <- gtf %>% + mutate(strand = case_when( + transcript_id %in% to_change$transcript_id ~ to_change$strand[match(transcript_id, to_change$transcript_id)], + TRUE ~ strand + )) + +# Restrand novel tx of novel genes ---------------------------------------- + +# Look for novel tx of novel genes +novel <- gtf %>% filter(str_detect(transcript_id, prefix) & str_detect(gene_id, prefix)) + +# Restrand novel transcripts +# If all isoforms are unstraded, keep unstranded +# If some are stranded, use strand as new strand +# If all three strands are there, use majority rule +new_gene_strands <- novel %>% + group_by(gene_id) %>% + summarize(gene_strand = if(all(strand == "*")) "*" + else { + strands <- table(strand[strand != "*"]) + if(length(strands) > 1 && max(strands) == min(strands)) "*" + else names(which.max(strands)) + }) + +# Change strand in gtf +gtf <- gtf %>% + left_join(new_gene_strands, by="gene_id") %>% + mutate(strand = coalesce(gene_strand, strand)) %>% + select(-gene_strand) + +# Export gtf to new restranded file +export(gtf, output) \ No newline at end of file diff --git a/main.nf b/main.nf index 9db3dec..a526195 100755 --- a/main.nf +++ b/main.nf @@ -53,12 +53,14 @@ include { INDEX_BAM } from './modules/index_bam.nf' include { BAMBU } from './modules/bambu/bambu.nf' include { STRINGTIE } from './modules/stringtie/stringtie_workflow.nf' include { GFFCOMPARE } from './modules/gffcompare/gffcompare.nf' +include { RESTRAND_ISOFORMS } from './modules/restrand_isoforms.nf' include { SPLIT_EXTENDED_ANNOTATION } from './modules/split_extended_annotation.nf' include { FEELNC_CODPOT } from './modules/feelnc/codpot.nf' include { FEELNC_FORMAT } from './modules/feelnc/format.nf' include { RESTORE_BIOTYPE } from './modules/restore_biotypes.nf' include { MERGE_NOVEL } from './modules/merge_novel.nf' include { TRANSDECODER } from './modules/transdecoder/transdecoder_workflow.nf' +include { RESTRAND_NOVEL } from './modules/restrand_novel.nf' include { TFKMERS } from './modules/transforkmers/workflow.nf' include { QC as QC_FULL; QC as QC_FILTER } from './modules/qc/workflow.nf' include { ADD_CLASS_CODE } from './modules/add_class_code.nf' @@ -84,11 +86,13 @@ workflow { if(params.tx_discovery == "bambu") { BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa) GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU.out.bambu_gtf) - SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf) + RESTRAND_ISOFORMS(BAMBU.out.bambu_gtf) + SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out) } else if (params.tx_discovery == "stringtie2") { STRINGTIE(samples, VALIDATE_INPUT_GTF.out, ref_fa) - SPLIT_EXTENDED_ANNOTATION(STRINGTIE.out.stringtie_gtf) + RESTRAND_ISOFORMS(STRINGTIE.out.stringtie_gtf) + SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out) } /////////////////////////////////////////////////////////////////////////// @@ -116,13 +120,13 @@ workflow { // PREDICT CDS ON NOVEL TRANSCRIPTS /////////////////////////////////////////////////////////////////////////// TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa) - + RESTRAND_NOVEL(TRANSDECODER.out) /////////////////////////////////////////////////////////////////////////// // PERFORM QC ON FULL ANNOTATION /////////////////////////////////////////////////////////////////////////// QC_FULL(samples, INDEX_BAM.out, - TRANSDECODER.out, + RESTRAND_NOVEL.out, VALIDATE_INPUT_GTF.out, ch_gene_counts, "full") @@ -131,7 +135,7 @@ workflow { // FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION /////////////////////////////////////////////////////////////////////////// if(params.filter) { - TFKMERS(TRANSDECODER.out, + TFKMERS(RESTRAND_NOVEL.out, ref_fa, ch_ndr, tokenizer, @@ -148,9 +152,9 @@ workflow { /////////////////////////////////////////////////////////////////////////// // ADD GFFCOMPARE CLASS CODES TO FINAL GTFS /////////////////////////////////////////////////////////////////////////// - final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf) + final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf) if (params.filter){ - final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf) + final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf) } ADD_CLASS_CODE(class_code, final_gtf) } diff --git a/modules/add_class_code.nf b/modules/add_class_code.nf index 8336434..1f1f8ee 100644 --- a/modules/add_class_code.nf +++ b/modules/add_class_code.nf @@ -12,13 +12,13 @@ process ADD_CLASS_CODE { path "class_code.${gtf}" script: - """ + """ class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}" # Remove header created by gtfsort sed -i 1,3d "class_code.${gtf}" - # Add semicolon at end of tx lines - sed -i '/\\ttranscript\\t/s/\$/;/' "class_code.${gtf}" + # Add semicolon if missing during rtracklayer export + sed -i 's/[^;]\$/&;/' "class_code.${gtf}" """ } \ No newline at end of file diff --git a/modules/feelnc/codpot.nf b/modules/feelnc/codpot.nf index cd677c4..cfd9f96 100755 --- a/modules/feelnc/codpot.nf +++ b/modules/feelnc/codpot.nf @@ -14,6 +14,7 @@ process FEELNC_CODPOT { path "feelnc_codpot_out/new.lncRNA.gtf", emit: lncRNA path "feelnc_codpot_out/new.mRNA.gtf", emit: mRNA + script: """ grep "protein_coding" ${ref} > known_mRNA.gtf grep -v "protein_coding" ${ref} > known_lncRNA.gtf diff --git a/modules/feelnc/format.nf b/modules/feelnc/format.nf index 8c6af13..726d62d 100755 --- a/modules/feelnc/format.nf +++ b/modules/feelnc/format.nf @@ -11,6 +11,7 @@ process FEELNC_FORMAT { output: path "novel.genes.gtf" + script: """ merge_feelnc.py --lncRNA ${lncRNA} --mRNA ${mRNA} > novel.genes.gtf """ diff --git a/modules/index_bam.nf b/modules/index_bam.nf index 51a76f0..74ed6fc 100755 --- a/modules/index_bam.nf +++ b/modules/index_bam.nf @@ -11,6 +11,7 @@ process INDEX_BAM { output: file("*.bai") + script: """ samtools index $bam """ diff --git a/modules/input/validate.nf b/modules/input/validate.nf index ce88391..e5be32b 100755 --- a/modules/input/validate.nf +++ b/modules/input/validate.nf @@ -10,6 +10,7 @@ process VALIDATE_INPUT_GTF { output: file 'input.formatted.gtf' + script: """ validate_gtf.py ${gtf} > input.formatted.gtf """ diff --git a/modules/merge_novel.nf b/modules/merge_novel.nf index 3c2686d..e9df608 100755 --- a/modules/merge_novel.nf +++ b/modules/merge_novel.nf @@ -12,6 +12,7 @@ process MERGE_NOVEL { output: path "novel.full.gtf", emit: novel_full_gtf + script: """ cat ${novel_genes} ${novel_isoforms} | GTF.py format > novel.full.gtf """ diff --git a/modules/restore_biotypes.nf b/modules/restore_biotypes.nf index 39daf7c..dfae6a8 100755 --- a/modules/restore_biotypes.nf +++ b/modules/restore_biotypes.nf @@ -11,6 +11,7 @@ process RESTORE_BIOTYPE { output: path "novel.isoforms.gtf" + script: """ restore_ref_attributes.py -gtf ${novel_isoforms} -ref $ref > novel.isoforms.gtf """ diff --git a/modules/restrand_isoforms.nf b/modules/restrand_isoforms.nf new file mode 100644 index 0000000..0266ee9 --- /dev/null +++ b/modules/restrand_isoforms.nf @@ -0,0 +1,16 @@ +process RESTRAND_ISOFORMS { + conda (params.enable_conda ? "$baseDir/environment.yml" : null) + container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}" + + input: + file gtf + + output: + path "restranded.${gtf}" + + shell: + ''' + restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}" + sed -i '/;\s*$/!s/\s*$/;/' "restranded.!{gtf}" + ''' +} \ No newline at end of file diff --git a/modules/restrand_novel.nf b/modules/restrand_novel.nf new file mode 100644 index 0000000..bdd5fcb --- /dev/null +++ b/modules/restrand_novel.nf @@ -0,0 +1,18 @@ +process RESTRAND_NOVEL { + conda (params.enable_conda ? "$baseDir/environment.yml" : null) + container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}" + publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> 'novel.full.gtf'}, overwrite: true + + input: + file gtf + + output: + path "${gtf}" + + shell: + ''' + restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}" + mv restranded.!{gtf} !{gtf} + sed -i '/;\s*$/!s/\s*$/;/' !{gtf} + ''' +} \ No newline at end of file diff --git a/modules/rseqc/gene_body_coverage.nf b/modules/rseqc/gene_body_coverage.nf index bc33909..37a2199 100755 --- a/modules/rseqc/gene_body_coverage.nf +++ b/modules/rseqc/gene_body_coverage.nf @@ -15,6 +15,7 @@ process GENEBODY_COVERAGE { file "${prefix}.${bed.simpleName}.geneBodyCoverage.curves.pdf" file "${prefix}.${bed.simpleName}.geneBodyCoverage.txt" + script: """ geneBody_coverage.py \ -i ./ \ diff --git a/modules/rseqc/genepredtobed.nf b/modules/rseqc/genepredtobed.nf index 776cd43..4b33008 100755 --- a/modules/rseqc/genepredtobed.nf +++ b/modules/rseqc/genepredtobed.nf @@ -10,6 +10,7 @@ process GENEPRED_TO_BED { output: path "${genePred.simpleName}.bed12" + script: """ genePredToBed ${genePred} ${genePred.simpleName}.bed12 """ diff --git a/modules/rseqc/gtftogenepred.nf b/modules/rseqc/gtftogenepred.nf index 88e3054..0721b49 100755 --- a/modules/rseqc/gtftogenepred.nf +++ b/modules/rseqc/gtftogenepred.nf @@ -10,6 +10,7 @@ process GTF_TO_GENEPRED { output: path "${gtf.simpleName}.genePred" + script: """ gtfToGenePred -genePredExt ${gtf} ${gtf.simpleName}.genePred """ diff --git a/modules/rseqc/prepare.nf b/modules/rseqc/prepare.nf index 05cf039..e217f69 100755 --- a/modules/rseqc/prepare.nf +++ b/modules/rseqc/prepare.nf @@ -6,6 +6,7 @@ process PREPARE_RSEQC { output: path "*.rseqc.gtf" + script: """ grep "protein_coding" ${novel} > novel_mRNA.rseqc.gtf grep "lncRNA" ${novel} > novel_lncRNA.rseqc.gtf diff --git a/modules/transforkmers/extract_regions.nf b/modules/transforkmers/extract_regions.nf index 81b9d96..a0f2d1f 100755 --- a/modules/transforkmers/extract_regions.nf +++ b/modules/transforkmers/extract_regions.nf @@ -10,6 +10,7 @@ process EXTRACT_TSS_REGIONS { output: path "tss_regions.bed6", emit: tss_regions + script: """ cat ${novel_gtf} | extract_tss.py -l 512 > tss_regions.bed6 """ diff --git a/modules/transforkmers/extract_sequences.nf b/modules/transforkmers/extract_sequences.nf index bef6b85..1a2bb5f 100755 --- a/modules/transforkmers/extract_sequences.nf +++ b/modules/transforkmers/extract_sequences.nf @@ -11,6 +11,7 @@ process EXTRACT_TSS_SEQUENCES { output: path "tss.fa", emit: tss_sequences + script: """ bedtools getfasta -nameOnly -fi ${fa} -bed ${tss_regions} | tr a-z A-Z > tss.fa """ diff --git a/modules/transforkmers/filter.nf b/modules/transforkmers/filter.nf index ed70281..bdbe78a 100755 --- a/modules/transforkmers/filter.nf +++ b/modules/transforkmers/filter.nf @@ -16,6 +16,7 @@ process FILTER { path "counts_transcript.filter.txt" path "counts_transcript.full.txt" + script: """ cp ${counts_tx} counts_transcript.full.txt diff --git a/modules/transforkmers/predict.nf b/modules/transforkmers/predict.nf index 417aa76..26668a2 100755 --- a/modules/transforkmers/predict.nf +++ b/modules/transforkmers/predict.nf @@ -14,6 +14,7 @@ process PREDICT { output: path "output.csv", emit: tss_prob + script: """ transforkmers predict \ --model_path_or_name ${model} \