diff --git a/.circleci/config.yml b/.circleci/config.yml index ce9d2a6..ac12eb1 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -12,6 +12,7 @@ jobs: - run: cd ; nextflow run ~/RNAseq-nf -with-docker iarcbioinfo/rnaseq-nf --input_folder ~/data_test/BAM/ --output_folder BAM_realigned --ref_folder ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --cpu 2 --mem 4 - run: cd ~/project/ ; docker build -t iarcbioinfo/rnaseq-transcript-nf . - run: cd ; nextflow run ~/project/rnaseq-transcript.nf --help + - run: cd ; echo -e 'ID\treadlength\tbam\nSample1\t75\tBAM_realigned/BAM/NA06984_N.bam\nSample2\t100\tBAM_realigned/BAM/NA06984_N.bam'> inputfile.tsv; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_file inputfile.tsv --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2 - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2 -with-dag dag_stringtie_1pass.png - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2 -with-dag dag_stringtie_1pass.html - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --twopass --cpu 2 --mem 2 -with-dag dag_stringtie_2pass.png diff --git a/Dockerfile b/Dockerfile index 31d8f90..bc6c50f 100755 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ FROM continuumio/miniconda3:4.7.12 LABEL base_image="continuumio/miniconda3" LABEL version="4.7.12" LABEL software="rnaseq-transcript-nf" -LABEL software.version="2.1" +LABEL software.version="2.2" LABEL about.summary="Container image containing all requirements for rnaseq-transcript-nf" LABEL about.home="http://github.com/IARCbioinfo/RNAseq-transcript-nf" LABEL about.documentation="http://github.com/IARCbioinfo/RNAseq-transcript-nf/README.md" diff --git a/README.md b/README.md index b2f3688..180a598 100755 --- a/README.md +++ b/README.md @@ -1,24 +1,29 @@ # RNAseq-transcript-nf ## RNA-seq transcript-level analysis nextflow pipeline [![CircleCI](https://circleci.com/gh/IARCbioinfo/RNAseq-transcript-nf/tree/master.svg?style=svg)](https://circleci.com/gh/IARCbioinfo/RNAseq-transcript-nf/tree/master) -[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/iarcbioinfo/rnaseq-transcript-nf/) - +[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/iarcbioinfo/rnaseq-transcript-nf/)[![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/3880) ![Workflow representation](rnaseq-transcript-nf.png) ## Description Performs transcript identification and quantification from a series of BAM files using StringTie, following the Nature Protocol paper (Pertea et al. 2016; doi:10.1038/nprot.2016.095) +The twopass mode involves 4 steps: +- a 1st pass identifies new transcripts for each BAM file +- a merging step merges the list of transcripts of each BAM file +- a 2nd pass quantifies transcripts from the merged file (without new transcript discovery) in each sample + + ## Dependencies +1. Nextflow: for common installation procedures see the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository. +2. [StringTie](http://ccb.jhu.edu/software/stringtie/) +3. [R](https://cran.r-project.org/) with packages [tidyverse](https://www.tidyverse.org/), [ballgown](https://www.bioconductor.org/packages/release/bioc/html/ballgown.html), and [tximeta](https://www.bioconductor.org/packages/release/bioc/html/tximeta.html) -1. This pipeline is based on [nextflow](https://www.nextflow.io). As we have several nextflow pipelines, we have centralized the common information in the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository. Please read it carefully as it contains essential information for the installation, basic usage and configuration of nextflow and our pipelines. -2. External software: -- StringTie +### twopass mode +In twopass mode, original and novel transcript annotations gtfs are compared with +4. [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml) -You can avoid installing all the external software by only installing Docker, Singularity or conda. Simply run by specifying the corresponding `-profile` option to nextflow: -``` -nextflow run iarcbioinfo/rnaseq-transcript-nf -profile -``` +**A conda receipe, and docker and singularity containers are available with all the tools needed to run the pipeline (see "Usage")** ## Input | Type | Description | @@ -26,25 +31,28 @@ nextflow run iarcbioinfo/rnaseq-transcript-nf -profile + DESCRIPTION Container image containing all requirements for pipeline RNAseq-transcript-nf + VERSION 2.2 \ No newline at end of file diff --git a/bin/create_summarizedExperiment.R b/bin/create_summarizedExperiment.R new file mode 100755 index 0000000..aade957 --- /dev/null +++ b/bin/create_summarizedExperiment.R @@ -0,0 +1,167 @@ +args = commandArgs(trailingOnly=TRUE) +annot=args[1] +provider=args[2] +version=args[3] +genome=args[4] +organism=args[5] +gtf.path=args[6] +suffix=args[7] +fasta=args[8] + +# print arguments +print(args) +# +samples = list.dirs(".") +samples = gsub("./", "", samples[samples!="."]) + +# transcriptome R file creation +library(tximeta) +library(tximport) +library(readr) +library(dplyr) +library(SummarizedExperiment) +library(GenomicFeatures) + +# list files and transcript/gene link +STfiles = list.files(".",pattern = "t_data.ctab",recursive = T,full.names = T) +STnames = list.dirs(".") +STnames = gsub("./", "", STnames[STnames!="."]) +coldata <- data.frame(files=STfiles, names=STnames, stringsAsFactors=FALSE) +tx2gene = read.table(STfiles[1],h=T)[,c("t_name","gene_id")] +colnames(tx2gene) = c("TXNAME","GENEID") + +# import files and create summarizedExperiment object +txim = tximeta(coldata = coldata,type = "stringtie",txOut = T,tx2gene=tx2gene) +txim.genes = tximeta(coldata = coldata,type = "stringtie",txOut = F,tx2gene=tx2gene) + +# create Txome annotation for our annot +gr <- rtracklayer::import(annot) #, format = "GFF", colnames = colnames, +circ_seqs <- intersect(GenomeInfoDb::DEFAULT_CIRC_SEQS, seqlevels(gr) ) +gr <- GenomicFeatures:::.tidy_seqinfo(gr, circ_seqs, NULL) +genome(seqinfo(gr))=genome +names(gr) <- gr$transcript_id +tximetaInfo <- list(version = packageVersion("tximeta"), importTime = Sys.time()) +metadata <- list(tximetaInfo = tximetaInfo) +metadata$level <- "txp" +metadata$gtf <- GenomicFeatures:::.prepareGFFMetadata(annot, paste0(provider,"_v",version), organism, NA, NA, NULL) + +assay.nms <- rownames(assay(txim,"counts")) +txps.missing <- !assay.nms %in% names(gr) +txps2 <- gr[rownames(assay(txim,"counts"))] +ucsc.genome <- tximeta:::genome2UCSC(genome) +try(seqinfo(txps2) <- Seqinfo(genome = ucsc.genome)[seqlevels(txps2)]) + +txomeInfo = list( index=gsub(annot,"",gtf.path), + source=provider,organism=organism,release=version, + genome =genome, + fasta= fasta, + gtf= annot, + sha256= paste0(provider,"_v",version,"_hash") ) +rowRanges(txim) = txps2 +metadata(txim) = c(metadata(txim),metadata) +metadata(txim)$txomeInfo = txomeInfo + +## add raw read counts from files +rcmat_files_list = list.files(".",pattern = "transcript_count_matrix",full.names = T) +rcmat_list = lapply(rcmat_files_list, read_csv,col_names = T) +readlengths_rcmat_list = sapply( rcmat_files_list, function(x) as.numeric(strsplit(strsplit(rev( strsplit(x,"_")[[1]] )[1],"l")[[1]][2],".csv")[[1]][1] ) ) +samples_rcmat_list = lapply(rcmat_list, function(x) colnames(x)[-1]) +readlengths_txim = as.data.frame(matrix(rep(NA,ncol(txim)),nrow=1 )) +colnames(readlengths_txim) = colnames(txim) +for(i in 1:length(rcmat_list)) readlengths_txim[1,samples_rcmat_list[[i]]] = readlengths_rcmat_list[i] + +rcmat = rcmat_list[[1]] +if(length(rcmat_list)>1){ + for(i in 2:length(rcmat_list)) rcmat = left_join(rcmat,rcmat_list[[i]],by="transcript_id") +} +rcmat = rcmat[match(rownames(txim),rcmat$transcript_id),c("transcript_id",colnames(txim))] + +write_csv(rcmat,path=paste0("transcript_count_matrix",suffix,".csv")) +rcmatdf = as.data.frame(rcmat[,-1]) +rownames(rcmatdf) = rcmat$transcript_id + +colData(txim)$readlength = readlengths_txim[1,] +assays(txim,withDimnames=F)$raw_counts = rcmatdf + +names(assays(txim))[names(assays(txim))=="counts"] = "counts_length_normalized" +names(assays(txim))[names(assays(txim))=="abundance"] = "abundance_FPKM" + +assays(txim)$abundance_TPM = sweep(assays(txim)$abundance_FPKM, 2,colSums(assays(txim)$abundance_FPKM),"/" )*10**6 +assays(txim) <- SimpleList(counts=assay(txim,"raw_counts"),length=assay(txim,"length"),abundance_FPKM=assay(txim,"abundance_FPKM"),abundance_TPM=assay(txim,"abundance_TPM") ) + +# add exon information +#exons <- exonsBy(mytxdb, by="tx",use.names=T) #tximeta:::getRanges(txdb = mytxdb, txomeInfo = txomeInfo, type = "exon") +exons <- split(gr[gr$type=="exon",grep("exon|type" ,colnames(gr@elementMetadata))], + names(gr)[gr$type=="exon"]) +exons <- exons[names(txim)] +if (all(is.na(seqlengths(exons)))) { + seqinfo(exons) <- seqinfo(txim) +} +exons <- exons[rownames(txim)] +stopifnot(all(rownames(txim) == names(exons))) +mcols(exons) <- mcols(txim) +rowRanges(txim) <- exons + +eval(call("<-", as.name(paste0("transcript",suffix,".SE")),txim )) +save(list=paste0("transcript",suffix,".SE"),file = paste0("transcript",suffix,".SE.rda") ) +rm(txim) +gc() + +## +gr.g = gr[gr$type=="exon",] +names(gr.g) <- gr.g$gene_id +exons.g <- split(gr.g,names(gr.g) ) #groupGRangesBy(gr.g[gr.g$type=="exon",]) +exons.g <- exons.g[names(txim.genes)] + +genes2 = elementMetadata(unlist(exons.g)) +rownames(genes2) = genes2$gene_id +genes2 = genes2[cumsum(elementNROWS(exons.g)),] +genes2$type = "gene" + +assay.nms <- rownames(assay(txim.genes,"counts")) +genes.missing <- !assay.nms %in% names(gr.g) +genes2 <- genes2[rownames(assay(txim.genes,"counts")),] +try(seqinfo(genes2) <- Seqinfo(genome = ucsc.genome)[seqlevels(genes2)]) + +metadata.genes = metadata +metadata.genes$level <- "gene" +rowData(txim.genes) = genes2 +metadata(txim.genes) = c(metadata(txim.genes),metadata.genes) +metadata(txim.genes)$txomeInfo = txomeInfo + +## add raw read counts from files +names(assays(txim.genes))[names(assays(txim.genes))=="counts"] = "counts_length_normalized" + +rcmat_files_list.g = list.files(".",pattern = "gene_count_matrix",full.names = T) +rcmat_list.g = lapply(rcmat_files_list.g, read_csv,col_names = T) + +rcmat.g = rcmat_list.g[[1]] +if(length(rcmat_list.g)>1){ + for(i in 2:length(rcmat_list.g)) rcmat.g = left_join(rcmat.g,rcmat_list.g[[i]],by="gene_id") +} +rcmat.g = rcmat.g[match(rownames(txim.genes),rcmat.g$gene_id),c("gene_id",colnames(txim.genes))] +write_csv(rcmat.g,path=paste0("gene_count_matrix",suffix,".csv")) +rcmatdf.g = as.data.frame(rcmat.g[,-1]) +rownames(rcmatdf.g) = rcmat.g$gene_id + +colData(txim.genes)$readlength = readlengths_txim[1,] + +assays(txim.genes,withDimnames=F)$raw_counts = rcmatdf.g +names(assays(txim.genes))[names(assays(txim.genes))=="abundance"] = "abundance_FPKM" +assays(txim.genes)$abundance_TPM = sweep(assays(txim.genes)$abundance_FPKM, 2,colSums(assays(txim.genes)$abundance_FPKM),"/" )*10**6 + +### check if still SimpleList in recent SummarizedExperiment versions +assays(txim.genes) <- SimpleList(counts=assay(txim.genes,"raw_counts"),length=assay(txim.genes,"length"),abundance_FPKM=assay(txim.genes,"abundance_FPKM"),abundance_TPM=assay(txim.genes,"abundance_TPM") ) + +if (all(is.na(seqlengths(exons.g)))) { + seqinfo(exons.g) <- seqinfo(exons) +} +exons.g <- exons.g[rownames(txim.genes)] +stopifnot(all(rownames(txim.genes) == names(exons.g))) +mcols(exons.g) <- mcols(txim.genes) +rowRanges(txim.genes) <- exons.g + +eval(call("<-", as.name(paste0("gene",suffix,".SE")),txim.genes )) +save(list=paste0("gene",suffix,".SE"),file = paste0("gene",suffix,".SE.rda") ) +rm(txim.genes) +gc() \ No newline at end of file diff --git a/dag_stringtie_1pass.html b/dag_stringtie_1pass.html index e767ede..9c42159 100755 --- a/dag_stringtie_1pass.html +++ b/dag_stringtie_1pass.html @@ -107,39 +107,45 @@ nodes: [ { data: { id: 'p0', label: 'Channel.fromPath'}, classes: 'ORIGIN' }, { data: { id: 'p1', label: 'map'}, classes: 'OPERATOR' }, -{ data: { id: 'p2', label: 'view'}, classes: 'OPERATOR' }, -{ data: { id: 'p3', label: 'into'}, classes: 'OPERATOR' }, -{ data: { id: 'p4'}, classes: 'NODE' }, -{ data: { id: 'p5'}, classes: 'ORIGIN' }, -{ data: { id: 'p6', label: 'StringTie1stpass'}, classes: 'PROCESS' }, -{ data: { id: 'p7'}, classes: 'NODE' }, -{ data: { id: 'p8', label: 'into'}, classes: 'OPERATOR' }, -{ data: { id: 'p9', label: 'groupTuple'}, classes: 'OPERATOR' }, -{ data: { id: 'p10'}, classes: 'ORIGIN' }, -{ data: { id: 'p11', label: 'prepDE'}, classes: 'PROCESS' }, -{ data: { id: 'p12'}, classes: 'NODE' }, -{ data: { id: 'p13', label: 'collect'}, classes: 'OPERATOR' }, -{ data: { id: 'p14', label: 'ballgown_create'}, classes: 'PROCESS' }, -{ data: { id: 'p15'}, classes: 'NODE' }, -{ data: { id: 'p16'}, classes: 'NODE' }, +{ data: { id: 'p2', label: 'into'}, classes: 'OPERATOR' }, +{ data: { id: 'p3'}, classes: 'NODE' }, +{ data: { id: 'p4'}, classes: 'ORIGIN' }, +{ data: { id: 'p5', label: 'StringTie1stpass'}, classes: 'PROCESS' }, +{ data: { id: 'p6'}, classes: 'NODE' }, +{ data: { id: 'p7', label: 'into'}, classes: 'OPERATOR' }, +{ data: { id: 'p8', label: 'groupTuple'}, classes: 'OPERATOR' }, +{ data: { id: 'p9'}, classes: 'ORIGIN' }, +{ data: { id: 'p10', label: 'prepDE'}, classes: 'PROCESS' }, +{ data: { id: 'p11', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p12', label: 'ballgown_create'}, classes: 'PROCESS' }, +{ data: { id: 'p13'}, classes: 'NODE' }, +{ data: { id: 'p14', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p15', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p16'}, classes: 'ORIGIN' }, +{ data: { id: 'p17'}, classes: 'ORIGIN' }, +{ data: { id: 'p18', label: 'SummarizedExperiment_create'}, classes: 'PROCESS' }, ], edges: [ { data: { source: 'p0', target: 'p1'} }, -{ data: { source: 'p1', target: 'p2'} }, -{ data: { source: 'p2', target: 'p3', label: 'bam_files' } }, -{ data: { source: 'p3', target: 'p6', label: 'bam_files_41stpass' } }, -{ data: { source: 'p3', target: 'p4', label: 'bam_files_42ndpass' } }, -{ data: { source: 'p5', target: 'p6', label: 'gtf' } }, -{ data: { source: 'p6', target: 'p8', label: 'ST_out1pass' } }, -{ data: { source: 'p6', target: 'p7', label: 'stringtie_log' } }, -{ data: { source: 'p8', target: 'p9', label: 'ST_out4group' } }, -{ data: { source: 'p8', target: 'p13', label: 'ST_out4bg' } }, -{ data: { source: 'p9', target: 'p11', label: 'ST_out4prepDE' } }, -{ data: { source: 'p10', target: 'p11', label: 'samplenames' } }, -{ data: { source: 'p11', target: 'p12', label: 'count_matrices' } }, -{ data: { source: 'p13', target: 'p14'} }, -{ data: { source: 'p14', target: 'p16', label: 'FPKM_matrices' } }, -{ data: { source: 'p14', target: 'p15', label: 'rdata2pass' } }, +{ data: { source: 'p1', target: 'p2', label: 'bam_files' } }, +{ data: { source: 'p2', target: 'p5', label: 'bam_files_41stpass' } }, +{ data: { source: 'p2', target: 'p3', label: 'bam_files_42ndpass' } }, +{ data: { source: 'p4', target: 'p5', label: 'gtf' } }, +{ data: { source: 'p5', target: 'p7', label: 'ST_out1pass' } }, +{ data: { source: 'p5', target: 'p6', label: 'stringtie_log' } }, +{ data: { source: 'p7', target: 'p11', label: 'ST_out4bg' } }, +{ data: { source: 'p7', target: 'p14', label: 'ST_out4SE' } }, +{ data: { source: 'p7', target: 'p8', label: 'ST_out4group' } }, +{ data: { source: 'p8', target: 'p10', label: 'ST_out4prepDE' } }, +{ data: { source: 'p9', target: 'p10', label: 'samplenames' } }, +{ data: { source: 'p10', target: 'p15', label: 'quantif_count_matrices' } }, +{ data: { source: 'p11', target: 'p12'} }, +{ data: { source: 'p12', target: 'p18', label: 'quantif_norm_matrices' } }, +{ data: { source: 'p12', target: 'p13', label: 'rdata2pass' } }, +{ data: { source: 'p14', target: 'p18'} }, +{ data: { source: 'p15', target: 'p18'} }, +{ data: { source: 'p16', target: 'p18', label: 'gtf' } }, +{ data: { source: 'p17', target: 'p18', label: 'merged_gtf4SE' } }, ], }, diff --git a/dag_stringtie_1pass.png b/dag_stringtie_1pass.png index 941d34e..30e0a54 100755 Binary files a/dag_stringtie_1pass.png and b/dag_stringtie_1pass.png differ diff --git a/dag_stringtie_2pass.html b/dag_stringtie_2pass.html index fd9c0fd..d8ffd39 100755 --- a/dag_stringtie_2pass.html +++ b/dag_stringtie_2pass.html @@ -107,52 +107,57 @@ nodes: [ { data: { id: 'p0', label: 'Channel.fromPath'}, classes: 'ORIGIN' }, { data: { id: 'p1', label: 'map'}, classes: 'OPERATOR' }, -{ data: { id: 'p2', label: 'view'}, classes: 'OPERATOR' }, -{ data: { id: 'p3', label: 'into'}, classes: 'OPERATOR' }, -{ data: { id: 'p4'}, classes: 'ORIGIN' }, -{ data: { id: 'p5', label: 'StringTie1stpass'}, classes: 'PROCESS' }, -{ data: { id: 'p6'}, classes: 'NODE' }, -{ data: { id: 'p7', label: 'collect'}, classes: 'OPERATOR' }, -{ data: { id: 'p8'}, classes: 'ORIGIN' }, -{ data: { id: 'p9', label: 'mergeGTF'}, classes: 'PROCESS' }, -{ data: { id: 'p10'}, classes: 'NODE' }, -{ data: { id: 'p11'}, classes: 'ORIGIN' }, -{ data: { id: 'p12', label: 'StringTie2ndpass'}, classes: 'PROCESS' }, -{ data: { id: 'p13'}, classes: 'NODE' }, -{ data: { id: 'p14', label: 'into'}, classes: 'OPERATOR' }, -{ data: { id: 'p15', label: 'groupTuple'}, classes: 'OPERATOR' }, -{ data: { id: 'p16'}, classes: 'ORIGIN' }, -{ data: { id: 'p17', label: 'prepDE'}, classes: 'PROCESS' }, -{ data: { id: 'p18'}, classes: 'NODE' }, -{ data: { id: 'p19', label: 'collect'}, classes: 'OPERATOR' }, -{ data: { id: 'p20', label: 'ballgown_create'}, classes: 'PROCESS' }, -{ data: { id: 'p21'}, classes: 'NODE' }, -{ data: { id: 'p22'}, classes: 'NODE' }, +{ data: { id: 'p2', label: 'into'}, classes: 'OPERATOR' }, +{ data: { id: 'p3'}, classes: 'ORIGIN' }, +{ data: { id: 'p4', label: 'StringTie1stpass'}, classes: 'PROCESS' }, +{ data: { id: 'p5'}, classes: 'NODE' }, +{ data: { id: 'p6', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p7'}, classes: 'ORIGIN' }, +{ data: { id: 'p8', label: 'mergeGTF'}, classes: 'PROCESS' }, +{ data: { id: 'p9'}, classes: 'NODE' }, +{ data: { id: 'p10'}, classes: 'ORIGIN' }, +{ data: { id: 'p11', label: 'StringTie2ndpass'}, classes: 'PROCESS' }, +{ data: { id: 'p12'}, classes: 'NODE' }, +{ data: { id: 'p13', label: 'into'}, classes: 'OPERATOR' }, +{ data: { id: 'p14', label: 'groupTuple'}, classes: 'OPERATOR' }, +{ data: { id: 'p15'}, classes: 'ORIGIN' }, +{ data: { id: 'p16', label: 'prepDE'}, classes: 'PROCESS' }, +{ data: { id: 'p17', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p18', label: 'ballgown_create'}, classes: 'PROCESS' }, +{ data: { id: 'p19'}, classes: 'NODE' }, +{ data: { id: 'p20', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p21', label: 'collect'}, classes: 'OPERATOR' }, +{ data: { id: 'p22'}, classes: 'ORIGIN' }, +{ data: { id: 'p23', label: 'SummarizedExperiment_create'}, classes: 'PROCESS' }, ], edges: [ { data: { source: 'p0', target: 'p1'} }, -{ data: { source: 'p1', target: 'p2'} }, -{ data: { source: 'p2', target: 'p3', label: 'bam_files' } }, -{ data: { source: 'p3', target: 'p5', label: 'bam_files_41stpass' } }, -{ data: { source: 'p3', target: 'p12', label: 'bam_files_42ndpass' } }, -{ data: { source: 'p4', target: 'p5', label: 'gtf' } }, -{ data: { source: 'p5', target: 'p7', label: 'ST_out1pass' } }, -{ data: { source: 'p5', target: 'p6', label: 'stringtie_log' } }, -{ data: { source: 'p7', target: 'p9'} }, -{ data: { source: 'p8', target: 'p9', label: 'gtf' } }, -{ data: { source: 'p9', target: 'p12', label: 'merged_gtf' } }, -{ data: { source: 'p9', target: 'p10', label: 'gffcmp_output' } }, -{ data: { source: 'p11', target: 'p12', label: 'gtf' } }, -{ data: { source: 'p12', target: 'p14', label: 'ST_out2' } }, -{ data: { source: 'p12', target: 'p13', label: 'stringtie_log_2pass' } }, -{ data: { source: 'p14', target: 'p19', label: 'ST_out4bg' } }, -{ data: { source: 'p14', target: 'p15', label: 'ST_out4group' } }, -{ data: { source: 'p15', target: 'p17', label: 'ST_out4prepDE' } }, -{ data: { source: 'p16', target: 'p17', label: 'samplenames' } }, -{ data: { source: 'p17', target: 'p18', label: 'count_matrices' } }, -{ data: { source: 'p19', target: 'p20'} }, -{ data: { source: 'p20', target: 'p22', label: 'FPKM_matrices' } }, -{ data: { source: 'p20', target: 'p21', label: 'rdata2pass' } }, +{ data: { source: 'p1', target: 'p2', label: 'bam_files' } }, +{ data: { source: 'p2', target: 'p11', label: 'bam_files_42ndpass' } }, +{ data: { source: 'p2', target: 'p4', label: 'bam_files_41stpass' } }, +{ data: { source: 'p3', target: 'p4', label: 'gtf' } }, +{ data: { source: 'p4', target: 'p6', label: 'ST_out1pass' } }, +{ data: { source: 'p4', target: 'p5', label: 'stringtie_log' } }, +{ data: { source: 'p6', target: 'p8'} }, +{ data: { source: 'p7', target: 'p8', label: 'gtf' } }, +{ data: { source: 'p8', target: 'p11', label: 'merged_gtf' } }, +{ data: { source: 'p8', target: 'p23', label: 'merged_gtf4SE' } }, +{ data: { source: 'p8', target: 'p9', label: 'gffcmp_output' } }, +{ data: { source: 'p10', target: 'p11', label: 'gtf' } }, +{ data: { source: 'p11', target: 'p13', label: 'ST_out2' } }, +{ data: { source: 'p11', target: 'p12', label: 'stringtie_log_2pass' } }, +{ data: { source: 'p13', target: 'p20', label: 'ST_out4SE' } }, +{ data: { source: 'p13', target: 'p17', label: 'ST_out4bg' } }, +{ data: { source: 'p13', target: 'p14', label: 'ST_out4group' } }, +{ data: { source: 'p14', target: 'p16', label: 'ST_out4prepDE' } }, +{ data: { source: 'p15', target: 'p16', label: 'samplenames' } }, +{ data: { source: 'p16', target: 'p21', label: 'quantif_count_matrices' } }, +{ data: { source: 'p17', target: 'p18'} }, +{ data: { source: 'p18', target: 'p23', label: 'quantif_norm_matrices' } }, +{ data: { source: 'p18', target: 'p19', label: 'rdata2pass' } }, +{ data: { source: 'p20', target: 'p23'} }, +{ data: { source: 'p21', target: 'p23'} }, +{ data: { source: 'p22', target: 'p23', label: 'gtf' } }, ], }, diff --git a/dag_stringtie_2pass.png b/dag_stringtie_2pass.png index 300a573..4bcb8b6 100755 Binary files a/dag_stringtie_2pass.png and b/dag_stringtie_2pass.png differ diff --git a/environment.yml b/environment.yml index 72e4076..cab572b 100755 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,9 @@ channels: - bioconda - defaults dependencies: - - stringtie=2.1.1 + - stringtie=2.1.2 - openssl=1.1.1 - - bioconductor-ballgown=2.18.0 - gffcompare=0.11.2 + - r-tidyverse=1.3.0 + - bioconductor-ballgown=2.20.0 + - bioconductor-tximeta=1.6.2 \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index ec277d1..0f18d5a 100755 --- a/nextflow.config +++ b/nextflow.config @@ -11,11 +11,11 @@ profiles { } docker { docker.enabled = true - process.container = 'iarcbioinfo/rnaseq-transcript-nf:v2.1' + process.container = 'iarcbioinfo/rnaseq-transcript-nf:v2.2' } singularity { singularity.enabled = true - process.container = 'shub://IARCbioinfo/RNAseq-transcript-nf:v2.1' + process.container = 'shub://IARCbioinfo/RNAseq-transcript-nf:v2.2' } } diff --git a/rnaseq-transcript-nf.png b/rnaseq-transcript-nf.png new file mode 100644 index 0000000..1bccaa7 Binary files /dev/null and b/rnaseq-transcript-nf.png differ diff --git a/rnaseq-transcript-nf.svg b/rnaseq-transcript-nf.svg new file mode 100644 index 0000000..454414d --- /dev/null +++ b/rnaseq-transcript-nf.svg @@ -0,0 +1,10921 @@ + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Quantification + + + + + + + First Pass + quantify expressionStringTie  + + + + Robjectsrda format + + + + + + + + Compulsory step + + Optional step + Legend + Compulsory input + Optional input + + option --twopass + default + Aligned readsBAM format + input file with bam files location + + + + + + + folder with BAM files + default + Process outputs + + + + + + + + + + + + + + + + + + + + + + Second Pass + compare GTFsgffcompare  + merge GTFsStringTie --merge + quantify expressionStringTie + + + + + +   + + Run ballgown + compute FPKM & TPM matrices and ballgown objectcreate_matrices.R  + + + + + + + + + Prepare count matrices + compute matrices from GTFsprepDE.py  + + + + + + Run SummarizedExperiment + merge count matrices and compute SE objectscreate_summarizedExperiment.R  + + + MatricesCSV format + + diff --git a/rnaseq-transcript.nf b/rnaseq-transcript.nf index aae8eeb..989a0c3 100755 --- a/rnaseq-transcript.nf +++ b/rnaseq-transcript.nf @@ -24,14 +24,18 @@ params.cpu = 2 params.gtf = null params.prepDE_input = 'NO_FILE' params.readlength = 75 - params.twopass = null +params.annot_organism = "Homo sapiens" +params.annot_genome = "Unknown" +params.annot_provider = "Unspecified" +params.annot_version = "Unspecified" +params.ref = "Unspecified" params.help = null log.info "" log.info "-----------------------------------------------------------------" -log.info "RNAseq-transcript-nf 2.1: gene- and transcript-level " +log.info "RNAseq-transcript-nf 2.2: gene- and transcript-level " log.info "expression quantification from RNA sequencing data with StringTie" log.info "-----------------------------------------------------------------" log.info "Copyright (C) IARC/WHO" @@ -53,9 +57,17 @@ if (params.help) { log.info ' --gtf FILE Annotation file.' log.info "" log.info "Optional arguments:" - log.info ' --input_file FILE File in TSV format containing ID, path to BAM file, and readlength per sample.' - log.info ' --output_folder STRING Output folder (default: results_alignment).' + log.info ' --input_file FILE File in TSV format containing columns "ID" (sample ID), "bam" ' + log.info ' (path to RNA-seq BAM file), and "readlength" (sample read length)' + log.info ' --output_folder STRING Output folder (default: .).' log.info ' --readlength STRING Mean read length for count computation (default: 75).' + log.info ' --prepDE_input FILE File given to script prepDE from StringTie (default: none).' + log.info ' --annot_organism, ' + log.info ' --annot_genome, ' + log.info ' --annot_provider,' + log.info ' --annot_version,' + log.info ' --ref STRINGS metainformation stored in SummarizedExperiment R object' + log.info ' (default: "Homo sapiens","hg38", Unknown, Unknown, Unknown)' log.info ' --cpu INTEGER Number of cpu used by bwa mem and sambamba (default: 2).' log.info ' --mem INTEGER Size of memory used for mapping (in GB) (default: 2).' log.info "" @@ -65,14 +77,20 @@ if (params.help) { exit 0 } else { /* Software information */ - log.info "input_folder = ${params.input_folder}" - log.info "input_file = ${params.input_file}" - log.info "cpu = ${params.cpu}" - log.info "mem = ${params.mem}" - log.info "readlength = ${params.readlength}" - log.info "output_folder = ${params.output_folder}" - log.info "gtf = ${params.gtf}" - log.info "twopass = ${params.twopass}" + log.info "input_folder = ${params.input_folder}" + log.info "input_file = ${params.input_file}" + log.info "cpu = ${params.cpu}" + log.info "mem = ${params.mem}" + log.info "readlength = ${params.readlength}" + log.info "output_folder = ${params.output_folder}" + log.info "gtf = ${params.gtf}" + log.info "twopass = ${params.twopass}" + log.info "params.prepDE_input = ${params.prepDE_input}" + log.info "annot_organism = ${params.annot_organism}" + log.info "annot_genome = ${params.annot_genome}" + log.info "annot_provider = ${params.annot_provider}" + log.info "annot_version = ${params.annot_version}" + log.info "ref = ${params.ref}" log.info "help: ${params.help}" } @@ -85,7 +103,7 @@ if (file(params.input_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() println "BAM files found, proceed with transcript quantification"; mode ='bam' bam_files = Channel.fromPath( params.input_folder+'/*.bam') .map{ path -> [ path.name.replace(".bam",""), params.readlength, path ] } - .view() + //.view() }else{ println "ERROR: input folder contains no fastq nor BAM files"; System.exit(1) } @@ -114,8 +132,8 @@ process StringTie1stpass { if (filename.indexOf(".log") > 0){ "logs/$filename" }else{ - if(params.twopass) "sample_folders/ST1of2passes/$filename" - else "sample_folders/ST1pass/$filename" + if(params.twopass) "intermediate_files/sample_folders/ST1of2passes/$filename" + else "intermediate_files/sample_folders/ST1pass/$filename" } } @@ -149,10 +167,16 @@ process mergeGTF { file gtf output: - file("stringtie_merged.gtf") into merged_gtf + file "stringtie_merged.gtf" into merged_gtf, merged_gtf4SE file("gffcmp_merged*") into gffcmp_output - publishDir "${params.output_folder}/gtf", mode: 'copy', pattern: '{gffcmp_merged*}' - + publishDir "${params.output_folder}/gtf", mode: 'copy', saveAs: {filename -> + if (filename.indexOf("gffcmp") == 0){ + "gffcmp/$filename" + }else{ + "$filename" + } + } + shell: ''' ls */*_ST.gtf > mergelist.txt @@ -177,7 +201,7 @@ process StringTie2ndpass { file "*.log" into stringtie_log_2pass publishDir params.output_folder, mode: 'copy', saveAs: {filename -> if (filename.indexOf(".log") > 0) "logs/$filename" - else "sample_folders/ST2pass/$filename" + else "intermediate_files/sample_folders/ST2pass/$filename" } shell: @@ -191,12 +215,14 @@ process StringTie2ndpass { } }else{ ST_out2 = ST_out1pass + merged_gtf4SE = file('NO_FILE') }//end if twopass ST_out4prepDE = Channel.create() ST_out4bg = Channel.create() +ST_out4SE = Channel.create() ST_out4group = Channel.create() -ST_out2.into( ST_out4group, ST_out4bg) +ST_out2.into( ST_out4group, ST_out4bg, ST_out4SE) ST_out4prepDE = ST_out4group.groupTuple(by:1) process prepDE { @@ -209,8 +235,8 @@ process prepDE { file samplenames from ch_prepDE_input output: - file("*count_matrix*.csv") into count_matrices - publishDir "${params.output_folder}/expr_matrices", mode: 'copy' + file("*count_matrix*.csv") into quantif_count_matrices + publishDir "${params.output_folder}/intermediate_files/expr_matrices", mode: 'copy' shell: input = samplenames.name != 'NO_FILE' ? "$samplenames" : '.' @@ -228,13 +254,77 @@ process ballgown_create { file ST_outs from ST_out4bg.collect() output: - file("*_matrix*.csv") into FPKM_matrices + file("*_matrix*.csv") into quantif_norm_matrices file("*.rda") into rdata2pass - publishDir "${params.output_folder}/expr_matrices", mode: 'copy' - + publishDir "${params.output_folder}", mode: 'copy', saveAs: {filename -> + if (filename.indexOf(".csv") > 0) "expr_matrices/$filename" + else "Robjects/$filename" + } + shell: suffix = params.twopass == null ? " " : '_2pass' ''' Rscript !{baseDir}/bin/create_matrices.R !{suffix} ''' +} + +process SummarizedExperiment_create { + cpus params.cpu + memory params.mem +'G' + + input: + file ST_outs from ST_out4SE.collect() + file quantif_norm_matrices + file count_mats from quantif_count_matrices.collect() + file gtf + file merged_gtf4SE + + output: + file("*.rda") + file("*_matrix*.csv") + publishDir "${params.output_folder}", mode: 'copy', saveAs: {filename -> + if (filename.indexOf(".csv") > 0) "expr_matrices/$filename" + else "Robjects/$filename" + } + + shell: + suffix = params.twopass == null ? "_1pass" : '_2pass' + if(merged_gtf4SE.name =='NO_FILE'){ + gtf2use=gtf + gtf_path=params.gtf + }else{ + gtf2use=merged_gtf4SE + gtf_path="${params.output_folder}/gtf/"+merged_gtf4SE.name + } + ''' + if [ "!{params.annot_genome}" == "Unknown" ] + then + if grep -q -E "GRCh37|hg19" !{gtf} + then genome=hg19 + else + genome=hg38 + fi + else + genome=!{params.annot_genome} + fi + if [ "!{params.annot_provider}" == "Unspecified" ] + then + provider="Unspecified" + if [ `cat !{gtf} | grep provider | awk '{print $2}'` != "" ] + then provider=`cat !{gtf} | grep provider | awk '{print $2}'` + fi + else + provider=!{params.annot_provider} + fi + if [ "!{params.annot_version}" == "Unspecified" ] + then + version="Unspecified" + if [ `cat !{gtf} | grep version | awk '{for(i=1;i<=NF;i++)if($i=="version")print $(i+1)}'` != "" ] + then version=`cat !{gtf} | grep version | awk '{for(i=1;i<=NF;i++)if($i=="version")print $(i+1)}'` + fi + else + version=!{params.annot_version} + fi + Rscript !{baseDir}/bin/create_summarizedExperiment.R !{gtf2use} ${provider} ${version} ${genome} "!{params.annot_organism}" !{gtf_path} !{suffix} !{params.ref} + ''' } \ No newline at end of file