Merge pull request #6 from IARCbioinfo/dev

To new version 2.2
IARCbioinfo · Jun 5, 2020 · c664d0a · c664d0a
2 parents 49d39cd + 1c6d40a
commit c664d0a
Show file tree

Hide file tree

Showing 14 changed files with 11,347 additions and 128 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -12,6 +12,7 @@ jobs:
                         - run: cd ; nextflow run ~/RNAseq-nf -with-docker iarcbioinfo/rnaseq-nf --input_folder ~/data_test/BAM/ --output_folder BAM_realigned --ref_folder  ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --cpu 2 --mem 4
                         - run: cd ~/project/ ; docker build -t iarcbioinfo/rnaseq-transcript-nf .
                         - run: cd ; nextflow run ~/project/rnaseq-transcript.nf --help
+                        - run: cd ; echo -e 'ID\treadlength\tbam\nSample1\t75\tBAM_realigned/BAM/NA06984_N.bam\nSample2\t100\tBAM_realigned/BAM/NA06984_N.bam'> inputfile.tsv; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_file inputfile.tsv --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2
                         - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2 -with-dag dag_stringtie_1pass.png
                         - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --cpu 2 --mem 2 -with-dag dag_stringtie_1pass.html
                         - run: cd ; nextflow run ~/project/rnaseq-transcript.nf -with-docker iarcbioinfo/rnaseq-transcript-nf --input_folder ~/BAM_realigned/BAM/ --output_folder stringtie_output --gtf ~/data_test/REF/TP53_small.gtf --twopass --cpu 2 --mem 2 -with-dag dag_stringtie_2pass.png

diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,7 @@ FROM continuumio/miniconda3:4.7.12
 LABEL base_image="continuumio/miniconda3"
 LABEL version="4.7.12"
 LABEL software="rnaseq-transcript-nf"
-LABEL software.version="2.1"
+LABEL software.version="2.2"
 LABEL about.summary="Container image containing all requirements for rnaseq-transcript-nf"
 LABEL about.home="http://github.com/IARCbioinfo/RNAseq-transcript-nf"
 LABEL about.documentation="http://github.com/IARCbioinfo/RNAseq-transcript-nf/README.md"

diff --git a/README.md b/README.md
@@ -1,50 +1,58 @@
 # RNAseq-transcript-nf
 ## RNA-seq transcript-level analysis nextflow pipeline
 [![CircleCI](https://circleci.com/gh/IARCbioinfo/RNAseq-transcript-nf/tree/master.svg?style=svg)](https://circleci.com/gh/IARCbioinfo/RNAseq-transcript-nf/tree/master)
-[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/iarcbioinfo/rnaseq-transcript-nf/)
-
+[![Docker Hub](https://img.shields.io/badge/docker-ready-blue.svg)](https://hub.docker.com/r/iarcbioinfo/rnaseq-transcript-nf/)[![https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/3880)
 
 ![Workflow representation](rnaseq-transcript-nf.png)
 
 ## Description
 Performs transcript identification and quantification from a series of BAM files using StringTie, following the Nature Protocol paper (Pertea et al. 2016; doi:10.1038/nprot.2016.095)
 
+The twopass mode involves 4 steps:
+- a 1st pass identifies new transcripts for each BAM file
+- a merging step merges the list of transcripts of each BAM file
+- a 2nd pass quantifies transcripts from the merged file (without new transcript discovery) in each sample
+
+
 ## Dependencies 
+1. Nextflow: for common installation procedures see the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository.
+2. [StringTie](http://ccb.jhu.edu/software/stringtie/)
+3. [R](https://cran.r-project.org/) with packages [tidyverse](https://www.tidyverse.org/), [ballgown](https://www.bioconductor.org/packages/release/bioc/html/ballgown.html), and [tximeta](https://www.bioconductor.org/packages/release/bioc/html/tximeta.html)
 
-1. This pipeline is based on [nextflow](https://www.nextflow.io). As we have several nextflow pipelines, we have centralized the common information in the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository. Please read it carefully as it contains essential information for the installation, basic usage and configuration of nextflow and our pipelines.
-2. External software:
-- StringTie
+### twopass mode
+In twopass mode, original and novel transcript annotations gtfs are compared with
+4. [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)
 
-You can avoid installing all the external software by only installing Docker, Singularity or conda. Simply run by specifying the corresponding `-profile` option to nextflow:
-```
-nextflow run iarcbioinfo/rnaseq-transcript-nf -profile <docker/singularity/conda>
-```
+**A conda receipe, and docker and singularity containers are available with all the tools needed to run the pipeline (see "Usage")**
 
 ## Input 
   | Type      | Description     |
   |-----------|---------------|
   | input_folder    | input folder with BAM files |
 
 
-
 ## Parameters
 * #### Mandatory
 | Name      | Example value | Description     |
 |-----------|---------------|-----------------| 
 | --gtf    |  ref_annot.gtf  | annotation .gtf file |
 
-
+  
   * #### Optional
 | Name      | Default value | Description     |
 |-----------|---------------|-----------------| 
-|--input_file | null |  File in TSV format containing ID, path to BAM file, and readlength per sample |
+|--input_file | null |  File in TSV format containing columns "ID" (sample ID), "bam" (path to RNA-seq BAM file), and "readlength" (the sample's read length) |
 | --output_folder |      . | folder where output is written |
-|--readlength | 75 | Mean read length for count computation |
+|--readlength | 75 | Read length for count computation (only if input_folder is used) |
 | --mem  | 2 | memory |
 | --cpu  | 2 | number of CPUs | 
+| --prepDE_input | none | File given to script prepDE from StringTie |
+| --annot_organism, --annot_genome, --annot_provider, --annot_version, --ref | "Homo sapiens","hg38", Unknown, Unknown, Unknown | metainformation stored in SummarizedExperiment R object |
 
 Note that you have two ways of providing input: specifying a folder (then all bam files will be processed) or a file with columns "ID", "bam", and "readlength" (in any order). The file method is preferred when bam files do not all have the same read length.
 
+Also note that of the metainformation used for the SummarizedExperiment object creation, only --annot_organism and --annot_genome are actually used to retrieve information about the annotation in the R script; the other parameters (including the reference) are just written in the metainformation of the object. When no genome or organism is specified, the script attempts to retrieve automatically the metainformation from the gtf file, and otherwise falls back to defaults (hg38 and Homo sapiens).
+
 * #### Flags
 
 | Name  | Description |
@@ -57,23 +65,35 @@ Note that you have two ways of providing input: specifying a folder (then all ba
   nextflow run iarcbioinfo/RNAseq-transcript-nf --input_folder BAM/ --output_folder out --gtf ref_annot.gtf
   ```
 
+ To run the pipeline on a series of BAM files in folder *BAM* and an annotation file ref_annot.gtf, one can type:
+```bash
+nextflow run IARCbioinfo/RNAseq-transcript-nf -r v2.2 -profile singularity --input_folder BAM/ --output_folder out --gtf ref_annot.gtf 
+``` 
+To run the pipeline without singularity just remove "-profile singularity". Alternatively, one can run the pipeline using a docker container (-profile docker) the conda receipe containing all required dependencies (-profile conda).
+
 ## Output 
-- StringTie logs in folder logs/
-- matrices with gene and transcript expression in different formats (counts, FPKM, and an R ballgown object *bg.rda* with all information) in folder expr_matrices
-
-In addition, a folder is created (sample/ST1pass/ or sample/ST2pass depending on the twopass option) with a folder for each sample, with:
+ | Type      | Description     |
+  |-----------|---------------|
+  |expr_matrices/*_matrix.csv | matrices with gene and transcript expression in different formats (counts, FPKM, and TPM) |
+  | logs/  | StringTie logs |
+  | stats/    | gatk stats files from mutect |
+  | intermediate_files/expression_matrices/ | same as expr_matrices/*_matrix.csv but with one matrix per read length  |
+  | intermediate_files/sample_folders/ | (see below) |
+  | (optional) gtf/ | if the twopass mode is enabled, stringtie_annot.gtf contains identified genes and transcripts and folder gffcmp stats comparing original gtf with stringtie gtf (see below) |
+  | Robjects | R data objects in ballgown format (bg.rda, containing both gene- and transcript-level information) and SummarizedExperiment format (gene*.SE.rda and transcript*.SE.rda)|
+          
+The sample_folders/ folder contain subfolders (sample/ST1pass/ or sample/ST2pass depending on the twopass option), which themselves contain a folder for each sample, with:
 - an expression quantification file (\*_gene_abund.tab) with FPKM and TPM
 - an annotation file (\*_merged.gtf)
 - Ballgown input files for statistical analysis using R package ballgown (exon/transcript and intron/transcript ids correspondance e2t.ctab and i2t.ctab, exon, intron, and transcript-level quantification files e_data.ctab, i_data.ctab, and t_data.ctab) 
 
-- if the twopass mode is enabled, an annotation file with the discovered and known transcripts (gtf/gffcmp_merged.annotated.gtf), along with information about naming (gffcmp_merged.stringtie_merged.gtf.refmap, gffcmp_merged.tracking) and positions (gffcmp_merged.loci, gffcmp_merged.stringtie_merged.gtf.tmap), and some statistics (gtf/gffcmp_merged.stats)
+The gtf/gffcmp folder contains an annotation file with the discovered and known transcripts (gtf/gffcmp_merged.annotated.gtf), along with information about naming (gffcmp_merged.stringtie_merged.gtf.refmap, gffcmp_merged.tracking) and positions (gffcmp_merged.loci, gffcmp_merged.stringtie_merged.gtf.tmap), and some statistics (gtf/gffcmp_merged.stats)
 
-
-## Detailed description
-This workflow involves 3 steps:
-- a 1st pass identifies new transcripts for each BAM file
-- a merging step merges the list of transcripts of each BAM file
-- a 2nd pass quantifies transcripts from the merged file (without new transcript discovery) in each sample
+See the [SummarizedExperiment documentation](https://bioconductor.org/packages/release/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html) for details about the structure. In our case, the data contains:
+- 4 assays accessible using function *assay()* (raw_counts, length--with transcript lengths--, abundance_FPKM, and abundance_TPM)
+- metadata about package versions, gtf (reference genome, version, annotation ) accessible using function *metadata()* 
+- feature data (transcript and gene name, id) accessible using function *rowData()* and *rowRanges()* (detailed information)
+- sample data (name and read length) accessible using function *colData()*
 
 ## Directed Acyclic Graph
 [![DAG](dag_stringtie_1pass.png)](http://htmlpreview.github.io/?https://github.com/IARCbioinfo/template-nf/blob/master/dag_stringtie_1pass.html)
@@ -84,6 +104,6 @@ This workflow involves 3 steps:
   |-----------|---------------|-----------------| 
   | Nicolas Alcala*    |  [email protected] | Developer to contact for support |
 
-## References (optional)
+## References
 Pertea, M., Kim, D., Pertea, G. M., Leek, J. T., & Salzberg, S. L. (2016). Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. Nature protocols, 11(9), 1650-1667.
 
diff --git a/Singularity/Singularity.v2.2 b/Singularity/Singularity.v2.2
@@ -0,0 +1,7 @@
+From:iarcbioinfo/rnaseq-transcript-nf:v2.2
+Bootstrap:docker
+
+%labels
+    MAINTAINER **alcalan** <**[email protected]**>
+    DESCRIPTION Container image containing all requirements for pipeline RNAseq-transcript-nf
+    VERSION 2.2
diff --git a/bin/create_summarizedExperiment.R b/bin/create_summarizedExperiment.R
@@ -0,0 +1,167 @@
+args = commandArgs(trailingOnly=TRUE)
+annot=args[1]
+provider=args[2]
+version=args[3]
+genome=args[4]
+organism=args[5]
+gtf.path=args[6]
+suffix=args[7]
+fasta=args[8]
+
+# print arguments
+print(args)
+#
+samples = list.dirs(".")
+samples = gsub("./", "", samples[samples!="."])
+
+# transcriptome R file creation
+library(tximeta)
+library(tximport)
+library(readr)
+library(dplyr)
+library(SummarizedExperiment)
+library(GenomicFeatures)
+
+# list files and transcript/gene link
+STfiles = list.files(".",pattern = "t_data.ctab",recursive = T,full.names = T)
+STnames = list.dirs(".")
+STnames = gsub("./", "", STnames[STnames!="."])
+coldata <- data.frame(files=STfiles, names=STnames, stringsAsFactors=FALSE)
+tx2gene = read.table(STfiles[1],h=T)[,c("t_name","gene_id")]
+colnames(tx2gene) = c("TXNAME","GENEID")
+
+# import files and create summarizedExperiment object
+txim = tximeta(coldata = coldata,type = "stringtie",txOut = T,tx2gene=tx2gene)
+txim.genes = tximeta(coldata = coldata,type = "stringtie",txOut = F,tx2gene=tx2gene)
+
+# create Txome annotation for our annot
+gr <- rtracklayer::import(annot) #, format = "GFF", colnames = colnames, 
+circ_seqs <- intersect(GenomeInfoDb::DEFAULT_CIRC_SEQS, seqlevels(gr) )
+gr <- GenomicFeatures:::.tidy_seqinfo(gr, circ_seqs, NULL)
+genome(seqinfo(gr))=genome
+names(gr) <- gr$transcript_id
+tximetaInfo <- list(version = packageVersion("tximeta"), importTime = Sys.time())
+metadata <- list(tximetaInfo = tximetaInfo)
+metadata$level <- "txp"
+metadata$gtf <- GenomicFeatures:::.prepareGFFMetadata(annot, paste0(provider,"_v",version), organism, NA, NA, NULL)
+
+assay.nms <- rownames(assay(txim,"counts"))
+txps.missing <- !assay.nms %in% names(gr)
+txps2 <- gr[rownames(assay(txim,"counts"))]
+ucsc.genome <- tximeta:::genome2UCSC(genome)
+try(seqinfo(txps2) <- Seqinfo(genome = ucsc.genome)[seqlevels(txps2)])
+
+txomeInfo = list( index=gsub(annot,"",gtf.path),
+                  source=provider,organism=organism,release=version,
+                  genome =genome,
+                  fasta= fasta,
+                  gtf= annot,
+                  sha256= paste0(provider,"_v",version,"_hash") )
+rowRanges(txim) = txps2
+metadata(txim)  = c(metadata(txim),metadata)
+metadata(txim)$txomeInfo = txomeInfo
+
+## add raw read counts from files
+rcmat_files_list = list.files(".",pattern = "transcript_count_matrix",full.names = T)
+rcmat_list = lapply(rcmat_files_list, read_csv,col_names = T)
+readlengths_rcmat_list = sapply( rcmat_files_list, function(x) as.numeric(strsplit(strsplit(rev( strsplit(x,"_")[[1]] )[1],"l")[[1]][2],".csv")[[1]][1] ) )
+samples_rcmat_list = lapply(rcmat_list, function(x) colnames(x)[-1])
+readlengths_txim = as.data.frame(matrix(rep(NA,ncol(txim)),nrow=1 ))
+colnames(readlengths_txim) = colnames(txim)
+for(i in 1:length(rcmat_list)) readlengths_txim[1,samples_rcmat_list[[i]]] = readlengths_rcmat_list[i]
+
+rcmat = rcmat_list[[1]]
+if(length(rcmat_list)>1){ 
+    for(i in 2:length(rcmat_list)) rcmat = left_join(rcmat,rcmat_list[[i]],by="transcript_id")
+}
+rcmat = rcmat[match(rownames(txim),rcmat$transcript_id),c("transcript_id",colnames(txim))]
+
+write_csv(rcmat,path=paste0("transcript_count_matrix",suffix,".csv"))
+rcmatdf = as.data.frame(rcmat[,-1])
+rownames(rcmatdf) = rcmat$transcript_id
+
+colData(txim)$readlength = readlengths_txim[1,]
+assays(txim,withDimnames=F)$raw_counts = rcmatdf
+
+names(assays(txim))[names(assays(txim))=="counts"] = "counts_length_normalized"
+names(assays(txim))[names(assays(txim))=="abundance"] = "abundance_FPKM"
+
+assays(txim)$abundance_TPM = sweep(assays(txim)$abundance_FPKM, 2,colSums(assays(txim)$abundance_FPKM),"/" )*10**6
+assays(txim) <- SimpleList(counts=assay(txim,"raw_counts"),length=assay(txim,"length"),abundance_FPKM=assay(txim,"abundance_FPKM"),abundance_TPM=assay(txim,"abundance_TPM") )
+
+# add exon information
+#exons <- exonsBy(mytxdb, by="tx",use.names=T) #tximeta:::getRanges(txdb = mytxdb, txomeInfo = txomeInfo, type = "exon")
+exons <- split(gr[gr$type=="exon",grep("exon|type" ,colnames(gr@elementMetadata))],
+                names(gr)[gr$type=="exon"])
+exons <- exons[names(txim)]
+if (all(is.na(seqlengths(exons)))) {
+    seqinfo(exons) <- seqinfo(txim)
+}
+exons <- exons[rownames(txim)]
+stopifnot(all(rownames(txim) == names(exons)))
+mcols(exons) <- mcols(txim)
+rowRanges(txim) <- exons
+
+eval(call("<-", as.name(paste0("transcript",suffix,".SE")),txim ))
+save(list=paste0("transcript",suffix,".SE"),file = paste0("transcript",suffix,".SE.rda") )
+rm(txim)
+gc()
+
+## 
+gr.g = gr[gr$type=="exon",]
+names(gr.g) <- gr.g$gene_id
+exons.g <- split(gr.g,names(gr.g) ) #groupGRangesBy(gr.g[gr.g$type=="exon",])
+exons.g <- exons.g[names(txim.genes)]
+
+genes2 = elementMetadata(unlist(exons.g))
+rownames(genes2) = genes2$gene_id
+genes2 = genes2[cumsum(elementNROWS(exons.g)),]
+genes2$type = "gene"
+
+assay.nms <- rownames(assay(txim.genes,"counts"))
+genes.missing <- !assay.nms %in% names(gr.g)
+genes2 <- genes2[rownames(assay(txim.genes,"counts")),]
+try(seqinfo(genes2) <- Seqinfo(genome = ucsc.genome)[seqlevels(genes2)])
+
+metadata.genes = metadata
+metadata.genes$level <- "gene"
+rowData(txim.genes) = genes2
+metadata(txim.genes)  = c(metadata(txim.genes),metadata.genes)
+metadata(txim.genes)$txomeInfo = txomeInfo
+
+## add raw read counts from files
+names(assays(txim.genes))[names(assays(txim.genes))=="counts"] = "counts_length_normalized"
+
+rcmat_files_list.g = list.files(".",pattern = "gene_count_matrix",full.names = T)
+rcmat_list.g = lapply(rcmat_files_list.g, read_csv,col_names = T)
+
+rcmat.g = rcmat_list.g[[1]]
+if(length(rcmat_list.g)>1){ 
+    for(i in 2:length(rcmat_list.g)) rcmat.g = left_join(rcmat.g,rcmat_list.g[[i]],by="gene_id")
+}
+rcmat.g = rcmat.g[match(rownames(txim.genes),rcmat.g$gene_id),c("gene_id",colnames(txim.genes))]
+write_csv(rcmat.g,path=paste0("gene_count_matrix",suffix,".csv"))
+rcmatdf.g = as.data.frame(rcmat.g[,-1])
+rownames(rcmatdf.g) = rcmat.g$gene_id
+
+colData(txim.genes)$readlength = readlengths_txim[1,]
+
+assays(txim.genes,withDimnames=F)$raw_counts = rcmatdf.g
+names(assays(txim.genes))[names(assays(txim.genes))=="abundance"] = "abundance_FPKM"
+assays(txim.genes)$abundance_TPM = sweep(assays(txim.genes)$abundance_FPKM, 2,colSums(assays(txim.genes)$abundance_FPKM),"/" )*10**6
+
+### check if still SimpleList in recent SummarizedExperiment versions
+assays(txim.genes) <- SimpleList(counts=assay(txim.genes,"raw_counts"),length=assay(txim.genes,"length"),abundance_FPKM=assay(txim.genes,"abundance_FPKM"),abundance_TPM=assay(txim.genes,"abundance_TPM") )
+
+if (all(is.na(seqlengths(exons.g)))) {
+    seqinfo(exons.g) <- seqinfo(exons)
+}
+exons.g <- exons.g[rownames(txim.genes)]
+stopifnot(all(rownames(txim.genes) == names(exons.g)))
+mcols(exons.g) <- mcols(txim.genes)
+rowRanges(txim.genes) <- exons.g
+
+eval(call("<-", as.name(paste0("gene",suffix,".SE")),txim.genes ))
+save(list=paste0("gene",suffix,".SE"),file = paste0("gene",suffix,".SE.rda") )
+rm(txim.genes)
+gc()