Merge pull request #120 from GoekeLab/master

update devel branch Former-commit-id: f431818
GoekeLab · Jun 22, 2020 · 0d8d943 · 0d8d943
2 parents 36e1456 + c72e637
commit 0d8d943
Show file tree

Hide file tree

Showing 28 changed files with 332 additions and 290 deletions.
diff --git a/R/annotationFunctions.R b/R/annotationFunctions.R
@@ -57,19 +57,37 @@ prepareAnnotationsFromGTF <- function(file){
     data$strand[data$strand=='.'] <- '*'
     data$GENEID = gsub('gene_id (.*?);.*','\\1',data$attribute)
     data$TXNAME=gsub('.*transcript_id (.*?);.*', '\\1',data$attribute)
-    data$exon_rank=as.integer(gsub('.*exon_number (.*?);.*', '\\1',data$attribute))
+    #data$exon_rank=as.integer(gsub('.*exon_number (.*?);.*', '\\1',data$attribute))
+
     geneData=unique(data[,c('TXNAME', 'GENEID')])
     grlist <- makeGRangesListFromDataFrame(
-      data[,c('seqname', 'start','end','strand','exon_rank','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE)
-
+      data[,c('seqname', 'start','end','strand','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE)
+    grlist <- grlist[IRanges::order(start(grlist))]
+
     unlistedExons <- unlist(grlist, use.names = FALSE)
     partitioning <- PartitioningByEnd(cumsum(elementNROWS(grlist)), names=NULL)
     txIdForReorder <- togroup(PartitioningByWidth(grlist))
-    unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)] #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
+
+    exon_rank <- sapply(elementNROWS(grlist), seq, from = 1)
+    exon_rank[which(unlist(unique(strand(grlist)))=="-")] <- lapply(exon_rank[which(unlist(unique(strand(grlist)))=="-")], rev)  # * assumes positive for exon ranking
+    names(exon_rank) <- NULL
+    unlistedExons$exon_rank <- unlist(exon_rank)
+
+
+    unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)]  #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
     unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(grlist),seq,to=1), use.names=FALSE)
     unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
-    #    mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
+
+
+    mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
+
     grlist <- relist(unlistedExons, partitioning)
+
+    # the grlist from gtf is ranked by exon_number instead of start position
+    # to be consistent with prepareAnnotations
+    # need to sort the grlist by start position
+
+
     minEqClasses <- getMinimumEqClassByTx(grlist)
     mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$TXNAME)),])
     mcols(grlist)$eqClass <- minEqClasses$eqClass[match(names(grlist),minEqClasses$queryTxId)]

diff --git a/R/bambu.R b/R/bambu.R
@@ -100,7 +100,7 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
     isoreParameters <- isoreParameters.default
 
     ## check quantification parameters
-    emParameters.default <- list(bias = FALSE,
+    emParameters.default <- list(bias = TRUE,
                                  maxiter = 10000,
                                  conv = 10^(-4))
 
@@ -210,14 +210,19 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
 #' @noRd
 bambu.extendAnnotations <- function(readClassList, annotations, isoreParameters, verbose = FALSE){
   combinedTxCandidates = NULL
+  start.ptm <- proc.time()
   for(readClassIndex in seq_along(readClassList)){
     readClass <- readClassList[[readClassIndex]]
     if(is.character(readClass)){
       readClass <- readRDS(file=readClass)
       seqlevelsStyle(readClass) <- seqlevelsStyle(annotations)[1]
     }
     combinedTxCandidates <- isore.combineTranscriptCandidates(readClass, readClassSeRef = combinedTxCandidates, verbose = verbose)
+    rm(readClass)
+    gc(verbose=FALSE)
   }
+  end.ptm <- proc.time()
+  if(verbose)  message('combining transcripts in ', round((end.ptm-start.ptm)[3]/60,1), ' mins.')
   annotations <- isore.extendAnnotations(se=combinedTxCandidates,
                                          annotationGrangesList=annotations,
                                          remove.subsetTx = isoreParameters[['remove.subsetTx']],
@@ -228,6 +233,8 @@ bambu.extendAnnotations <- function(readClassList, annotations, isoreParameters,
                                          min.exonOverlap = isoreParameters[['min.exonOverlap']],
                                          prefix = isoreParameters[['prefix']],
                                          verbose = verbose)
+  rm(combinedTxCandidates)
+  gc(verbose=FALSE)
   return(annotations)
 }
 
@@ -241,16 +248,21 @@ bambu.quantify <- function(readClass, annotations, emParameters, min.exonDistanc
   }
 
   readClass <- isore.estimateDistanceToAnnotations(readClass, annotations, min.exonDistance = min.exonDistance, verbose = verbose)
+  gc(verbose=FALSE)
   readClassDt <- getEmptyClassFromSE(readClass, annotations)
   colNameRC <- colnames(readClass)
   colDataRC <- colData(readClass)
   rm(readClass)
   gc(verbose=FALSE)
   counts <- bambu.quantDT(readClassDt,emParameters = emParameters, ncore = ncore, verbose = verbose)
+  rm(readClassDt)
+  gc(verbose=FALSE)
   if(length(setdiff(counts$tx_name,names(annotations)))>0){
     stop("The provided annotation is incomplete")
   }
   counts <- counts[data.table(tx_name = names(annotations)),  on = 'tx_name']
+  rm(annotations)
+  gc(verbose=FALSE)
   counts[is.na(estimates),`:=`(estimates = 0, CPM = 0) ]
 
   seOutput <- SummarizedExperiment::SummarizedExperiment(assays = SimpleList(counts = matrix(counts$estimates,ncol = 1, dimnames = list(NULL, colNameRC)),

diff --git a/R/junctionCorrection.R b/R/junctionCorrection.R
@@ -7,23 +7,38 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,
 
   if(is.null(genomeSequence)){
     stop("Reference genome sequence is missing, please provide fasta file or BSgenome name, see available.genomes()")
-  }else if(class(genomeSequence) != 'FaFile'){
+  }
+
+  original_seqlevelstyle <- seqlevelsStyle(unlisted_junction_granges)[1]
+
+  if(class(genomeSequence) != 'FaFile'){
     if(grepl('.fa',genomeSequence)){
       genomeSequence <- Rsamtools::FaFile(genomeSequence)
+
+      if(seqlevelsStyle(genomeSequence)[1]  != seqlevelsStyle(unlisted_junction_granges)[1]){
+        seqlevelsStyle(unlisted_junction_granges) <- seqlevelsStyle(genomeSequence)[1] 
+      }
+
     }else {
       if (!suppressWarnings(require(BSgenome, quietly=TRUE)))
         stop("Please install the BSgenome package")
-
+      
       genomeSequence <- BSgenome::getBSgenome(genomeSequence)
       seqlevelsStyle(genomeSequence) <- seqlevelsStyle(unlisted_junction_granges)[1]
-      if(!all(seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence))) {
-        message("not all chromosomes present in reference genome sequence, ranges are dropped")
-        unlisted_junction_granges <- keepSeqlevels(unlisted_junction_granges,
-                                                   value = seqlevels(unlisted_junction_granges)[seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence)],
-                                                   pruning.mode = 'coarse')
-      }
     }
   }
+
+  if(class(genomeSequence) == 'FaFile'){
+    if(seqlevelsStyle(genomeSequence)[1]  != seqlevelsStyle(unlisted_junction_granges)[1]){
+      seqlevelsStyle(unlisted_junction_granges) <- seqlevelsStyle(genomeSequence)[1] 
+    }
+  }
+  if(!all(seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence))) {
+    message("not all chromosomes present in reference genome sequence, ranges are dropped")
+    unlisted_junction_granges <- keepSeqlevels(unlisted_junction_granges,
+                                               value = seqlevels(unlisted_junction_granges)[seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence)],
+                                               pruning.mode = 'coarse')
+  }
 
   ##Todo: don't create junction names, instead work with indices/intergers (names are memory intensive)
 
@@ -39,20 +54,20 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,
 
   # the code now includes a parallel implementation, which is only helpful when the BSgenome package is used
 
-  #junctionSeqStart<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2),2)) # shift: from IRanges
-  #junctionSeqEnd<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2)) # shift: from IRanges
-
-  bpParameters <- BiocParallel::bpparam()
-  bpParameters$workers <- ncore
-  junctionSeqStart <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2),2),
-                            BSgenome::getSeq,
-                            x = genomeSequence,
-                            BPPARAM=bpParameters)
+  junctionSeqStart<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2),2)) # shift: from IRanges
+  junctionSeqEnd<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2)) # shift: from IRanges
 
-  junctionSeqEnd <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2),
-                                        BSgenome::getSeq,
-                          x = genomeSequence,
-                          BPPARAM=bpParameters)
+  # bpParameters <- BiocParallel::bpparam()
+  # bpParameters$workers <- ncore
+  # junctionSeqStart <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2),2),
+  #                           BSgenome::getSeq,
+  #                           x = genomeSequence,
+  #                           BPPARAM=bpParameters)
+  # 
+  # junctionSeqEnd <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2),
+  #                                       BSgenome::getSeq,
+  #                         x = genomeSequence,
+  #                         BPPARAM=bpParameters)
 
 
   junctionMotif <- paste(junctionSeqStart,junctionSeqEnd,sep='-')
@@ -75,7 +90,9 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,
                                       id=1:length(uniqueJunctions))
 
   strand(uniqueJunctions)<-uniqueJunctions$spliceStrand
-
+  if(original_seqlevelstyle != seqlevelsStyle(unlisted_junction_granges)[1]){
+    seqlevelsStyle(uniqueJunctions) <- original_seqlevelstyle
+  }
   return(uniqueJunctions)
 }
 

diff --git a/R/plotBambu.R b/R/plotBambu.R
@@ -1,6 +1,6 @@
 #' plotSEOuptut
 #' @title plot.bambu
-#' @param se An summarized experiment object obtained from \code\link{bambu} or \code{\link{transcriptToGene}}.
+#' @param se An summarized experiment object obtained from \code{\link{bambu}} or \code{\link{transcriptToGene}}.
 #' @param group.variable Variable for grouping in plot, has be to provided if choosing to plot PCA.
 #' @param type plot type variable, a values of annotation for a single gene with heatmap for isoform expressions,  pca,  or heatmap, see \code{\link{details}}.
 #' @param gene_id specifying the gene_id for plotting gene annotation, either gene_id or transcript_id has to be provided when type = "annotation".

diff --git a/R/prepareBam.R b/R/prepareBam.R
@@ -2,7 +2,7 @@
 #' @param bamFile bamFile
 #' @inheritParams bambu
 #' @noRd
-prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1) {
+prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore = 1) {
 
   if(class(bamFile)=='BamFile') {
     if(!is.null(yieldSize)) {
@@ -21,19 +21,6 @@ prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1
     bamFile <- Rsamtools::BamFile(bamFile, yieldSize = yieldSize)
   }
 
-  # parallel processing of single files by reading chromosomes separately
-  if(ncore>1){ 
-
-    chr <- scanBamHeader(bamFile)[[1]]
-    chrRanges <- GRanges(seqnames=names(chr), ranges=IRanges(start=1, end=chr))
-    readGrgList <- mclapply(1:length(chr),
-                            helpFun,
-                            chrRanges=chrRanges,
-                            bamFile=bamFile,
-                            mc.cores = ncore)
-   } else {
-
-
   bf <- open(bamFile)
   #readGrgList <- GenomicRanges::GRangesList()
 
@@ -51,7 +38,7 @@ prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1
   }
 
   close(bf)
-  }
+
   if(length(readGrgList)>1) {
     readGrgList <-  do.call(c, readGrgList)
   } else   {

diff --git a/R/readWrite.R b/R/readWrite.R
@@ -17,8 +17,11 @@ writeBambuOutput <- function(se,path){
     transcript_gtffn <- paste(path,"transcript_exon.gtf",sep="")
     gtf <- writeToGTF(annotation=transcript_grList,file=transcript_gtffn)
     transcript_counts <- as.data.frame(assays(se)$counts)
+    geneIDs <- data.frame(mcols(transcript_grList, use.names=FALSE)$TXNAME,mcols(transcript_grList, use.names=FALSE)$GENEID)
+    colnames(geneIDs) <- c("TXNAME","GENEID")
+    transcript_counts <- cbind(geneIDs,transcript_counts)  
     transcript_countsfn <- paste(path,"counts_transcript.txt",sep="")
-    write.table(transcript_counts, file= transcript_countsfn, sep="\t",quote=FALSE)
+    write.table(transcript_counts, file= transcript_countsfn, sep="\t",quote=FALSE,row.names= FALSE)
     gene_se <- transcriptToGeneExpression(se)
     gene_counts <- as.data.frame(assays(gene_se)$counts)
     gene_countsfn <- paste(path,"counts_gene.txt",sep="")

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R
@@ -28,7 +28,7 @@ transcriptToGeneExpression<- function(se){
   counts <- rowDataSe[,.(TXNAME,GENEID)][counts, on = "TXNAME"]
 
   counts[, valueGene:=sum(value), by = list(variable, GENEID)]
-  counts[, valueGeneCPM:=valueGene/sum(value)*10^6, by = list(variable)]
+  counts[, valueGeneCPM:=valueGene/max(sum(value),1)*10^6, by = list(variable)]
 
   ## counts
   counts_gene <- dcast(unique(counts[,.(GENEID, variable, valueGene)]), GENEID ~ variable, value.var = "valueGene")

diff --git a/README.md b/README.md
@@ -190,6 +190,10 @@ writeBambuOutput(se, path = "./bambu/")
 
 ### Release History
 
+**bambu version 0.2.0**
+
+Release date: 18th June 2020
+
 **bambu version 0.1.0**
 
 Release date: 29th May 2020

diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R
@@ -64,6 +64,11 @@ seCombinedGeneExpected <- transcriptToGeneExpression(seCombined)
 seCombinedExtendedGeneExpected <- transcriptToGeneExpression(seCombinedExtended)
 
 
+set.seed(1234)
+seqlevelsStyle(gr) <- "UCSC"
+seUCSCExpected = bambu(reads = test.bam,  annotations = gr, genomeSequence = fa.file, extendAnnotations = FALSE)
+
+
 
 
 usethis::use_data(data1,data2,data3,data4,data5,
@@ -73,6 +78,7 @@ usethis::use_data(data1,data2,data3,data4,data5,
                   seWithDistExpected,
                   seGeneExpected,seExtendedGeneExpected,
                   seCombinedGeneExpected,seCombinedExtendedGeneExpected,
+                  seUCSCExpected,
                   internal = TRUE,overwrite = TRUE)