Merge pull request #141 from GoekeLab/bambu-devel

update documentation to bioconductor guidelines and fix small bugs Former-commit-id: af72897
GoekeLab · Jul 28, 2020 · 1ca39b7 · 1ca39b7
2 parents bbdcf8b + ed21e44
commit 1ca39b7
Show file tree

Hide file tree

Showing 31 changed files with 525 additions and 318 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bambu
 Type: Package
 Title: Reference-guided isoform reconstruction and quantification for long read RNA-Seq data
-Version: 0.1.0
+Version: 0.3.0
 Authors@R: c(person("Ying Chen", "Developer", role = "cre",email = "[email protected]"),
              person("Jonathan Goeke", "Developer", role = "aut",
                      email = "[email protected]"))
@@ -10,7 +10,7 @@ License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Depends: 
-    R(>= 3.5.0), 
+    R(>= 4.0.0), 
     data.table(>= 1.1.8), 
     dplyr, 
     SummarizedExperiment(>= 1.1.6),
@@ -27,13 +27,19 @@ Suggests:
     circlize,
     ggbio,
     RColorBrewer,
-    gridExtra
+    gridExtra,
+    BSgenome.Hsapiens.NCBI.GRCh38,
+    TxDb.Hsapiens.UCSC.hg38.knownGene,
+    AnnotationDbi,
+    BSgenome,
+    BiocGenerics,
+    BiocParallel,
+    Biostrings
 Enhances: parallel
 SystemRequirements:
 biocViews: 
     FeatureExtraction, 
     GeneExpression, 
-    GeneExpressionWorkflow, 
     GenomeAnnotation,
     ImmunoOncology, 
     Normalization, 

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,28 @@
 useDynLib(bambu, .registration=TRUE)
-exportPattern("^[[:alpha:]]+")
 importFrom(Rcpp, evalCpp)
-
+importFrom(Rcpp, show)
+importFrom(Rsamtools, index)
+import(RcppArmadillo)
+import(RcppProgress)
+import(progress)
+import(data.table, except=c(last, first, shift, second, between))
+import(dplyr, except=c(last, first, desc, union))
+import(GenomicAlignments)
+import(GenomicFeatures)
+importFrom(GenomicRanges, makeGRangesListFromFeatureFragments)
+importFrom(GenomicRanges, GRanges)
+importFrom(GenomicRanges, makeGRangesListFromDataFrame)
+importFrom(GenomicRanges, score)
+import(ggplot2)
+import(IRanges, except=c(slice, collapse, setdiff, intersect))
+import(S4Vectors, except=c(rename, setequal, setdiff, intersect))
+import(SummarizedExperiment)
+export(bambu)
+export(plot)
+export(plot.bambu)
+export(prepareAnnotations)
+export(prepareAnnotationsFromGTF)
+export(readFromGTF)
+export(transcriptToGeneExpression)
+export(writeBambuOutput)
+export(writeToGTF)
diff --git a/R/abundance_quantification.R b/R/abundance_quantification.R
@@ -54,7 +54,7 @@ run_parallel <- function(g,conv,bias,maxiter, readClassDt){
                                gene_sid = g,
                                ntotal = sum(tmp$nobs)))
   }else{
-    tmp_wide <- dcast(tmp[order(nobs)], tx_sid~read_class_sid, fun.agg = length,
+    tmp_wide <- dcast(tmp[order(nobs)], tx_sid~read_class_sid, fun.aggregate = length,
                       value.var = 'nobs')
     a_mat <- tmp_wide[,-1,with=FALSE]
     setDF(a_mat)

diff --git a/R/annotationFunctions.R b/R/annotationFunctions.R
@@ -1,52 +1,59 @@
 #' Function to prepare tables and genomic ranges for transript reconstruction using a txdb object
-#' @title prepare annotations from txdb object
-#' @param txdb a \code{\link{TxDb}} object
-#' @return A \code{\link{GrangesList}} object
+#' @title prepare annotations from txdb object or gtf file
+#' @param x A \code{\link{TxDb}} object or a gtf file
+#' @return A \code{\link{GRangesList}} object
 #' @export
 #' @examples
-#' \dontrun{
 #'  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
 #'  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
-#'  prepareAnnotations(txdb)
-#'  }
-prepareAnnotations <- function(txdb) {
-  exonsByTx = exonsBy(txdb,by='tx', use.names=TRUE)
-  if(any(duplicated(names(exonsByTx)))) {
-    warning('transcript names are not unique, only one transcript per ID will be kept')
-    exonsByTx <- exonsByTx[!duplicated(exonsByTx)]
+#'  prepareAnnotations(x = txdb)
+#'  gtf.file <- system.file("extdata", 
+#'  "Homo_sapiens.GRCh38.91_chr9_1_1000000.gtf", 
+#'  package = "bambu")
+#'  gr <- prepareAnnotationsFrom(x = gtf.file)
+prepareAnnotations <- function(x) {
+  if(grepl(".gtf",x)){
+    return(prepareAnnotationsFromGTF(x))
   }
-
-  unlistedExons <- unlist(exonsByTx, use.names = FALSE)
-  partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByTx)), names=NULL)
-  txIdForReorder <- togroup(PartitioningByWidth(exonsByTx))
-  unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)]  #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
-  unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(exonsByTx),seq,to=1), use.names=FALSE)
-  unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
-  mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
-  exonsByTx <- relist(unlistedExons, partitioning)
-
-  mcols(exonsByTx) <-  suppressMessages(AnnotationDbi::select(txdb, names(exonsByTx),
-                                             columns=c("TXNAME", "GENEID"),
-                                             keytype="TXNAME"))
-  minEqClasses <- getMinimumEqClassByTx(exonsByTx)
-  mcols(exonsByTx)$eqClass <- minEqClasses$eqClass[match(names(exonsByTx),minEqClasses$queryTxId)]
-  return(exonsByTx)
+  if(class(x) == "TxDb"){
+    exonsByTx = exonsBy(txdb,by='tx', use.names=TRUE)
+    if(any(duplicated(names(exonsByTx)))) {
+      warning('transcript names are not unique, only one transcript per ID will be kept')
+      exonsByTx <- exonsByTx[!duplicated(exonsByTx)]
+    }
+
+    unlistedExons <- unlist(exonsByTx, use.names = FALSE)
+    partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByTx)), names=NULL)
+    txIdForReorder <- togroup(PartitioningByWidth(exonsByTx))
+    unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)]  #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
+    unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(exonsByTx),seq,to=1), use.names=FALSE)
+    unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
+    mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
+    exonsByTx <- relist(unlistedExons, partitioning)
+
+    mcols(exonsByTx) <-  suppressMessages(AnnotationDbi::select(txdb, names(exonsByTx),
+                                                                columns=c("TXNAME", "GENEID"),
+                                                                keytype="TXNAME"))
+    minEqClasses <- getMinimumEqClassByTx(exonsByTx)
+    mcols(exonsByTx)$eqClass <- minEqClasses$eqClass[match(names(exonsByTx),minEqClasses$queryTxId)]
+    return(exonsByTx)
+  }
+
 }
 
 
 #' Prepare annotation granges object from GTF file 
 #' @title Prepare annotation granges object from GTF file  into a GRangesList object
 #' @param file a GTF file
-#' @return grlist a \code{\link{GRangesList}} object, unlike \code\link{readFromGTF}}, 
-#' this function finds out the equivalence classes between the transcripts, 
-#' with \code{\link{mcols}} data having three columns:
+#' @return A \code{\link{GRangesList}} object
+#' @details Unlike \code{\link{readFromGTF}}, this function finds out the equivalence classes between the transcripts,
+#' with \code{\link{mcols}} data having three columns: 
 #' \itemize{
 #'   \item TXNAME specifying prefix for new gene Ids (genePrefix.number), defaults to empty
 #'   \item GENEID indicating whether filter to remove read classes which are a subset of known transcripts(), defaults to TRUE
 #'   \item eqClass specifying minimun read count to consider a read class valid in a sample, defaults to 2
 #'   }
-#' 
-#' @export
+#' @noRd
 prepareAnnotationsFromGTF <- function(file){
   if (missing(file)){
     stop('A GTF file is required.')
@@ -122,7 +129,7 @@ getMinimumEqClassByTx <- function(exonsByTranscripts) {
 #' @param minoverlap defaults to 5
 #' @param ignore.strand defaults to FALSE
 #' @noRd
-assignNewGeneIds <- function(exByTx, prefix='', minoverlap=5, ignore.strand=F){
+assignNewGeneIds <- function(exByTx, prefix='', minoverlap=5, ignore.strand=FALSE){
   if(is.null(names(exByTx))){
     names(exByTx) <- 1:length(exByTx)
   }
@@ -218,8 +225,8 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
   spliceOverlaps <- findSpliceOverlapsByDist(exByTx,
                                              exByTxRef,
                                              maxDist=maxDist,
-                                             firstLastSeparate=T,
-                                             dropRangesByMinLength=T,
+                                             firstLastSeparate=TRUE,
+                                             dropRangesByMinLength=TRUE,
                                              cutStartEnd=TRUE,
                                              ignore.strand=ignore.strand)
 
@@ -243,8 +250,8 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
                                                   exByTxRef,
                                                   maxDist=0,
                                                   type='any',
-                                                  firstLastSeparate=T,
-                                                  dropRangesByMinLength=F,
+                                                  firstLastSeparate=TRUE,
+                                                  dropRangesByMinLength=FALSE,
                                                   cutStartEnd=TRUE,
                                                   ignore.strand=ignore.strand)
 
@@ -272,9 +279,9 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
                                                             exByTxRef,
                                                             maxDist=0,
                                                             type='any',
-                                                            firstLastSeparate=T,
-                                                            dropRangesByMinLength=F,
-                                                            cutStartEnd=F,
+                                                            firstLastSeparate=TRUE,
+                                                            dropRangesByMinLength=FALSE,
+                                                            cutStartEnd=FALSE,
                                                             ignore.strand=ignore.strand)
     if(length(spliceOverlaps_restStartEnd)>0){
       txToAnTableRestStartEnd <- tbl_df(spliceOverlaps_restStartEnd) %>%

diff --git a/R/bambu.R b/R/bambu.R
@@ -8,6 +8,7 @@
 #' @param readClass.outputDir A string variable specifying the path to where read class files will be saved.
 #' @param annotations A \code{\link{TxDb}} object or A GRangesList object obtained by \code{\link{prepareAnnotations}} or \code{\link{prepareAnnotationsFromGTF}}.
 #' @param genomeSequence A fasta file or a BSGenome object.
+#' @param stranded A boolean for strandedness, defaults to FALSE.
 #' @param ncore specifying number of cores used when parallel processing is used, defaults to 1.
 #' @param yieldSize see \code{\link{Rsamtools}}.
 #' @param isoreParameters A list of controlling parameters for isoform reconstruction process:
@@ -31,14 +32,22 @@
 #' @details
 #' @return A list of two SummarizedExperiment object for transcript expression and gene expression.
 #' @examples
-#' \dontrun{
+#' 
 #' ## =====================
-#' ## More stringent new gene/isoform discovery: new isoforms are identified with at least 5 read count in 1 sample
+#' ## Minimum read support 5
 #' ## Increase EM convergence threshold to 10^(-6)
-#' seOutput <- bambu(reads, annotationGrangesList,
-#' genomeSequence, isoreParameters = list(min.readCount=5),
-#' emParameters = list(conv = 10^(-6))
-#' }
+#'  test.bam <- system.file("extdata", 
+#'  "SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.bam", 
+#'  package = "bambu")
+#'  gr <- readRDS(system.file("extdata", 
+#'  "annotationGranges_txdbGrch38_91_chr9_1_1000000.rds", 
+#'  package = "bambu"))
+#'  fa.file <- system.file("extdata", 
+#'  "Homo_sapiens.GRCh38.dna_sm.primary_assembly_chr9_1_1000000.fa", 
+#'  package = "bambu")
+#'  se = bambu(reads = test.bam,  annotations = gr, 
+#'  genomeSequence = fa.file, extendAnnotations = FALSE)
+#' 
 #' @export
 bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NULL, 
                   annotations = NULL, genomeSequence = NULL,
@@ -50,9 +59,9 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
 
   #===# Check annotation inputs #===#
   if(!is.null(annotations)){
-      if(class(annotations) == 'TxDb'){
+      if(is(annotations,'TxDb')){
         annotations <- prepareAnnotations(annotations)
-      }else if(class(annotations) == "CompressedGRangesList"){
+      }else if(is(annotations,"CompressedGRangesList")){
         ## check if annotations is as expected
         if(!all(c("TXNAME","GENEID","eqClass") %in% colnames(mcols(annotations)))){
          stop("The annotations is not properly prepared.\nPlease prepareAnnnotations using prepareAnnotations or prepareAnnotationsFromGTF functions.")
@@ -124,15 +133,15 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
     if(!is.null(reads)){  # calculate readClass objects
 
       #===# create BamFileList object from character #===#
-      if(class(reads)=='BamFile') {
+      if(is(reads,'BamFile')) {
         if(!is.null(yieldSize)) {
           Rsamtools::yieldSize(reads) <- yieldSize
         } else {
           yieldSize <- Rsamtools::yieldSize(reads)
         }
         reads<- Rsamtools::BamFileList(reads)
         names(reads) <- tools::file_path_sans_ext(BiocGenerics::basename(reads))
-      }else if(class(reads)=='BamFileList') {
+      }else if(is(reads,'BamFileList')) {
         if(!is.null(yieldSize)) {
           Rsamtools::yieldSize(reads) <- yieldSize
         } else {

diff --git a/R/constructReadClassTables.R b/R/constructReadClassTables.R
@@ -112,6 +112,7 @@ constructSplicedReadClassTables <- function(uniqueJunctions, unlisted_junctions,
 
   mcols(exonsByReadClass) <- readTable
  # seqlevels(exonsByReadClass) <- seqLevelList
+  options(scipen = 0)
   return(exonsByReadClass)
 }
 

diff --git a/R/isore.R b/R/isore.R
@@ -140,10 +140,10 @@ isore.constructReadClasses <- function(readGrgList,
   # seqlevels are made equal (added for chromosomes missing in any of them)
   # seqlevels(readClassListSpliced) <- unique(c(seqlevels(readGrgList), seqlevels(annotationGrangesList)))
 
-  singleExonReads <- unlist(readGrgList[elementNROWS(readGrgList)==1], use.names=F)
+  singleExonReads <- unlist(readGrgList[elementNROWS(readGrgList)==1], use.names=FALSE)
   mcols(singleExonReads)$id <- mcols(readGrgList[elementNROWS(readGrgList)==1])$id
 
-  referenceExons <- unique(c(granges(unlist(readClassListSpliced[mcols(readClassListSpliced)$confidenceType=='highConfidenceJunctionReads' & mcols(readClassListSpliced)$strand.rc!='*'], use.names=F)), granges(unlist(annotationGrangesList, use.names=F))))
+  referenceExons <- unique(c(granges(unlist(readClassListSpliced[mcols(readClassListSpliced)$confidenceType=='highConfidenceJunctionReads' & mcols(readClassListSpliced)$strand.rc!='*'], use.names=FALSE)), granges(unlist(annotationGrangesList, use.names=FALSE))))
 
   readClassListUnsplicedWithAnnotation <- constructUnsplicedReadClasses(granges = singleExonReads,
                                                                         grangesReference = referenceExons,
@@ -270,8 +270,8 @@ isore.combineTranscriptCandidates <- function(readClassSe, readClassSeRef = NULL
     counts.spliced <- cbind(counts.splicedRef, counts.splicedNew)
     start.spliced <- cbind(start.splicedRef, start.splicedNew)
     end.spliced <- cbind(end.splicedRef, end.splicedNew)
-    rowData.spliced$start <- rowMins(start.spliced, na.rm=T)
-    rowData.spliced$end <- rowMaxs(end.spliced, na.rm=T)
+    rowData.spliced$start <- rowMins(start.spliced, na.rm=TRUE)
+    rowData.spliced$end <- rowMaxs(end.spliced, na.rm=TRUE)
     rowData.spliced <- dplyr::select(rowData.spliced, chr, start, end, strand, intronStarts, intronEnds) %>%
       mutate(confidenceType = 'highConfidenceJunctionReads')
 
@@ -321,36 +321,36 @@ isore.combineTranscriptCandidates <- function(readClassSe, readClassSeRef = NULL
     counts.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['counts']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
       mutate(index=overlapRefToCombined) %>%
       group_by(index) %>%
-      summarise_all(sum, na.rm=T)
+      summarise_all(sum, na.rm=TRUE)
 
     counts.unsplicedNewSum <- as_tibble(assays(readClassSe)[['counts']])[rowData(readClassSe)$confidenceType=='unsplicedNew',] %>%
       mutate(index=overlapNewToCombined) %>%
       group_by(index) %>%
-      summarise_all(sum, na.rm=T)
+      summarise_all(sum, na.rm=TRUE)
 
     start.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['start']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
       mutate(index=overlapRefToCombined) %>%
       group_by(index) %>%
-      summarise_all(min, na.rm=T)
+      summarise_all(min, na.rm=TRUE)
 
     start.unsplicedNewSum <- readClassSeTBL %>%
       filter(confidenceType=='unsplicedNew') %>%
       dplyr::select(start) %>%
       mutate(index=overlapNewToCombined) %>%
       group_by(index) %>%
-      summarise_all(min, na.rm=T)
+      summarise_all(min, na.rm=TRUE)
 
     end.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['end']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
       mutate(index=overlapRefToCombined) %>%
       group_by(index) %>%
-      summarise_all(max, na.rm=T)
+      summarise_all(max, na.rm=TRUE)
 
     end.unsplicedNewSum <- readClassSeTBL %>%
       filter(confidenceType=='unsplicedNew') %>%
       dplyr::select(end) %>%
       mutate(index=overlapNewToCombined) %>%
       group_by(index) %>%
-      summarise_all(max, na.rm=T)
+      summarise_all(max, na.rm=TRUE)
 
     counts.unsplicedRef <- matrix(0,
                                   dimnames = list(1:nrow(rowData.unspliced),
@@ -641,7 +641,7 @@ seqlevels(exonsByReadClassUnspliced) <-  unique(c(seqlevels(exonsByReadClassUnsp
 
     if(any(is.na(mcols(seCombined)$GENEID))){
 
-      newGeneIds <- assignNewGeneIds(exonRangesCombined[is.na(mcols(seCombined)$GENEID)], prefix=prefix, minoverlap=5, ignore.strand=F)
+      newGeneIds <- assignNewGeneIds(exonRangesCombined[is.na(mcols(seCombined)$GENEID)], prefix=prefix, minoverlap=5, ignore.strand=FALSE)
 
       mcols(seCombined)$GENEID[as.integer(newGeneIds$readClassId)] <- newGeneIds$geneId
     }
@@ -659,7 +659,7 @@ seqlevels(exonsByReadClassUnspliced) <-  unique(c(seqlevels(exonsByReadClassUnsp
       ungroup() %>%
       dplyr::select(-geneId)
     relCounts <- assays(seCombined)$counts / countsTBL
-    filterTxUsage <- rowSums(relCounts >= min.readFractionByGene, na.rm=T) >= min.sampleNumber
+    filterTxUsage <- rowSums(relCounts >= min.readFractionByGene, na.rm=TRUE) >= min.sampleNumber
     seCombinedFiltered <- seCombined[filterTxUsage]
     exonRangesCombinedFiltered <- exonRangesCombined[filterTxUsage]
 
@@ -767,7 +767,7 @@ isore.estimateDistanceToAnnotations <- function(seReadClass, annotationGrangesLi
   readClassToGeneIdTableNew <- assignNewGeneIds(rowRanges(seReadClass)[newGeneCandidates],
                                                 prefix='.unassigned',
                                                 minoverlap=5,
-                                                ignore.strand=F)
+                                                ignore.strand=FALSE)
   readClassGeneTable <- rbind(readClassToGeneIdTable, readClassToGeneIdTableNew)
   readClassTable <- left_join(readClassTable, readClassGeneTable, by="readClassId") %>%
     dplyr::select(confidenceType, geneId, compatible, equal)