Skip to content

Commit

Permalink
Merge pull request #141 from GoekeLab/bambu-devel
Browse files Browse the repository at this point in the history
update documentation to bioconductor guidelines and fix small bugs

Former-commit-id: af72897
  • Loading branch information
cying111 authored Jul 28, 2020
2 parents bbdcf8b + ed21e44 commit 1ca39b7
Show file tree
Hide file tree
Showing 31 changed files with 525 additions and 318 deletions.
14 changes: 10 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: bambu
Type: Package
Title: Reference-guided isoform reconstruction and quantification for long read RNA-Seq data
Version: 0.1.0
Version: 0.3.0
Authors@R: c(person("Ying Chen", "Developer", role = "cre",email = "[email protected]"),
person("Jonathan Goeke", "Developer", role = "aut",
email = "[email protected]"))
Expand All @@ -10,7 +10,7 @@ License: GPL-3
Encoding: UTF-8
LazyData: true
Depends:
R(>= 3.5.0),
R(>= 4.0.0),
data.table(>= 1.1.8),
dplyr,
SummarizedExperiment(>= 1.1.6),
Expand All @@ -27,13 +27,19 @@ Suggests:
circlize,
ggbio,
RColorBrewer,
gridExtra
gridExtra,
BSgenome.Hsapiens.NCBI.GRCh38,
TxDb.Hsapiens.UCSC.hg38.knownGene,
AnnotationDbi,
BSgenome,
BiocGenerics,
BiocParallel,
Biostrings
Enhances: parallel
SystemRequirements:
biocViews:
FeatureExtraction,
GeneExpression,
GeneExpressionWorkflow,
GenomeAnnotation,
ImmunoOncology,
Normalization,
Expand Down
28 changes: 26 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,28 @@
useDynLib(bambu, .registration=TRUE)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp, evalCpp)

importFrom(Rcpp, show)
importFrom(Rsamtools, index)
import(RcppArmadillo)
import(RcppProgress)
import(progress)
import(data.table, except=c(last, first, shift, second, between))
import(dplyr, except=c(last, first, desc, union))
import(GenomicAlignments)
import(GenomicFeatures)
importFrom(GenomicRanges, makeGRangesListFromFeatureFragments)
importFrom(GenomicRanges, GRanges)
importFrom(GenomicRanges, makeGRangesListFromDataFrame)
importFrom(GenomicRanges, score)
import(ggplot2)
import(IRanges, except=c(slice, collapse, setdiff, intersect))
import(S4Vectors, except=c(rename, setequal, setdiff, intersect))
import(SummarizedExperiment)
export(bambu)
export(plot)
export(plot.bambu)
export(prepareAnnotations)
export(prepareAnnotationsFromGTF)
export(readFromGTF)
export(transcriptToGeneExpression)
export(writeBambuOutput)
export(writeToGTF)
2 changes: 1 addition & 1 deletion R/abundance_quantification.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ run_parallel <- function(g,conv,bias,maxiter, readClassDt){
gene_sid = g,
ntotal = sum(tmp$nobs)))
}else{
tmp_wide <- dcast(tmp[order(nobs)], tx_sid~read_class_sid, fun.agg = length,
tmp_wide <- dcast(tmp[order(nobs)], tx_sid~read_class_sid, fun.aggregate = length,
value.var = 'nobs')
a_mat <- tmp_wide[,-1,with=FALSE]
setDF(a_mat)
Expand Down
87 changes: 47 additions & 40 deletions R/annotationFunctions.R
Original file line number Diff line number Diff line change
@@ -1,52 +1,59 @@
#' Function to prepare tables and genomic ranges for transript reconstruction using a txdb object
#' @title prepare annotations from txdb object
#' @param txdb a \code{\link{TxDb}} object
#' @return A \code{\link{GrangesList}} object
#' @title prepare annotations from txdb object or gtf file
#' @param x A \code{\link{TxDb}} object or a gtf file
#' @return A \code{\link{GRangesList}} object
#' @export
#' @examples
#' \dontrun{
#' library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#' txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
#' prepareAnnotations(txdb)
#' }
prepareAnnotations <- function(txdb) {
exonsByTx = exonsBy(txdb,by='tx', use.names=TRUE)
if(any(duplicated(names(exonsByTx)))) {
warning('transcript names are not unique, only one transcript per ID will be kept')
exonsByTx <- exonsByTx[!duplicated(exonsByTx)]
#' prepareAnnotations(x = txdb)
#' gtf.file <- system.file("extdata",
#' "Homo_sapiens.GRCh38.91_chr9_1_1000000.gtf",
#' package = "bambu")
#' gr <- prepareAnnotationsFrom(x = gtf.file)
prepareAnnotations <- function(x) {
if(grepl(".gtf",x)){
return(prepareAnnotationsFromGTF(x))
}

unlistedExons <- unlist(exonsByTx, use.names = FALSE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByTx)), names=NULL)
txIdForReorder <- togroup(PartitioningByWidth(exonsByTx))
unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)] #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(exonsByTx),seq,to=1), use.names=FALSE)
unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
exonsByTx <- relist(unlistedExons, partitioning)

mcols(exonsByTx) <- suppressMessages(AnnotationDbi::select(txdb, names(exonsByTx),
columns=c("TXNAME", "GENEID"),
keytype="TXNAME"))
minEqClasses <- getMinimumEqClassByTx(exonsByTx)
mcols(exonsByTx)$eqClass <- minEqClasses$eqClass[match(names(exonsByTx),minEqClasses$queryTxId)]
return(exonsByTx)
if(class(x) == "TxDb"){
exonsByTx = exonsBy(txdb,by='tx', use.names=TRUE)
if(any(duplicated(names(exonsByTx)))) {
warning('transcript names are not unique, only one transcript per ID will be kept')
exonsByTx <- exonsByTx[!duplicated(exonsByTx)]
}

unlistedExons <- unlist(exonsByTx, use.names = FALSE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByTx)), names=NULL)
txIdForReorder <- togroup(PartitioningByWidth(exonsByTx))
unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)] #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(exonsByTx),seq,to=1), use.names=FALSE)
unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]
exonsByTx <- relist(unlistedExons, partitioning)

mcols(exonsByTx) <- suppressMessages(AnnotationDbi::select(txdb, names(exonsByTx),
columns=c("TXNAME", "GENEID"),
keytype="TXNAME"))
minEqClasses <- getMinimumEqClassByTx(exonsByTx)
mcols(exonsByTx)$eqClass <- minEqClasses$eqClass[match(names(exonsByTx),minEqClasses$queryTxId)]
return(exonsByTx)
}

}


#' Prepare annotation granges object from GTF file
#' @title Prepare annotation granges object from GTF file into a GRangesList object
#' @param file a GTF file
#' @return grlist a \code{\link{GRangesList}} object, unlike \code\link{readFromGTF}},
#' this function finds out the equivalence classes between the transcripts,
#' with \code{\link{mcols}} data having three columns:
#' @return A \code{\link{GRangesList}} object
#' @details Unlike \code{\link{readFromGTF}}, this function finds out the equivalence classes between the transcripts,
#' with \code{\link{mcols}} data having three columns:
#' \itemize{
#' \item TXNAME specifying prefix for new gene Ids (genePrefix.number), defaults to empty
#' \item GENEID indicating whether filter to remove read classes which are a subset of known transcripts(), defaults to TRUE
#' \item eqClass specifying minimun read count to consider a read class valid in a sample, defaults to 2
#' }
#'
#' @export
#' @noRd
prepareAnnotationsFromGTF <- function(file){
if (missing(file)){
stop('A GTF file is required.')
Expand Down Expand Up @@ -122,7 +129,7 @@ getMinimumEqClassByTx <- function(exonsByTranscripts) {
#' @param minoverlap defaults to 5
#' @param ignore.strand defaults to FALSE
#' @noRd
assignNewGeneIds <- function(exByTx, prefix='', minoverlap=5, ignore.strand=F){
assignNewGeneIds <- function(exByTx, prefix='', minoverlap=5, ignore.strand=FALSE){
if(is.null(names(exByTx))){
names(exByTx) <- 1:length(exByTx)
}
Expand Down Expand Up @@ -218,8 +225,8 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
spliceOverlaps <- findSpliceOverlapsByDist(exByTx,
exByTxRef,
maxDist=maxDist,
firstLastSeparate=T,
dropRangesByMinLength=T,
firstLastSeparate=TRUE,
dropRangesByMinLength=TRUE,
cutStartEnd=TRUE,
ignore.strand=ignore.strand)

Expand All @@ -243,8 +250,8 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
exByTxRef,
maxDist=0,
type='any',
firstLastSeparate=T,
dropRangesByMinLength=F,
firstLastSeparate=TRUE,
dropRangesByMinLength=FALSE,
cutStartEnd=TRUE,
ignore.strand=ignore.strand)

Expand Down Expand Up @@ -272,9 +279,9 @@ calculateDistToAnnotation <- function(exByTx, exByTxRef, maxDist = 35, primarySe
exByTxRef,
maxDist=0,
type='any',
firstLastSeparate=T,
dropRangesByMinLength=F,
cutStartEnd=F,
firstLastSeparate=TRUE,
dropRangesByMinLength=FALSE,
cutStartEnd=FALSE,
ignore.strand=ignore.strand)
if(length(spliceOverlaps_restStartEnd)>0){
txToAnTableRestStartEnd <- tbl_df(spliceOverlaps_restStartEnd) %>%
Expand Down
29 changes: 19 additions & 10 deletions R/bambu.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#' @param readClass.outputDir A string variable specifying the path to where read class files will be saved.
#' @param annotations A \code{\link{TxDb}} object or A GRangesList object obtained by \code{\link{prepareAnnotations}} or \code{\link{prepareAnnotationsFromGTF}}.
#' @param genomeSequence A fasta file or a BSGenome object.
#' @param stranded A boolean for strandedness, defaults to FALSE.
#' @param ncore specifying number of cores used when parallel processing is used, defaults to 1.
#' @param yieldSize see \code{\link{Rsamtools}}.
#' @param isoreParameters A list of controlling parameters for isoform reconstruction process:
Expand All @@ -31,14 +32,22 @@
#' @details
#' @return A list of two SummarizedExperiment object for transcript expression and gene expression.
#' @examples
#' \dontrun{
#'
#' ## =====================
#' ## More stringent new gene/isoform discovery: new isoforms are identified with at least 5 read count in 1 sample
#' ## Minimum read support 5
#' ## Increase EM convergence threshold to 10^(-6)
#' seOutput <- bambu(reads, annotationGrangesList,
#' genomeSequence, isoreParameters = list(min.readCount=5),
#' emParameters = list(conv = 10^(-6))
#' }
#' test.bam <- system.file("extdata",
#' "SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.bam",
#' package = "bambu")
#' gr <- readRDS(system.file("extdata",
#' "annotationGranges_txdbGrch38_91_chr9_1_1000000.rds",
#' package = "bambu"))
#' fa.file <- system.file("extdata",
#' "Homo_sapiens.GRCh38.dna_sm.primary_assembly_chr9_1_1000000.fa",
#' package = "bambu")
#' se = bambu(reads = test.bam, annotations = gr,
#' genomeSequence = fa.file, extendAnnotations = FALSE)
#'
#' @export
bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NULL,
annotations = NULL, genomeSequence = NULL,
Expand All @@ -50,9 +59,9 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL

#===# Check annotation inputs #===#
if(!is.null(annotations)){
if(class(annotations) == 'TxDb'){
if(is(annotations,'TxDb')){
annotations <- prepareAnnotations(annotations)
}else if(class(annotations) == "CompressedGRangesList"){
}else if(is(annotations,"CompressedGRangesList")){
## check if annotations is as expected
if(!all(c("TXNAME","GENEID","eqClass") %in% colnames(mcols(annotations)))){
stop("The annotations is not properly prepared.\nPlease prepareAnnnotations using prepareAnnotations or prepareAnnotationsFromGTF functions.")
Expand Down Expand Up @@ -124,15 +133,15 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
if(!is.null(reads)){ # calculate readClass objects

#===# create BamFileList object from character #===#
if(class(reads)=='BamFile') {
if(is(reads,'BamFile')) {
if(!is.null(yieldSize)) {
Rsamtools::yieldSize(reads) <- yieldSize
} else {
yieldSize <- Rsamtools::yieldSize(reads)
}
reads<- Rsamtools::BamFileList(reads)
names(reads) <- tools::file_path_sans_ext(BiocGenerics::basename(reads))
}else if(class(reads)=='BamFileList') {
}else if(is(reads,'BamFileList')) {
if(!is.null(yieldSize)) {
Rsamtools::yieldSize(reads) <- yieldSize
} else {
Expand Down
1 change: 1 addition & 0 deletions R/constructReadClassTables.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ constructSplicedReadClassTables <- function(uniqueJunctions, unlisted_junctions,

mcols(exonsByReadClass) <- readTable
# seqlevels(exonsByReadClass) <- seqLevelList
options(scipen = 0)
return(exonsByReadClass)
}

Expand Down
26 changes: 13 additions & 13 deletions R/isore.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,10 @@ isore.constructReadClasses <- function(readGrgList,
# seqlevels are made equal (added for chromosomes missing in any of them)
# seqlevels(readClassListSpliced) <- unique(c(seqlevels(readGrgList), seqlevels(annotationGrangesList)))

singleExonReads <- unlist(readGrgList[elementNROWS(readGrgList)==1], use.names=F)
singleExonReads <- unlist(readGrgList[elementNROWS(readGrgList)==1], use.names=FALSE)
mcols(singleExonReads)$id <- mcols(readGrgList[elementNROWS(readGrgList)==1])$id

referenceExons <- unique(c(granges(unlist(readClassListSpliced[mcols(readClassListSpliced)$confidenceType=='highConfidenceJunctionReads' & mcols(readClassListSpliced)$strand.rc!='*'], use.names=F)), granges(unlist(annotationGrangesList, use.names=F))))
referenceExons <- unique(c(granges(unlist(readClassListSpliced[mcols(readClassListSpliced)$confidenceType=='highConfidenceJunctionReads' & mcols(readClassListSpliced)$strand.rc!='*'], use.names=FALSE)), granges(unlist(annotationGrangesList, use.names=FALSE))))

readClassListUnsplicedWithAnnotation <- constructUnsplicedReadClasses(granges = singleExonReads,
grangesReference = referenceExons,
Expand Down Expand Up @@ -270,8 +270,8 @@ isore.combineTranscriptCandidates <- function(readClassSe, readClassSeRef = NULL
counts.spliced <- cbind(counts.splicedRef, counts.splicedNew)
start.spliced <- cbind(start.splicedRef, start.splicedNew)
end.spliced <- cbind(end.splicedRef, end.splicedNew)
rowData.spliced$start <- rowMins(start.spliced, na.rm=T)
rowData.spliced$end <- rowMaxs(end.spliced, na.rm=T)
rowData.spliced$start <- rowMins(start.spliced, na.rm=TRUE)
rowData.spliced$end <- rowMaxs(end.spliced, na.rm=TRUE)
rowData.spliced <- dplyr::select(rowData.spliced, chr, start, end, strand, intronStarts, intronEnds) %>%
mutate(confidenceType = 'highConfidenceJunctionReads')

Expand Down Expand Up @@ -321,36 +321,36 @@ isore.combineTranscriptCandidates <- function(readClassSe, readClassSeRef = NULL
counts.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['counts']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
mutate(index=overlapRefToCombined) %>%
group_by(index) %>%
summarise_all(sum, na.rm=T)
summarise_all(sum, na.rm=TRUE)

counts.unsplicedNewSum <- as_tibble(assays(readClassSe)[['counts']])[rowData(readClassSe)$confidenceType=='unsplicedNew',] %>%
mutate(index=overlapNewToCombined) %>%
group_by(index) %>%
summarise_all(sum, na.rm=T)
summarise_all(sum, na.rm=TRUE)

start.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['start']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
mutate(index=overlapRefToCombined) %>%
group_by(index) %>%
summarise_all(min, na.rm=T)
summarise_all(min, na.rm=TRUE)

start.unsplicedNewSum <- readClassSeTBL %>%
filter(confidenceType=='unsplicedNew') %>%
dplyr::select(start) %>%
mutate(index=overlapNewToCombined) %>%
group_by(index) %>%
summarise_all(min, na.rm=T)
summarise_all(min, na.rm=TRUE)

end.unsplicedRefSum <- as_tibble(assays(readClassSeRef)[['end']])[rowData(readClassSeRef)$confidenceType=='unsplicedNew',] %>%
mutate(index=overlapRefToCombined) %>%
group_by(index) %>%
summarise_all(max, na.rm=T)
summarise_all(max, na.rm=TRUE)

end.unsplicedNewSum <- readClassSeTBL %>%
filter(confidenceType=='unsplicedNew') %>%
dplyr::select(end) %>%
mutate(index=overlapNewToCombined) %>%
group_by(index) %>%
summarise_all(max, na.rm=T)
summarise_all(max, na.rm=TRUE)

counts.unsplicedRef <- matrix(0,
dimnames = list(1:nrow(rowData.unspliced),
Expand Down Expand Up @@ -641,7 +641,7 @@ seqlevels(exonsByReadClassUnspliced) <- unique(c(seqlevels(exonsByReadClassUnsp

if(any(is.na(mcols(seCombined)$GENEID))){

newGeneIds <- assignNewGeneIds(exonRangesCombined[is.na(mcols(seCombined)$GENEID)], prefix=prefix, minoverlap=5, ignore.strand=F)
newGeneIds <- assignNewGeneIds(exonRangesCombined[is.na(mcols(seCombined)$GENEID)], prefix=prefix, minoverlap=5, ignore.strand=FALSE)

mcols(seCombined)$GENEID[as.integer(newGeneIds$readClassId)] <- newGeneIds$geneId
}
Expand All @@ -659,7 +659,7 @@ seqlevels(exonsByReadClassUnspliced) <- unique(c(seqlevels(exonsByReadClassUnsp
ungroup() %>%
dplyr::select(-geneId)
relCounts <- assays(seCombined)$counts / countsTBL
filterTxUsage <- rowSums(relCounts >= min.readFractionByGene, na.rm=T) >= min.sampleNumber
filterTxUsage <- rowSums(relCounts >= min.readFractionByGene, na.rm=TRUE) >= min.sampleNumber
seCombinedFiltered <- seCombined[filterTxUsage]
exonRangesCombinedFiltered <- exonRangesCombined[filterTxUsage]

Expand Down Expand Up @@ -767,7 +767,7 @@ isore.estimateDistanceToAnnotations <- function(seReadClass, annotationGrangesLi
readClassToGeneIdTableNew <- assignNewGeneIds(rowRanges(seReadClass)[newGeneCandidates],
prefix='.unassigned',
minoverlap=5,
ignore.strand=F)
ignore.strand=FALSE)
readClassGeneTable <- rbind(readClassToGeneIdTable, readClassToGeneIdTableNew)
readClassTable <- left_join(readClassTable, readClassGeneTable, by="readClassId") %>%
dplyr::select(confidenceType, geneId, compatible, equal)
Expand Down
Loading

0 comments on commit 1ca39b7

Please sign in to comment.