Skip to content

Commit

Permalink
Merge pull request #120 from GoekeLab/master
Browse files Browse the repository at this point in the history
update devel branch 

Former-commit-id: f431818
  • Loading branch information
cying111 authored Jun 22, 2020
2 parents 36e1456 + c72e637 commit 0d8d943
Show file tree
Hide file tree
Showing 28 changed files with 332 additions and 290 deletions.
28 changes: 23 additions & 5 deletions R/annotationFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,37 @@ prepareAnnotationsFromGTF <- function(file){
data$strand[data$strand=='.'] <- '*'
data$GENEID = gsub('gene_id (.*?);.*','\\1',data$attribute)
data$TXNAME=gsub('.*transcript_id (.*?);.*', '\\1',data$attribute)
data$exon_rank=as.integer(gsub('.*exon_number (.*?);.*', '\\1',data$attribute))
#data$exon_rank=as.integer(gsub('.*exon_number (.*?);.*', '\\1',data$attribute))

geneData=unique(data[,c('TXNAME', 'GENEID')])
grlist <- makeGRangesListFromDataFrame(
data[,c('seqname', 'start','end','strand','exon_rank','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE)

data[,c('seqname', 'start','end','strand','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE)
grlist <- grlist[IRanges::order(start(grlist))]

unlistedExons <- unlist(grlist, use.names = FALSE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(grlist)), names=NULL)
txIdForReorder <- togroup(PartitioningByWidth(grlist))
unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)] #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here

exon_rank <- sapply(elementNROWS(grlist), seq, from = 1)
exon_rank[which(unlist(unique(strand(grlist)))=="-")] <- lapply(exon_rank[which(unlist(unique(strand(grlist)))=="-")], rev) # * assumes positive for exon ranking
names(exon_rank) <- NULL
unlistedExons$exon_rank <- unlist(exon_rank)


unlistedExons <- unlistedExons[order(txIdForReorder, unlistedExons$exon_rank)] #'exonsByTx' is always sorted by exon rank, not by strand, make sure that this is the case here
unlistedExons$exon_endRank <- unlist(sapply(elementNROWS(grlist),seq,to=1), use.names=FALSE)
unlistedExons <- unlistedExons[order(txIdForReorder, start(unlistedExons))]
# mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]


mcols(unlistedExons) <- mcols(unlistedExons)[,c('exon_rank','exon_endRank')]

grlist <- relist(unlistedExons, partitioning)

# the grlist from gtf is ranked by exon_number instead of start position
# to be consistent with prepareAnnotations
# need to sort the grlist by start position


minEqClasses <- getMinimumEqClassByTx(grlist)
mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$TXNAME)),])
mcols(grlist)$eqClass <- minEqClasses$eqClass[match(names(grlist),minEqClasses$queryTxId)]
Expand Down
14 changes: 13 additions & 1 deletion R/bambu.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
isoreParameters <- isoreParameters.default

## check quantification parameters
emParameters.default <- list(bias = FALSE,
emParameters.default <- list(bias = TRUE,
maxiter = 10000,
conv = 10^(-4))

Expand Down Expand Up @@ -210,14 +210,19 @@ bambu <- function(reads = NULL, readClass.file = NULL, readClass.outputDir = NUL
#' @noRd
bambu.extendAnnotations <- function(readClassList, annotations, isoreParameters, verbose = FALSE){
combinedTxCandidates = NULL
start.ptm <- proc.time()
for(readClassIndex in seq_along(readClassList)){
readClass <- readClassList[[readClassIndex]]
if(is.character(readClass)){
readClass <- readRDS(file=readClass)
seqlevelsStyle(readClass) <- seqlevelsStyle(annotations)[1]
}
combinedTxCandidates <- isore.combineTranscriptCandidates(readClass, readClassSeRef = combinedTxCandidates, verbose = verbose)
rm(readClass)
gc(verbose=FALSE)
}
end.ptm <- proc.time()
if(verbose) message('combining transcripts in ', round((end.ptm-start.ptm)[3]/60,1), ' mins.')
annotations <- isore.extendAnnotations(se=combinedTxCandidates,
annotationGrangesList=annotations,
remove.subsetTx = isoreParameters[['remove.subsetTx']],
Expand All @@ -228,6 +233,8 @@ bambu.extendAnnotations <- function(readClassList, annotations, isoreParameters,
min.exonOverlap = isoreParameters[['min.exonOverlap']],
prefix = isoreParameters[['prefix']],
verbose = verbose)
rm(combinedTxCandidates)
gc(verbose=FALSE)
return(annotations)
}

Expand All @@ -241,16 +248,21 @@ bambu.quantify <- function(readClass, annotations, emParameters, min.exonDistanc
}

readClass <- isore.estimateDistanceToAnnotations(readClass, annotations, min.exonDistance = min.exonDistance, verbose = verbose)
gc(verbose=FALSE)
readClassDt <- getEmptyClassFromSE(readClass, annotations)
colNameRC <- colnames(readClass)
colDataRC <- colData(readClass)
rm(readClass)
gc(verbose=FALSE)
counts <- bambu.quantDT(readClassDt,emParameters = emParameters, ncore = ncore, verbose = verbose)
rm(readClassDt)
gc(verbose=FALSE)
if(length(setdiff(counts$tx_name,names(annotations)))>0){
stop("The provided annotation is incomplete")
}
counts <- counts[data.table(tx_name = names(annotations)), on = 'tx_name']
rm(annotations)
gc(verbose=FALSE)
counts[is.na(estimates),`:=`(estimates = 0, CPM = 0) ]

seOutput <- SummarizedExperiment::SummarizedExperiment(assays = SimpleList(counts = matrix(counts$estimates,ncol = 1, dimnames = list(NULL, colNameRC)),
Expand Down
61 changes: 39 additions & 22 deletions R/junctionCorrection.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,38 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,

if(is.null(genomeSequence)){
stop("Reference genome sequence is missing, please provide fasta file or BSgenome name, see available.genomes()")
}else if(class(genomeSequence) != 'FaFile'){
}

original_seqlevelstyle <- seqlevelsStyle(unlisted_junction_granges)[1]

if(class(genomeSequence) != 'FaFile'){
if(grepl('.fa',genomeSequence)){
genomeSequence <- Rsamtools::FaFile(genomeSequence)

if(seqlevelsStyle(genomeSequence)[1] != seqlevelsStyle(unlisted_junction_granges)[1]){
seqlevelsStyle(unlisted_junction_granges) <- seqlevelsStyle(genomeSequence)[1]
}

}else {
if (!suppressWarnings(require(BSgenome, quietly=TRUE)))
stop("Please install the BSgenome package")

genomeSequence <- BSgenome::getBSgenome(genomeSequence)
seqlevelsStyle(genomeSequence) <- seqlevelsStyle(unlisted_junction_granges)[1]
if(!all(seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence))) {
message("not all chromosomes present in reference genome sequence, ranges are dropped")
unlisted_junction_granges <- keepSeqlevels(unlisted_junction_granges,
value = seqlevels(unlisted_junction_granges)[seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence)],
pruning.mode = 'coarse')
}
}
}

if(class(genomeSequence) == 'FaFile'){
if(seqlevelsStyle(genomeSequence)[1] != seqlevelsStyle(unlisted_junction_granges)[1]){
seqlevelsStyle(unlisted_junction_granges) <- seqlevelsStyle(genomeSequence)[1]
}
}
if(!all(seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence))) {
message("not all chromosomes present in reference genome sequence, ranges are dropped")
unlisted_junction_granges <- keepSeqlevels(unlisted_junction_granges,
value = seqlevels(unlisted_junction_granges)[seqlevels(unlisted_junction_granges) %in% seqlevels(genomeSequence)],
pruning.mode = 'coarse')
}

##Todo: don't create junction names, instead work with indices/intergers (names are memory intensive)

Expand All @@ -39,20 +54,20 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,

# the code now includes a parallel implementation, which is only helpful when the BSgenome package is used

#junctionSeqStart<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2),2)) # shift: from IRanges
#junctionSeqEnd<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2)) # shift: from IRanges

bpParameters <- BiocParallel::bpparam()
bpParameters$workers <- ncore
junctionSeqStart <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2),2),
BSgenome::getSeq,
x = genomeSequence,
BPPARAM=bpParameters)
junctionSeqStart<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2),2)) # shift: from IRanges
junctionSeqEnd<-BSgenome::getSeq(genomeSequence,IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2)) # shift: from IRanges

junctionSeqEnd <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2),
BSgenome::getSeq,
x = genomeSequence,
BPPARAM=bpParameters)
# bpParameters <- BiocParallel::bpparam()
# bpParameters$workers <- ncore
# junctionSeqStart <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2),2),
# BSgenome::getSeq,
# x = genomeSequence,
# BPPARAM=bpParameters)
#
# junctionSeqEnd <- BiocParallel::bpvec(IRanges::shift(flank(uniqueJunctions,width=2,start=FALSE),-2),
# BSgenome::getSeq,
# x = genomeSequence,
# BPPARAM=bpParameters)


junctionMotif <- paste(junctionSeqStart,junctionSeqEnd,sep='-')
Expand All @@ -75,7 +90,9 @@ createJunctionTable <- function(unlisted_junction_granges, genomeSequence=NULL,
id=1:length(uniqueJunctions))

strand(uniqueJunctions)<-uniqueJunctions$spliceStrand

if(original_seqlevelstyle != seqlevelsStyle(unlisted_junction_granges)[1]){
seqlevelsStyle(uniqueJunctions) <- original_seqlevelstyle
}
return(uniqueJunctions)
}

Expand Down
2 changes: 1 addition & 1 deletion R/plotBambu.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' plotSEOuptut
#' @title plot.bambu
#' @param se An summarized experiment object obtained from \code\link{bambu} or \code{\link{transcriptToGene}}.
#' @param se An summarized experiment object obtained from \code{\link{bambu}} or \code{\link{transcriptToGene}}.
#' @param group.variable Variable for grouping in plot, has be to provided if choosing to plot PCA.
#' @param type plot type variable, a values of annotation for a single gene with heatmap for isoform expressions, pca, or heatmap, see \code{\link{details}}.
#' @param gene_id specifying the gene_id for plotting gene annotation, either gene_id or transcript_id has to be provided when type = "annotation".
Expand Down
17 changes: 2 additions & 15 deletions R/prepareBam.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#' @param bamFile bamFile
#' @inheritParams bambu
#' @noRd
prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1) {
prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore = 1) {

if(class(bamFile)=='BamFile') {
if(!is.null(yieldSize)) {
Expand All @@ -21,19 +21,6 @@ prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1
bamFile <- Rsamtools::BamFile(bamFile, yieldSize = yieldSize)
}

# parallel processing of single files by reading chromosomes separately
if(ncore>1){

chr <- scanBamHeader(bamFile)[[1]]
chrRanges <- GRanges(seqnames=names(chr), ranges=IRanges(start=1, end=chr))
readGrgList <- mclapply(1:length(chr),
helpFun,
chrRanges=chrRanges,
bamFile=bamFile,
mc.cores = ncore)
} else {


bf <- open(bamFile)
#readGrgList <- GenomicRanges::GRangesList()

Expand All @@ -51,7 +38,7 @@ prepareDataFromBam <- function(bamFile, yieldSize=NULL, verbose = FALSE, ncore=1
}

close(bf)
}

if(length(readGrgList)>1) {
readGrgList <- do.call(c, readGrgList)
} else {
Expand Down
5 changes: 4 additions & 1 deletion R/readWrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ writeBambuOutput <- function(se,path){
transcript_gtffn <- paste(path,"transcript_exon.gtf",sep="")
gtf <- writeToGTF(annotation=transcript_grList,file=transcript_gtffn)
transcript_counts <- as.data.frame(assays(se)$counts)
geneIDs <- data.frame(mcols(transcript_grList, use.names=FALSE)$TXNAME,mcols(transcript_grList, use.names=FALSE)$GENEID)
colnames(geneIDs) <- c("TXNAME","GENEID")
transcript_counts <- cbind(geneIDs,transcript_counts)
transcript_countsfn <- paste(path,"counts_transcript.txt",sep="")
write.table(transcript_counts, file= transcript_countsfn, sep="\t",quote=FALSE)
write.table(transcript_counts, file= transcript_countsfn, sep="\t",quote=FALSE,row.names= FALSE)
gene_se <- transcriptToGeneExpression(se)
gene_counts <- as.data.frame(assays(gene_se)$counts)
gene_countsfn <- paste(path,"counts_gene.txt",sep="")
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
2 changes: 1 addition & 1 deletion R/transcriptToGeneExpression.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ transcriptToGeneExpression<- function(se){
counts <- rowDataSe[,.(TXNAME,GENEID)][counts, on = "TXNAME"]

counts[, valueGene:=sum(value), by = list(variable, GENEID)]
counts[, valueGeneCPM:=valueGene/sum(value)*10^6, by = list(variable)]
counts[, valueGeneCPM:=valueGene/max(sum(value),1)*10^6, by = list(variable)]

## counts
counts_gene <- dcast(unique(counts[,.(GENEID, variable, valueGene)]), GENEID ~ variable, value.var = "valueGene")
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ writeBambuOutput(se, path = "./bambu/")

### Release History

**bambu version 0.2.0**

Release date: 18th June 2020

**bambu version 0.1.0**

Release date: 29th May 2020
Expand Down
6 changes: 6 additions & 0 deletions data-raw/DATASET.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ seCombinedGeneExpected <- transcriptToGeneExpression(seCombined)
seCombinedExtendedGeneExpected <- transcriptToGeneExpression(seCombinedExtended)


set.seed(1234)
seqlevelsStyle(gr) <- "UCSC"
seUCSCExpected = bambu(reads = test.bam, annotations = gr, genomeSequence = fa.file, extendAnnotations = FALSE)




usethis::use_data(data1,data2,data3,data4,data5,
Expand All @@ -73,6 +78,7 @@ usethis::use_data(data1,data2,data3,data4,data5,
seWithDistExpected,
seGeneExpected,seExtendedGeneExpected,
seCombinedGeneExpected,seCombinedExtendedGeneExpected,
seUCSCExpected,
internal = TRUE,overwrite = TRUE)


Expand Down
Loading

0 comments on commit 0d8d943

Please sign in to comment.