Skip to content

Commit

Permalink
Merge pull request #128 from GoekeLab/bambu-devel
Browse files Browse the repository at this point in the history
Update read and write function and fixed a few other bugs

Former-commit-id: 24f7bee
  • Loading branch information
cying111 authored Jul 1, 2020
2 parents c72e637 + c905238 commit 5ddf1d4
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 39 deletions.
18 changes: 18 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
################## BASE IMAGE ######################
FROM rocker/r-ubuntu:18.04

# Install required libraries -- using prebuild binaries where available
RUN apt-get update && apt-get install -y \
aptitude \
libcurl4-openssl-dev \
libxml2-dev \
git \
r-cran-devtools \
r-cran-git2r \
r-cran-xml \
r-cran-rcurl \
sudo

# Install plr -- for now (?) from GH; also install visualTest
RUN installGithub.r Goekelab/bambu \
&& rm -rf /tmp/downloaded_packages/
15 changes: 9 additions & 6 deletions R/isore.R
Original file line number Diff line number Diff line change
Expand Up @@ -437,24 +437,26 @@ isore.extendAnnotations <- function(se,
fragmentStarts=rowData(seFilteredSpliced)$intronStarts,
fragmentEnds=rowData(seFilteredSpliced)$intronEnds,
strand=rowData(seFilteredSpliced)$strand)

names(intronsByReadClass) <- 1:length(intronsByReadClass)
seqlevels(intronsByReadClass) <- unique(c(seqlevels(intronsByReadClass), seqlevels(annotationGrangesList)))

exonEndsShifted <-paste(rowData(seFilteredSpliced)$intronStarts,
rowData(seFilteredSpliced)$end + 1,
as.integer(rowData(seFilteredSpliced)$end + 1),
sep=',')
exonStartsShifted <- paste(rowData(seFilteredSpliced)$start - 1,
exonStartsShifted <- paste(as.integer(rowData(seFilteredSpliced)$start - 1),
rowData(seFilteredSpliced)$intronEnds,
sep=',')

exonsByReadClass <- makeGRangesListFromFeatureFragments(seqnames=rowData(seFilteredSpliced)$chr,
fragmentStarts=exonStartsShifted,
fragmentEnds=exonEndsShifted,
strand=rowData(seFilteredSpliced)$strand)

exonsByReadClass <- narrow(exonsByReadClass, start = 2, end = -2) # correct junction to exon differences in coordinates

names(exonsByReadClass) <- 1:length(exonsByReadClass)

# add exon start and exon end rank
unlistData <- unlist(exonsByReadClass, use.names = FALSE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByReadClass)), names=NULL)
Expand Down Expand Up @@ -532,6 +534,7 @@ isore.extendAnnotations <- function(se,
type='equal',
select='all',
ignore.strand=FALSE)
if(length(overlapsNewIntronsAnnotatedIntrons)>0){
maxGeneCountPerNewTx <- tbl_df(data.frame(txId=names(unlistedIntrons)[queryHits(overlapsNewIntronsAnnotatedIntrons)],
geneId=mcols(unlistedIntronsAnnotations)$GENEID[subjectHits(overlapsNewIntronsAnnotatedIntrons)],
stringsAsFactors=FALSE)) %>%
Expand Down Expand Up @@ -566,7 +569,7 @@ isore.extendAnnotations <- function(se,

classificationTable$newLastExon[distNewTxByQuery$queryHits[!distNewTxByQuery$endMatch]] <- 'newLastExon'
classificationTable$newLastExon[classificationTable$newLastJunction != 'newLastJunction'] <- ''

}
mcols(seFilteredSpliced)$readClassType <- apply(classificationTable, 1, paste, collapse='')


Expand Down
79 changes: 46 additions & 33 deletions R/readWrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,36 +37,46 @@ writeBambuOutput <- function(se,path){
#' @export
writeToGTF <- function (annotation,file,geneIDs=NULL) {
if (missing(annotation) | missing(file)){
stop('Both GRangesList and the name of file are required.')
stop('Both GRangesList and the name of the output file are required.')
}else if (class(annotation) != "CompressedGRangesList"){
stop('The inputted GRangesList is of the wrong class.')
}else {
df <- as.data.frame(annotation)
df$exon_rank <- paste('exon_number "',df$exon_rank,'";',sep= '')
if (missing(geneIDs)){
if (!is.null(mcols(annotation, use.names=FALSE)$GENEID)){
geneIDs <- data.frame(mcols(annotation, use.names=FALSE)$TXNAME,mcols(annotation, use.names=FALSE)$GENEID)
colnames(geneIDs) <- c("TXNAME","GENEID")
}
}
df <- merge(df,geneIDs,by.x="group_name",by.y="TXNAME",all=TRUE)
df$group_name <- paste('transcript_id "',df$group_name,'";',sep= '')
df$GENEID <- paste('gene_id "',df$GENEID,'";',sep= '')
gtf_exon <- data.frame(seqname=df$seqnames, source= "Bambu",feature= "exon",
start=df$start,end=df$end,score=".",strand=df$strand,frame=".",
attributes= paste(df$GENEID,df$group_name,df$exon_rank))
df_end <- df[order(df$end, decreasing = TRUE),]
df_end <- data.frame(group_name=df_end$group_name,uend=df_end$end)
df_end <- df_end[!duplicated(df_end$group_name),]
df <- df[!duplicated(df$group_name),]
df <- merge(df,df_end,by="group_name",all=TRUE)
gtf_trns <- data.frame(seqname=df$seqnames, source= "Bambu",feature= "transcript",
start=df$start,end=df$uend,score=".",strand=df$strand,frame=".",
attributes= paste(df$GENEID,df$group_name))
}
df <- as_tibble(annotation)
df$exon_rank <- paste('exon_number "',df$exon_rank,'";',sep= '')
if (missing(geneIDs)){
if (!is.null(mcols(annotation, use.names=FALSE)$GENEID)){
geneIDs <- as_tibble(mcols(annotation, use.names=FALSE)[, c('TXNAME','GENEID')])
geneIDs$seqnames = unlist(unique(seqnames(annotation)))
geneIDs$strand = unlist(unique(strand(annotation)))
}
gtf <- rbind(gtf_trns,gtf_exon)
gtf <- gtf[order(gtf$attributes),]
write.table(gtf, file= file, quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
}
df <- left_join(df,geneIDs[,c('TXNAME','GENEID')],by=c("group_name"="TXNAME"),all=TRUE)
df$group_name <- paste('transcript_id "',df$group_name,'";',sep= '')
df$GENEID <- paste('gene_id "',df$GENEID,'";',sep= '')
dfExon <- mutate(df, source='Bambu',
feature='exon',
score='.',
frame='.',
attributes=paste(GENEID, group_name, exon_rank)) %>%
dplyr::select( seqnames, source, feature, start, end, score, strand, frame, attributes, group_name)
dfTx <- as.data.frame(range(ranges(annotation)))
dfTx <- left_join(dfTx,geneIDs,by=c("group_name"="TXNAME"),all=TRUE)
dfTx$group_name <- paste('transcript_id "',dfTx$group_name,'";',sep= '')
dfTx$GENEID <- paste('gene_id "',dfTx$GENEID,'";',sep= '')

dfTx <- mutate(dfTx, source='Bambu',
feature='transcript',
score='.',
frame='.',
attributes=paste(GENEID, group_name)) %>%
dplyr::select( seqnames, source, feature, start, end, score, strand,frame, attributes, group_name)
gtf <- rbind(dfTx, dfExon) %>%
group_by(group_name) %>%
arrange(as.character(seqnames), start) %>%
ungroup() %>%
dplyr::select(seqnames, source, feature, start, end, score, strand, frame, attributes)
gtf <- mutate(gtf, strand=recode_factor(strand, `*`="."))
write.table(gtf, file= file, quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t")
}


Expand All @@ -84,16 +94,19 @@ readFromGTF <- function(file){
stop('A GTF file is required.')
}else{
data=read.delim(file,header=FALSE, comment.char='#')

colnames(data) <- c("seqname","source","type","start","end","score","strand","frame","attribute")
data$strand[data$strand=='.'] <- '*'
data$GENEID = gsub('gene_id (.*); tra.*','\\1',data$attribute)
data <- data[data$type=='exon',]
data$strand[data$strand=='.'] <- '*'
data$GENEID = gsub('gene_id (.*?);.*','\\1',data$attribute)
data$TXNAME=gsub('.*transcript_id (.*?);.*', '\\1',data$attribute)
#geneData=unique(data[,c('txid','gene_id','refMatch')]))
geneData=unique(data[,c('TXNAME', 'GENEID')])
grlist <- makeGRangesListFromDataFrame(
data[data$type=='exon',],split.field='TXNAME', names.field='TXNAME',keep.extra.columns = TRUE)
data[,c('seqname', 'start','end','strand','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE)
grlist <- grlist[IRanges::order(start(grlist))]

geneData=(unique(data[,c('TXNAME', 'GENEID')]))
mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$txid)),])
mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$TXNAME)),])

}
return (grlist)
}
Expand Down

0 comments on commit 5ddf1d4

Please sign in to comment.