diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..cfcb4d4b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +################## BASE IMAGE ###################### +FROM rocker/r-ubuntu:18.04 + +# Install required libraries -- using prebuild binaries where available +RUN apt-get update && apt-get install -y \ +aptitude \ +libcurl4-openssl-dev \ +libxml2-dev \ +git \ +r-cran-devtools \ +r-cran-git2r \ +r-cran-xml \ +r-cran-rcurl \ +sudo + +# Install plr -- for now (?) from GH; also install visualTest +RUN installGithub.r Goekelab/bambu \ +&& rm -rf /tmp/downloaded_packages/ diff --git a/R/isore.R b/R/isore.R index 99a0c169..38d4aee9 100755 --- a/R/isore.R +++ b/R/isore.R @@ -437,24 +437,26 @@ isore.extendAnnotations <- function(se, fragmentStarts=rowData(seFilteredSpliced)$intronStarts, fragmentEnds=rowData(seFilteredSpliced)$intronEnds, strand=rowData(seFilteredSpliced)$strand) - + names(intronsByReadClass) <- 1:length(intronsByReadClass) seqlevels(intronsByReadClass) <- unique(c(seqlevels(intronsByReadClass), seqlevels(annotationGrangesList))) exonEndsShifted <-paste(rowData(seFilteredSpliced)$intronStarts, - rowData(seFilteredSpliced)$end + 1, + as.integer(rowData(seFilteredSpliced)$end + 1), sep=',') - exonStartsShifted <- paste(rowData(seFilteredSpliced)$start - 1, + exonStartsShifted <- paste(as.integer(rowData(seFilteredSpliced)$start - 1), rowData(seFilteredSpliced)$intronEnds, sep=',') - + exonsByReadClass <- makeGRangesListFromFeatureFragments(seqnames=rowData(seFilteredSpliced)$chr, fragmentStarts=exonStartsShifted, fragmentEnds=exonEndsShifted, strand=rowData(seFilteredSpliced)$strand) + exonsByReadClass <- narrow(exonsByReadClass, start = 2, end = -2) # correct junction to exon differences in coordinates + names(exonsByReadClass) <- 1:length(exonsByReadClass) - + # add exon start and exon end rank unlistData <- unlist(exonsByReadClass, use.names = FALSE) partitioning <- PartitioningByEnd(cumsum(elementNROWS(exonsByReadClass)), names=NULL) @@ -532,6 +534,7 @@ isore.extendAnnotations <- function(se, type='equal', select='all', ignore.strand=FALSE) + if(length(overlapsNewIntronsAnnotatedIntrons)>0){ maxGeneCountPerNewTx <- tbl_df(data.frame(txId=names(unlistedIntrons)[queryHits(overlapsNewIntronsAnnotatedIntrons)], geneId=mcols(unlistedIntronsAnnotations)$GENEID[subjectHits(overlapsNewIntronsAnnotatedIntrons)], stringsAsFactors=FALSE)) %>% @@ -566,7 +569,7 @@ isore.extendAnnotations <- function(se, classificationTable$newLastExon[distNewTxByQuery$queryHits[!distNewTxByQuery$endMatch]] <- 'newLastExon' classificationTable$newLastExon[classificationTable$newLastJunction != 'newLastJunction'] <- '' - +} mcols(seFilteredSpliced)$readClassType <- apply(classificationTable, 1, paste, collapse='') diff --git a/R/readWrite.R b/R/readWrite.R index 572d2863..ae5f66df 100644 --- a/R/readWrite.R +++ b/R/readWrite.R @@ -37,36 +37,46 @@ writeBambuOutput <- function(se,path){ #' @export writeToGTF <- function (annotation,file,geneIDs=NULL) { if (missing(annotation) | missing(file)){ - stop('Both GRangesList and the name of file are required.') + stop('Both GRangesList and the name of the output file are required.') }else if (class(annotation) != "CompressedGRangesList"){ stop('The inputted GRangesList is of the wrong class.') - }else { - df <- as.data.frame(annotation) - df$exon_rank <- paste('exon_number "',df$exon_rank,'";',sep= '') - if (missing(geneIDs)){ - if (!is.null(mcols(annotation, use.names=FALSE)$GENEID)){ - geneIDs <- data.frame(mcols(annotation, use.names=FALSE)$TXNAME,mcols(annotation, use.names=FALSE)$GENEID) - colnames(geneIDs) <- c("TXNAME","GENEID") - } - } - df <- merge(df,geneIDs,by.x="group_name",by.y="TXNAME",all=TRUE) - df$group_name <- paste('transcript_id "',df$group_name,'";',sep= '') - df$GENEID <- paste('gene_id "',df$GENEID,'";',sep= '') - gtf_exon <- data.frame(seqname=df$seqnames, source= "Bambu",feature= "exon", - start=df$start,end=df$end,score=".",strand=df$strand,frame=".", - attributes= paste(df$GENEID,df$group_name,df$exon_rank)) - df_end <- df[order(df$end, decreasing = TRUE),] - df_end <- data.frame(group_name=df_end$group_name,uend=df_end$end) - df_end <- df_end[!duplicated(df_end$group_name),] - df <- df[!duplicated(df$group_name),] - df <- merge(df,df_end,by="group_name",all=TRUE) - gtf_trns <- data.frame(seqname=df$seqnames, source= "Bambu",feature= "transcript", - start=df$start,end=df$uend,score=".",strand=df$strand,frame=".", - attributes= paste(df$GENEID,df$group_name)) + } + df <- as_tibble(annotation) + df$exon_rank <- paste('exon_number "',df$exon_rank,'";',sep= '') + if (missing(geneIDs)){ + if (!is.null(mcols(annotation, use.names=FALSE)$GENEID)){ + geneIDs <- as_tibble(mcols(annotation, use.names=FALSE)[, c('TXNAME','GENEID')]) + geneIDs$seqnames = unlist(unique(seqnames(annotation))) + geneIDs$strand = unlist(unique(strand(annotation))) } - gtf <- rbind(gtf_trns,gtf_exon) - gtf <- gtf[order(gtf$attributes),] - write.table(gtf, file= file, quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") + } + df <- left_join(df,geneIDs[,c('TXNAME','GENEID')],by=c("group_name"="TXNAME"),all=TRUE) + df$group_name <- paste('transcript_id "',df$group_name,'";',sep= '') + df$GENEID <- paste('gene_id "',df$GENEID,'";',sep= '') + dfExon <- mutate(df, source='Bambu', + feature='exon', + score='.', + frame='.', + attributes=paste(GENEID, group_name, exon_rank)) %>% + dplyr::select( seqnames, source, feature, start, end, score, strand, frame, attributes, group_name) + dfTx <- as.data.frame(range(ranges(annotation))) + dfTx <- left_join(dfTx,geneIDs,by=c("group_name"="TXNAME"),all=TRUE) + dfTx$group_name <- paste('transcript_id "',dfTx$group_name,'";',sep= '') + dfTx$GENEID <- paste('gene_id "',dfTx$GENEID,'";',sep= '') + + dfTx <- mutate(dfTx, source='Bambu', + feature='transcript', + score='.', + frame='.', + attributes=paste(GENEID, group_name)) %>% + dplyr::select( seqnames, source, feature, start, end, score, strand,frame, attributes, group_name) + gtf <- rbind(dfTx, dfExon) %>% + group_by(group_name) %>% + arrange(as.character(seqnames), start) %>% + ungroup() %>% + dplyr::select(seqnames, source, feature, start, end, score, strand, frame, attributes) + gtf <- mutate(gtf, strand=recode_factor(strand, `*`=".")) + write.table(gtf, file= file, quote=FALSE, row.names=FALSE, col.names=FALSE, sep = "\t") } @@ -84,16 +94,19 @@ readFromGTF <- function(file){ stop('A GTF file is required.') }else{ data=read.delim(file,header=FALSE, comment.char='#') + colnames(data) <- c("seqname","source","type","start","end","score","strand","frame","attribute") - data$strand[data$strand=='.'] <- '*' - data$GENEID = gsub('gene_id (.*); tra.*','\\1',data$attribute) + data <- data[data$type=='exon',] + data$strand[data$strand=='.'] <- '*' + data$GENEID = gsub('gene_id (.*?);.*','\\1',data$attribute) data$TXNAME=gsub('.*transcript_id (.*?);.*', '\\1',data$attribute) - #geneData=unique(data[,c('txid','gene_id','refMatch')])) - geneData=unique(data[,c('TXNAME', 'GENEID')]) grlist <- makeGRangesListFromDataFrame( - data[data$type=='exon',],split.field='TXNAME', names.field='TXNAME',keep.extra.columns = TRUE) + data[,c('seqname', 'start','end','strand','TXNAME')],split.field='TXNAME',keep.extra.columns = TRUE) + grlist <- grlist[IRanges::order(start(grlist))] + geneData=(unique(data[,c('TXNAME', 'GENEID')])) - mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$txid)),]) + mcols(grlist) <- DataFrame(geneData[(match(names(grlist), geneData$TXNAME)),]) + } return (grlist) }