Add GATK/Picard header to interval file.
lima1 committed Sep 19, 2021
1 parent 10786e6 commit d0bf581
Showing 15 changed files with 205 additions and 211 deletions.
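
For reference, the header this commit prepends to the interval file follows the SAM/Picard sequence-dictionary convention (@HD and @SQ lines), mirroring the .writeGATKHeader helper touched further down in this diff. A minimal sketch of writing such a header from plain R, with made-up contig names and lengths; the tab-separated interval table itself would follow below the header:

out <- tempfile(fileext = ".txt")
# Illustrative contig lengths; real files carry the full sequence dictionary.
sl <- c(chr1 = 249250621, chr2 = 243199373)
con <- file(out, open = "w")
writeLines(paste("@HD", "VN:1.6", sep = "\t"), con)
writeLines(paste("@SQ", paste0("SN:", names(sl)), paste0("LN:", sl), sep = "\t"), con)
close(con)
readLines(out)
# "@HD\tVN:1.6" "@SQ\tSN:chr1\tLN:249250621" "@SQ\tSN:chr2\tLN:243199373"
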
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -2,8 +2,8 @@ Package: PureCN
Type: Package
Title: Copy number calling and SNV classification using
targeted short read sequencing
Version: 1.23.29
Date: 2021-09-13
Version: 1.23.30
Date: 2021-09-18
Authors@R: c(person("Markus", "Riester",
role = c("aut", "cre"),
email = "[email protected]",
1 change: 1 addition & 0 deletions NAMESPACE
@@ -32,6 +32,7 @@ export(processMultipleSamples)
export(readAllelicCountsFile)
export(readCoverageFile)
export(readCurationFile)
export(readIntervalFile)
export(readLogRatioFile)
export(readSegmentationFile)
export(runAbsoluteCN)
2 changes: 2 additions & 0 deletions NEWS
@@ -34,6 +34,8 @@ SIGNIFICANT USER-VISIBLE CHANGES
o Add segmentation parameters as attributes to segmentation data.frame
o Added min.betafit.rho and max.betafit.rho to calculateMappingBias*
o Made --normal_panel in PureCN.R defunct
o Added GATK/Picard header with sequence lengths to interval file,
added readIntervalFile function to parse it

BUGFIXES

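The readIntervalFile() function announced in the NEWS entry above parses the interval file, header included, back into a GRanges object. A short usage sketch against the bundled example intervals; the metadata columns and populated seqlengths shown in the comments are expectations based on this diff, not guaranteed output:

library(PureCN)
interval.file <- system.file("extdata", "example_intervals_tiny.txt",
    package = "PureCN")
intervals <- readIntervalFile(interval.file)
head(intervals)       # expected: GRanges with metadata such as gc_bias, mappability, Gene, on.target
seqlengths(intervals) # filled from the @SQ header lines when they are present
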
2 changes: 1 addition & 1 deletion R/PureCN-internal.R
@@ -998,7 +998,7 @@ na.rm = TRUE)
}

.gcGeneToCoverage <- function(interval.file, min.coverage, min.total.counts) {
gc.data <- readCoverageFile(interval.file)
gc.data <- readIntervalFile(interval.file)
gc.data$average.coverage <- min.coverage
gc.data$coverage <- min.coverage * width(gc.data)
gc.data$counts <- Inf
185 changes: 96 additions & 89 deletions R/preprocessIntervals.R

Large diffs are not rendered by default.

37 changes: 21 additions & 16 deletions R/readAllelicCountsFile.R
@@ -42,20 +42,17 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
invisible(outputCounts)
}

.readAllelicCountsFileGatk4 <- function(file, zero) {
.parseGATKHeader <- function(con) {
.extractField <- function(line, field) {
fields <- strsplit(line, "\t")[[1]]
key <- paste0("^", field, ":")
fields <- fields[grep(key, fields)]
gsub(key, "", fields[1])
}

if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.")
con <- file(file, open = "r")
sid <- NULL
sl <- list()
while ( TRUE ) {
line = readLines(con, n = 1)
line <- readLines(con, n = 1)
if ( length(line) == 0 || !grepl("^@", line)[1]) {
break
}
@@ -64,35 +61,43 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
sl[[.extractField(line, "SN")]] <- .extractField(line, "LN")
}
}
return(list(sid = sid, sl = sl, last_line = line))
}

.readAllelicCountsFileGatk4 <- function(file, zero) {
if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.")
con <- file(file, open = "r")
header <- .parseGATKHeader(con)
inputCounts <- read.delim(con, header = FALSE)
colnames(inputCounts) <- strsplit(line, "\t")[[1]]
colnames(inputCounts) <- strsplit(header$last_line, "\t")[[1]]
close(con)
gr <- GRanges(seqnames = inputCounts$CONTIG, IRanges(start = inputCounts$POSITION, end = inputCounts$POSITION))
vcf <- VCF(gr,
colData = DataFrame(Samples = 1, row.names = sid),
exptData = list(header = VCFHeader(samples = sid)))
colData = DataFrame(Samples = 1, row.names = header$sid),
exptData = list(header = VCFHeader(samples = header$sid)))
ref(vcf) <- DNAStringSet(inputCounts$REF_NUCLEOTIDE)
alt(vcf) <- DNAStringSetList(lapply(inputCounts$ALT_NUCLEOTIDE, DNAStringSet))
info(header(vcf)) <- DataFrame(
Number ="0",
Number = "0",
Type = "Flag",
Description = "dbSNP Membership",
row.names="DB")
row.names = "DB")

geno(header(vcf)) <- DataFrame(
Number =".",
Type = "Integer",
Description = "Allelic depths for the ref and alt alleles in the order listed",
row.names="AD")
row.names = "AD")

info(vcf)$DB <- TRUE
geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i) c( inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])), ncol = 1, dimnames = list(NULL, sid))
geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i)
c(inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])),
ncol = 1, dimnames = list(NULL, header$sid))

names(vcf) <- paste0(seqnames(vcf), ":", start(vcf))
if (length(sl)) {
sl <- sapply(sl, as.numeric)
seqlengths(vcf) <- sl[names(seqlengths(vcf))]
if (length(header$sl)) {
header$sl <- sapply(header$sl, as.numeric)
seqlengths(vcf) <- header$sl[names(seqlengths(vcf))]
}
.readAndCheckVcf(vcf)
}
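
The new .parseGATKHeader() helper above walks the leading @-prefixed lines of a GATK/Picard-style file and collects the sample id (SM tag of @RG) and the contig lengths (SN/LN tags of @SQ). Its field extraction can be exercised in isolation; the header line below is made up for illustration:

# Pull a KEY:VALUE token out of a tab-separated SAM-style header line
# (same logic as the .extractField helper in the diff above).
.extractField <- function(line, field) {
    fields <- strsplit(line, "\t")[[1]]
    key <- paste0("^", field, ":")
    fields <- fields[grep(key, fields)]
    gsub(key, "", fields[1])
}
line <- "@SQ\tSN:chr1\tLN:249250621"  # made-up header line
.extractField(line, "SN")  # "chr1"
.extractField(line, "LN")  # "249250621"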

53 changes: 15 additions & 38 deletions R/readCoverageFile.R
@@ -54,11 +54,11 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
inputCoverage$on_target <- as.logical(inputCoverage$on_target)

targetCoverage <- GRanges(inputCoverage$Target,
coverage=inputCoverage$total_coverage,
average.coverage=NA,
counts=inputCoverage$counts,
on.target=inputCoverage$on_target,
duplication.rate=inputCoverage$duplication_rate)
coverage = inputCoverage$total_coverage,
average.coverage = NA,
counts = inputCoverage$counts,
on.target = inputCoverage$on_target,
duplication.rate = inputCoverage$duplication_rate)
targetCoverage <- .addAverageCoverage(targetCoverage)
targetCoverage
}
@@ -78,10 +78,10 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
}

.addAverageCoverage <- function(x) {
x$average.coverage <- x$coverage/width(x)
x$average.coverage <- x$coverage / width(x)
x
}
.readCoverageCnn <- function(file, zero, format="cnn") {
.readCoverageCnn <- function(file, zero, format = "cnn") {
if (is.null(zero)) zero <- TRUE
inputCoverage <- fread(file, sep = "\t")
if (zero) inputCoverage$start <- inputCoverage$start + 1
@@ -91,11 +91,11 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
targetCoverage$on.target <- TRUE
targetCoverage$depth <- NULL
targetCoverage$Gene <- targetCoverage$gene
targetCoverage$on.target[which(targetCoverage$Gene=="Background")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene=="Antitarget")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene == "Background")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene == "Antitarget")] <- FALSE
targetCoverage$gene <- NULL
targetCoverage$duplication.rate <- NA
if (format=="cnr") {
if (format == "cnr") {
targetCoverage$log.ratio <- targetCoverage$log2
}
targetCoverage$log2 <- NULL
@@ -148,46 +148,23 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
tumor$gc_bias <- as.numeric(tumor$gc_bias)
if (is.null(tumor$Gene)) tumor$Gene <- "."

inputGC <- read.delim(interval.file, as.is = TRUE)
if (is.null(inputGC$gc_bias)) {
.stopUserError("No gc_bias column in interval.file.")
}
if (is.null(inputGC$Gene)) {
if (verbose) flog.info("No Gene column in interval.file. You won't get gene-level calls.")
inputGC$Gene <- "."
}
if (is.null(inputGC$on_target)) {
if (verbose) flog.info("No on_target column in interval.file. Recreate this file with IntervalFile.R.")
inputGC$on_target <- TRUE
}
if (is.null(inputGC$mappability)) {
if (verbose) flog.info("No mappability column in interval.file.")
inputGC$mappability <- 1
}
if (is.null(inputGC$reptiming)) {
if (verbose) flog.info("No reptiming column in interval.file.")
inputGC$reptiming <- NA
}

targetGC <- GRanges(inputGC[,1], ranges=NULL, strand=NULL, inputGC[,-1])
targetGC <- sort(sortSeqlevels(targetGC))

ov <- findOverlaps(tumor, targetGC)
targetGC <- readIntervalFile(interval.file)
ov <- findOverlaps(tumor, targetGC)
if (!identical(as.character(tumor), as.character(targetGC))) {
# if only a few intervals are missing, maybe because of some poor
# quality regions, we just ignore those, otherwise we stop because
# user probably used the wrong file for the assay
if (length(ov) < length(tumor)/2) {
if (length(ov) < length(tumor) / 2) {
.stopUserError("tumor.coverage.file and interval.file do not align.")
} else {
flog.warn("tumor.coverage.file and interval.file do not align.")
}
}

if (!is.null(tumor$on.target)) {
if (!identical(tumor[queryHits(ov)]$on.target, targetGC[subjectHits(ov)]$on_target)) {
if (!identical(tumor[queryHits(ov)]$on.target, targetGC[subjectHits(ov)]$on.target)) {
flog.warn("Intervals in coverage and interval.file have conflicting on/off-target annotation.")
tumor[queryHits(ov)]$on.target <- targetGC[subjectHits(ov)]$on_target
tumor[queryHits(ov)]$on.target <- targetGC[subjectHits(ov)]$on.target
}
}
tumor[queryHits(ov)]$mappability <- targetGC[subjectHits(ov)]$mappability
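
The check above tolerates a few missing intervals but refuses to continue when coverage and interval file clearly come from different assays. A self-contained sketch of that rule with made-up GRanges, using plain stop()/warning() in place of the package-internal .stopUserError()/flog.warn():

library(GenomicRanges)
# made-up example intervals
tumor    <- GRanges(c("chr1:1-100", "chr1:201-300", "chr1:401-500"))
targetGC <- GRanges(c("chr1:1-100", "chr1:201-300", "chr1:601-700"))
ov <- findOverlaps(tumor, targetGC)
if (!identical(as.character(tumor), as.character(targetGC))) {
    # fewer than half the intervals overlapping points to the wrong file
    if (length(ov) < length(tumor) / 2) {
        stop("tumor.coverage.file and interval.file do not align.")
    } else {
        warning("tumor.coverage.file and interval.file do not align.")
    }
}
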
22 changes: 16 additions & 6 deletions R/readLogRatioFile.R
@@ -41,10 +41,18 @@ readLogRatioFile <- function(file, format, zero = NULL) {
}

.readLogRatioFileGATK4 <- function(file, zero = FALSE) {
x <- read.delim(file, comment.char = "@", as.is = TRUE)
con <- file(file, open = "r")
header <- .parseGATKHeader(con)
x <- read.delim(con, header = FALSE, as.is = TRUE)
colnames(x) <- strsplit(header$last_line, "\t")[[1]]
gr <- GRanges(x[,1], IRanges(start = x[,2], end = x[,3]))
gr$log.ratio <- x[,4]
gr
gr <- sort(sortSeqlevels(gr))
if (length(header$sl)) {
header$sl <- sapply(header$sl, as.numeric)
seqlengths(gr) <- header$sl[names(seqlengths(gr))]
}
return(gr)
}

.writeLogRatioFileGATK4 <- function(x, id = 1, file) {
@@ -65,14 +73,16 @@ readLogRatioFile <- function(file, format, zero = NULL) {
invisible(output)
}
.writeGATKHeader <- function(vcf, id = 1, con, file_type) {
writeLines(paste("@HD", "VN:1.6", sep="\t"), con)
writeLines(paste("@HD", "VN:1.6", sep = "\t"), con)
if (any(is.na(seqlengths(vcf)))) {
flog.warn("Cannot find all contig lengths while exporting %s file.",
file_type)
} else {
sl <- seqlengths(vcf)
writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep="\t"), con)
writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep = "\t"), con)
}
sampleid <- .getSampleIdFromVcf(vcf, id)
writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep="\t"), con)
if (!is.null(id)) {
sampleid <- .getSampleIdFromVcf(vcf, id)
writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep = "\t"), con)
}
}
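
.readLogRatioFileGATK4() now strips the same kind of header before reading a tab-separated table of contig, start, end and log2 ratio columns. A sketch of the GRanges construction it performs, on a made-up table (column names are illustrative; the function takes them from the line that follows the header):

library(GenomicRanges)
# made-up table; real files are parsed from the connection after the header
x <- data.frame(CONTIG = c("chr1", "chr1"),
                START = c(1001, 2001),
                END = c(2000, 3000),
                LOG2_COPY_RATIO = c(0.12, -0.35))
gr <- GRanges(x[, 1], IRanges(start = x[, 2], end = x[, 3]))
gr$log.ratio <- x[, 4]
gr <- sort(sortSeqlevels(gr))
gr
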
24 changes: 13 additions & 11 deletions R/runAbsoluteCN.R
@@ -232,21 +232,22 @@
#' @examples
#'
#' normal.coverage.file <- system.file('extdata', 'example_normal_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#' tumor.coverage.file <- system.file('extdata', 'example_tumor_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#' vcf.file <- system.file('extdata', 'example.vcf.gz',
#' package='PureCN')
#' package = 'PureCN')
#' interval.file <- system.file('extdata', 'example_intervals_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#'
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example. This is not a good idea for real
#' # samples.
#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file,
#' tumor.coverage.file=tumor.coverage.file, genome='hg19', vcf.file=vcf.file,
#' sampleid='Sample1', interval.file=interval.file,
#' max.ploidy=4, test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#' ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file,
#' tumor.coverage.file = tumor.coverage.file, genome = 'hg19',
#' vcf.file = vcf.file, sampleid = 'Sample1',
#' interval.file = interval.file, max.ploidy = 4,
#' test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions = 1)
#'
#'
#' # If a high-quality segmentation was obtained with third-party tools:
@@ -258,9 +259,10 @@
#' # a minimal segmentation function which just returns the provided one:
#' funSeg <- function(seg, ...) return(seg)
#'
#' res <- runAbsoluteCN(seg.file=seg.file, fun.segmentation=funSeg, max.ploidy = 4,
#' test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions=1,
#' genome='hg19', interval.file=interval.file)
#' res <- runAbsoluteCN(seg.file = seg.file, fun.segmentation = funSeg,
#' max.ploidy = 4, test.purity = seq(0.3, 0.7, by = 0.05),
#' max.candidate.solutions = 1,
#' genome='hg19', interval.file = interval.file)
#'
#' @export runAbsoluteCN
#' @import DNAcopy
15 changes: 1 addition & 14 deletions inst/extdata/IntervalFile.R
@@ -194,19 +194,6 @@ knownOrg <- list(
canFam3 = "org.Cf.eg.db"
)

.writeGc <- function(interval.gr, output.file) {
tmp <- data.frame(
Target = as.character(interval.gr),
gc_bias = interval.gr$gc_bias,
mappability = interval.gr$mappability,
reptiming = interval.gr$reptiming,
Gene = interval.gr$Gene,
on_target = interval.gr$on.target
)
write.table(tmp, file = output.file, row.names = FALSE, quote = FALSE,
sep = "\t")
}

if (!is.null(opt$genome) ) {
if (is.null(knownGenome[[opt$genome]])) {
flog.warn("%s genome not known. %s Known genomes: %s", opt$genome,
@@ -221,7 +208,7 @@ if (!is.null(opt$genome) ) {
} else {
outGC <- suppressMessages(annotateTargets(outGC,
get(knownGenome[[opt$genome]]), get(knownOrg[[opt$genome]])))
.writeGc(outGC, outfile)
PureCN:::.writeIntervals(outGC, outfile)
}
} else {
flog.warn("Specify --genome to get gene symbol annotation.")
Binary file modified inst/extdata/ex2_mappability.bigWig
Binary file not shown.
30 changes: 15 additions & 15 deletions man/preprocessIntervals.Rd

Some generated files are not rendered by default.

