Add GATK/Picard header to interval file.
lima1 committed Sep 19, 2021
1 parent 10786e6 commit d0bf581
Showing 15 changed files with 205 additions and 211 deletions.
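
For reference, the header this commit prepends to the interval file follows the SAM/Picard sequence-dictionary convention (@HD and @SQ lines), mirroring the .writeGATKHeader helper touched further down in this diff. A minimal sketch of writing such a header from plain R, with made-up contig names and lengths; the tab-separated interval table itself would follow below the header:

out <- tempfile(fileext = ".txt")
# Illustrative contig lengths; real files carry the full sequence dictionary.
sl <- c(chr1 = 249250621, chr2 = 243199373)
con <- file(out, open = "w")
writeLines(paste("@HD", "VN:1.6", sep = "\t"), con)
writeLines(paste("@SQ", paste0("SN:", names(sl)), paste0("LN:", sl), sep = "\t"), con)
close(con)
readLines(out)
# "@HD\tVN:1.6" "@SQ\tSN:chr1\tLN:249250621" "@SQ\tSN:chr2\tLN:243199373"
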
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -2,8 +2,8 @@ Package: PureCN
Type: Package
Title: Copy number calling and SNV classification using
targeted short read sequencing
Version: 1.23.29
Date: 2021-09-13
Version: 1.23.30
Date: 2021-09-18
Authors@R: c(person("Markus", "Riester",
role = c("aut", "cre"),
email = "[email protected]",
1 change: 1 addition & 0 deletions NAMESPACE
@@ -32,6 +32,7 @@ export(processMultipleSamples)
export(readAllelicCountsFile)
export(readCoverageFile)
export(readCurationFile)
export(readIntervalFile)
export(readLogRatioFile)
export(readSegmentationFile)
export(runAbsoluteCN)
2 changes: 2 additions & 0 deletions NEWS
@@ -34,6 +34,8 @@ SIGNIFICANT USER-VISIBLE CHANGES
o Add segmentation parameters as attributes to segmentation data.frame
o Added min.betafit.rho and max.betafit.rho to calculateMappingBias*
o Made --normal_panel in PureCN.R defunct
o Added GATK/Picard header with sequence lengths to interval file,
added readIntervalFile function to parse it

BUGFIXES

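The readIntervalFile() function announced in the NEWS entry above parses the interval file, header included, back into a GRanges object. A short usage sketch against the bundled example intervals; the metadata columns and populated seqlengths shown in the comments are expectations based on this diff, not guaranteed output:

library(PureCN)
interval.file <- system.file("extdata", "example_intervals_tiny.txt",
    package = "PureCN")
intervals <- readIntervalFile(interval.file)
head(intervals)       # expected: GRanges with metadata such as gc_bias, mappability, Gene, on.target
seqlengths(intervals) # filled from the @SQ header lines when they are present
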
2 changes: 1 addition & 1 deletion R/PureCN-internal.R
@@ -998,7 +998,7 @@ na.rm = TRUE)
}

.gcGeneToCoverage <- function(interval.file, min.coverage, min.total.counts) {
gc.data <- readCoverageFile(interval.file)
gc.data <- readIntervalFile(interval.file)
gc.data$average.coverage <- min.coverage
gc.data$coverage <- min.coverage * width(gc.data)
gc.data$counts <- Inf
185 changes: 96 additions & 89 deletions R/preprocessIntervals.R

Large diffs are not rendered by default.

37 changes: 21 additions & 16 deletions R/readAllelicCountsFile.R
@@ -42,20 +42,17 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
invisible(outputCounts)
}

.readAllelicCountsFileGatk4 <- function(file, zero) {
.parseGATKHeader <- function(con) {
.extractField <- function(line, field) {
fields <- strsplit(line, "\t")[[1]]
key <- paste0("^", field, ":")
fields <- fields[grep(key, fields)]
gsub(key, "", fields[1])
}

if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.")
con <- file(file, open = "r")
sid <- NULL
sl <- list()
while ( TRUE ) {
line = readLines(con, n = 1)
line <- readLines(con, n = 1)
if ( length(line) == 0 || !grepl("^@", line)[1]) {
break
}
@@ -64,35 +61,43 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
sl[[.extractField(line, "SN")]] <- .extractField(line, "LN")
}
}
return(list(sid = sid, sl = sl, last_line = line))
}

.readAllelicCountsFileGatk4 <- function(file, zero) {
if (!is.null(zero)) flog.warn("zero ignored for GATK4 files.")
con <- file(file, open = "r")
header <- .parseGATKHeader(con)
inputCounts <- read.delim(con, header = FALSE)
colnames(inputCounts) <- strsplit(line, "\t")[[1]]
colnames(inputCounts) <- strsplit(header$last_line, "\t")[[1]]
close(con)
gr <- GRanges(seqnames = inputCounts$CONTIG, IRanges(start = inputCounts$POSITION, end = inputCounts$POSITION))
vcf <- VCF(gr,
colData = DataFrame(Samples = 1, row.names = sid),
exptData = list(header = VCFHeader(samples = sid)))
colData = DataFrame(Samples = 1, row.names = header$sid),
exptData = list(header = VCFHeader(samples = header$sid)))
ref(vcf) <- DNAStringSet(inputCounts$REF_NUCLEOTIDE)
alt(vcf) <- DNAStringSetList(lapply(inputCounts$ALT_NUCLEOTIDE, DNAStringSet))
info(header(vcf)) <- DataFrame(
Number ="0",
Number = "0",
Type = "Flag",
Description = "dbSNP Membership",
row.names="DB")
row.names = "DB")

geno(header(vcf)) <- DataFrame(
Number =".",
Type = "Integer",
Description = "Allelic depths for the ref and alt alleles in the order listed",
row.names="AD")
row.names = "AD")

info(vcf)$DB <- TRUE
geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i) c( inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])), ncol = 1, dimnames = list(NULL, sid))
geno(vcf)$AD <- matrix(lapply(seq(nrow(inputCounts)), function(i)
c(inputCounts$REF_COUNT[i], inputCounts$ALT_COUNT[i])),
ncol = 1, dimnames = list(NULL, header$sid))

names(vcf) <- paste0(seqnames(vcf), ":", start(vcf))
if (length(sl)) {
sl <- sapply(sl, as.numeric)
seqlengths(vcf) <- sl[names(seqlengths(vcf))]
if (length(header$sl)) {
header$sl <- sapply(header$sl, as.numeric)
seqlengths(vcf) <- header$sl[names(seqlengths(vcf))]
}
.readAndCheckVcf(vcf)
}
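
The new .parseGATKHeader() helper above walks the leading @-prefixed lines of a GATK/Picard-style file and collects the sample id (SM tag of @RG) and the contig lengths (SN/LN tags of @SQ). Its field extraction can be exercised in isolation; the header line below is made up for illustration:

# Pull a KEY:VALUE token out of a tab-separated SAM-style header line
# (same logic as the .extractField helper in the diff above).
.extractField <- function(line, field) {
    fields <- strsplit(line, "\t")[[1]]
    key <- paste0("^", field, ":")
    fields <- fields[grep(key, fields)]
    gsub(key, "", fields[1])
}
line <- "@SQ\tSN:chr1\tLN:249250621"  # made-up header line
.extractField(line, "SN")  # "chr1"
.extractField(line, "LN")  # "249250621"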

53 changes: 15 additions & 38 deletions R/readCoverageFile.R
@@ -54,11 +54,11 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
inputCoverage$on_target <- as.logical(inputCoverage$on_target)

targetCoverage <- GRanges(inputCoverage$Target,
coverage=inputCoverage$total_coverage,
average.coverage=NA,
counts=inputCoverage$counts,
on.target=inputCoverage$on_target,
duplication.rate=inputCoverage$duplication_rate)
coverage = inputCoverage$total_coverage,
average.coverage = NA,
counts = inputCoverage$counts,
on.target = inputCoverage$on_target,
duplication.rate = inputCoverage$duplication_rate)
targetCoverage <- .addAverageCoverage(targetCoverage)
targetCoverage
}
@@ -78,10 +78,10 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
}

.addAverageCoverage <- function(x) {
x$average.coverage <- x$coverage/width(x)
x$average.coverage <- x$coverage / width(x)
x
}
.readCoverageCnn <- function(file, zero, format="cnn") {
.readCoverageCnn <- function(file, zero, format = "cnn") {
if (is.null(zero)) zero <- TRUE
inputCoverage <- fread(file, sep = "\t")
if (zero) inputCoverage$start <- inputCoverage$start + 1
@@ -91,11 +91,11 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
targetCoverage$on.target <- TRUE
targetCoverage$depth <- NULL
targetCoverage$Gene <- targetCoverage$gene
targetCoverage$on.target[which(targetCoverage$Gene=="Background")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene=="Antitarget")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene == "Background")] <- FALSE
targetCoverage$on.target[which(targetCoverage$Gene == "Antitarget")] <- FALSE
targetCoverage$gene <- NULL
targetCoverage$duplication.rate <- NA
if (format=="cnr") {
if (format == "cnr") {
targetCoverage$log.ratio <- targetCoverage$log2
}
targetCoverage$log2 <- NULL
@@ -148,46 +148,23 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
tumor$gc_bias <- as.numeric(tumor$gc_bias)
if (is.null(tumor$Gene)) tumor$Gene <- "."

inputGC <- read.delim(interval.file, as.is = TRUE)
if (is.null(inputGC$gc_bias)) {
.stopUserError("No gc_bias column in interval.file.")
}
if (is.null(inputGC$Gene)) {
if (verbose) flog.info("No Gene column in interval.file. You won't get gene-level calls.")
inputGC$Gene <- "."
}
if (is.null(inputGC$on_target)) {
if (verbose) flog.info("No on_target column in interval.file. Recreate this file with IntervalFile.R.")
inputGC$on_target <- TRUE
}
if (is.null(inputGC$mappability)) {
if (verbose) flog.info("No mappability column in interval.file.")
inputGC$mappability <- 1
}
if (is.null(inputGC$reptiming)) {
if (verbose) flog.info("No reptiming column in interval.file.")
inputGC$reptiming <- NA
}

targetGC <- GRanges(inputGC[,1], ranges=NULL, strand=NULL, inputGC[,-1])
targetGC <- sort(sortSeqlevels(targetGC))

ov <- findOverlaps(tumor, targetGC)
targetGC <- readIntervalFile(interval.file)
ov <- findOverlaps(tumor, targetGC)
if (!identical(as.character(tumor), as.character(targetGC))) {
# if only a few intervals are missing, maybe because of some poor
# quality regions, we just ignore those, otherwise we stop because
# user probably used the wrong file for the assay
if (length(ov) < length(tumor)/2) {
if (length(ov) < length(tumor) / 2) {
.stopUserError("tumor.coverage.file and interval.file do not align.")
} else {
flog.warn("tumor.coverage.file and interval.file do not align.")
}
}

if (!is.null(tumor$on.target)) {
if (!identical(tumor[queryHits(ov)]$on.target, targetGC[subjectHits(ov)]$on_target)) {
if (!identical(tumor[queryHits(ov)]$on.target, targetGC[subjectHits(ov)]$on.target)) {
flog.warn("Intervals in coverage and interval.file have conflicting on/off-target annotation.")
tumor[queryHits(ov)]$on.target <- targetGC[subjectHits(ov)]$on_target
tumor[queryHits(ov)]$on.target <- targetGC[subjectHits(ov)]$on.target
}
}
tumor[queryHits(ov)]$mappability <- targetGC[subjectHits(ov)]$mappability
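
The check above tolerates a few missing intervals but refuses to continue when coverage and interval file clearly come from different assays. A self-contained sketch of that rule with made-up GRanges, using plain stop()/warning() in place of the package-internal .stopUserError()/flog.warn():

library(GenomicRanges)
# made-up example intervals
tumor    <- GRanges(c("chr1:1-100", "chr1:201-300", "chr1:401-500"))
targetGC <- GRanges(c("chr1:1-100", "chr1:201-300", "chr1:601-700"))
ov <- findOverlaps(tumor, targetGC)
if (!identical(as.character(tumor), as.character(targetGC))) {
    # fewer than half the intervals overlapping points to the wrong file
    if (length(ov) < length(tumor) / 2) {
        stop("tumor.coverage.file and interval.file do not align.")
    } else {
        warning("tumor.coverage.file and interval.file do not align.")
    }
}
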
22 changes: 16 additions & 6 deletions R/readLogRatioFile.R
@@ -41,10 +41,18 @@ readLogRatioFile <- function(file, format, zero = NULL) {
}

.readLogRatioFileGATK4 <- function(file, zero = FALSE) {
x <- read.delim(file, comment.char = "@", as.is = TRUE)
con <- file(file, open = "r")
header <- .parseGATKHeader(con)
x <- read.delim(con, header = FALSE, as.is = TRUE)
colnames(x) <- strsplit(header$last_line, "\t")[[1]]
gr <- GRanges(x[,1], IRanges(start = x[,2], end = x[,3]))
gr$log.ratio <- x[,4]
gr
gr <- sort(sortSeqlevels(gr))
if (length(header$sl)) {
header$sl <- sapply(header$sl, as.numeric)
seqlengths(gr) <- header$sl[names(seqlengths(gr))]
}
return(gr)
}

.writeLogRatioFileGATK4 <- function(x, id = 1, file) {
@@ -65,14 +73,16 @@ readLogRatioFile <- function(file, format, zero = NULL) {
invisible(output)
}
.writeGATKHeader <- function(vcf, id = 1, con, file_type) {
writeLines(paste("@HD", "VN:1.6", sep="\t"), con)
writeLines(paste("@HD", "VN:1.6", sep = "\t"), con)
if (any(is.na(seqlengths(vcf)))) {
flog.warn("Cannot find all contig lengths while exporting %s file.",
file_type)
} else {
sl <- seqlengths(vcf)
writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep="\t"), con)
writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep = "\t"), con)
}
sampleid <- .getSampleIdFromVcf(vcf, id)
writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep="\t"), con)
if (!is.null(id)) {
sampleid <- .getSampleIdFromVcf(vcf, id)
writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep = "\t"), con)
}
}
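
.readLogRatioFileGATK4() now strips the same kind of header before reading a tab-separated table of contig, start, end and log2 ratio columns. A sketch of the GRanges construction it performs, on a made-up table (column names are illustrative; the function takes them from the line that follows the header):

library(GenomicRanges)
# made-up table; real files are parsed from the connection after the header
x <- data.frame(CONTIG = c("chr1", "chr1"),
                START = c(1001, 2001),
                END = c(2000, 3000),
                LOG2_COPY_RATIO = c(0.12, -0.35))
gr <- GRanges(x[, 1], IRanges(start = x[, 2], end = x[, 3]))
gr$log.ratio <- x[, 4]
gr <- sort(sortSeqlevels(gr))
gr
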
24 changes: 13 additions & 11 deletions R/runAbsoluteCN.R
@@ -232,21 +232,22 @@
#' @examples
#'
#' normal.coverage.file <- system.file('extdata', 'example_normal_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#' tumor.coverage.file <- system.file('extdata', 'example_tumor_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#' vcf.file <- system.file('extdata', 'example.vcf.gz',
#' package='PureCN')
#' package = 'PureCN')
#' interval.file <- system.file('extdata', 'example_intervals_tiny.txt',
#' package='PureCN')
#' package = 'PureCN')
#'
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example. This is not a good idea for real
#' # samples.
#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file,
#' tumor.coverage.file=tumor.coverage.file, genome='hg19', vcf.file=vcf.file,
#' sampleid='Sample1', interval.file=interval.file,
#' max.ploidy=4, test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#' ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file,
#' tumor.coverage.file = tumor.coverage.file, genome = 'hg19',
#' vcf.file = vcf.file, sampleid = 'Sample1',
#' interval.file = interval.file, max.ploidy = 4,
#' test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions = 1)
#'
#'
#' # If a high-quality segmentation was obtained with third-party tools:
@@ -258,9 +259,10 @@
#' # a minimal segmentation function which just returns the provided one:
#' funSeg <- function(seg, ...) return(seg)
#'
#' res <- runAbsoluteCN(seg.file=seg.file, fun.segmentation=funSeg, max.ploidy = 4,
#' test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions=1,
#' genome='hg19', interval.file=interval.file)
#' res <- runAbsoluteCN(seg.file = seg.file, fun.segmentation = funSeg,
#' max.ploidy = 4, test.purity = seq(0.3, 0.7, by = 0.05),
#' max.candidate.solutions = 1,
#' genome='hg19', interval.file = interval.file)
#'
#' @export runAbsoluteCN
#' @import DNAcopy
15 changes: 1 addition & 14 deletions inst/extdata/IntervalFile.R
@@ -194,19 +194,6 @@ knownOrg <- list(
canFam3 = "org.Cf.eg.db"
)

.writeGc <- function(interval.gr, output.file) {
tmp <- data.frame(
Target = as.character(interval.gr),
gc_bias = interval.gr$gc_bias,
mappability = interval.gr$mappability,
reptiming = interval.gr$reptiming,
Gene = interval.gr$Gene,
on_target = interval.gr$on.target
)
write.table(tmp, file = output.file, row.names = FALSE, quote = FALSE,
sep = "\t")
}

if (!is.null(opt$genome) ) {
if (is.null(knownGenome[[opt$genome]])) {
flog.warn("%s genome not known. %s Known genomes: %s", opt$genome,
@@ -221,7 +208,7 @@ if (!is.null(opt$genome) ) {
} else {
outGC <- suppressMessages(annotateTargets(outGC,
get(knownGenome[[opt$genome]]), get(knownOrg[[opt$genome]])))
.writeGc(outGC, outfile)
PureCN:::.writeIntervals(outGC, outfile)
}
} else {
flog.warn("Specify --genome to get gene symbol annotation.")
Binary file modified inst/extdata/ex2_mappability.bigWig
Binary file not shown.
30 changes: 15 additions & 15 deletions man/preprocessIntervals.Rd

Some generated files are not rendered by default.

