Skip to content

Commit

Permalink
Make Cosmic.CNT configurable, make it possible to add Cosmic VCF in P…
Browse files Browse the repository at this point in the history
  • Loading branch information
lima1 committed Sep 12, 2021
1 parent a3956f3 commit 586a420
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 30 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: PureCN
Type: Package
Title: Copy number calling and SNV classification using
targeted short read sequencing
Version: 1.23.26
Version: 1.23.27
Date: 2021-09-11
Authors@R: c(person("Markus", "Riester",
role = c("aut", "cre"),
Expand Down
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ NEW FEATURES
o Improved mapping bias estimates: variants with insufficient information
for position-specific fits (default 3-6 heterozygous variants)
are clustered and assigned to the most similar fit
o Make Cosmic.CNT INFO field name customizable

SIGNIFICANT USER-VISIBLE CHANGES

Expand Down
20 changes: 10 additions & 10 deletions R/filterVcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -545,8 +545,8 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
vcf
}

.addCosmicCNT <- function(vcf, cosmic.vcf.file) {
if (!is.null(info(vcf)$Cosmic.CNT)) {
.addCosmicCNT <- function(vcf, cosmic.vcf.file, Cosmic.CNT.info.field) {
if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
flog.info("VCF already COSMIC annotated. Skipping.")
return(vcf)
}
Expand All @@ -561,9 +561,9 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
# look-up the variants in COSMIC
cosmic.vcf <- readVcf(cosmic.vcf.file, genome = genome(vcfRenamedSL)[1],
ScanVcfParam(which = rowRanges(vcfRenamedSL),
info="CNT",
fixed="ALT",
geno=NA
info = "CNT",
fixed = "ALT",
geno = NA
)
)
ov <- findOverlaps(vcfRenamedSL, cosmic.vcf, type = "equal")
Expand All @@ -580,12 +580,12 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
}

newInfo <- DataFrame(
Number=1, Type="Integer",
Description="How many samples in Cosmic have this mutation",
row.names="Cosmic.CNT")
Number = 1, Type = "Integer",
Description = "How many samples in Cosmic have this mutation",
row.names = Cosmic.CNT.info.field)
info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
info(vcf)$Cosmic.CNT <- NA
info(vcf)$Cosmic.CNT[queryHits(ov)] <- info(cosmic.vcf[subjectHits(ov)])$CNT
info(vcf)[[Cosmic.CNT.info.field]] <- NA
info(vcf)[[Cosmic.CNT.info.field]][queryHits(ov)] <- info(cosmic.vcf[subjectHits(ov)])$CNT
vcf
}

Expand Down
13 changes: 9 additions & 4 deletions R/runAbsoluteCN.R
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@
#' @param POPAF.info.field As alternative to a flag, use an info field that
#' contains population allele frequencies. The \code{DB} info flag has priority
#' over this field when both exist.
#' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database
#' @param min.pop.af Minimum population allele frequency in
#' \code{POPAF.info.field} to set a high germline prior probability.
#' @param model Use either a beta or a beta-binomial distribution for fitting
Expand Down Expand Up @@ -298,8 +299,11 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
max.segments = 300, min.gof = 0.8, min.variants = 20,
plot.cnv = TRUE,
vcf.field.prefix = "",
cosmic.vcf.file = NULL, DB.info.flag = "DB",
POPAF.info.field = "POP_AF", min.pop.af = 0.001,
cosmic.vcf.file = NULL,
DB.info.flag = "DB",
POPAF.info.field = "POP_AF",
Cosmic.CNT.info.field = "Cosmic.CNT",
min.pop.af = 0.001,
model = c("beta", "betabin"),
post.optimize = FALSE, speedup.heuristics = 2, BPPARAM = NULL,
log.file = NULL, verbose = TRUE) {
Expand Down Expand Up @@ -534,11 +538,12 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
vcf <- vcf.filtering$vcf

if (!is.null(cosmic.vcf.file)) {
vcf <- .addCosmicCNT(vcf, cosmic.vcf.file)
vcf <- .addCosmicCNT(vcf, cosmic.vcf.file, Cosmic.CNT.info.field)
}

args.setPriorVcf <- c(list(vcf = vcf, tumor.id.in.vcf = tumor.id.in.vcf,
DB.info.flag = DB.info.flag), args.setPriorVcf)
DB.info.flag = DB.info.flag,
Cosmic.CNT.info.field = Cosmic.CNT.info.field), args.setPriorVcf)
vcf <- do.call(fun.setPriorVcf,
.checkArgs(args.setPriorVcf, "setPriorVcf"))

Expand Down
9 changes: 5 additions & 4 deletions R/setPriorVcf.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common
#' germline databases. Defaults to \code{DB} that may contain somatic variants
#' if it is from an unfiltered dbSNP VCF.
#' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database
#' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the
#' prior probability of somatic status for each variant in the
#' \code{CollapsedVCF} added to the \code{INFO} field \code{PR}.
Expand All @@ -39,7 +40,7 @@
setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001,
0.995, 0.5),
tumor.id.in.vcf = NULL, min.cosmic.cnt = 6,
DB.info.flag = "DB") {
DB.info.flag = "DB", Cosmic.CNT.info.field = "Cosmic.CNT") {
if (is.null(tumor.id.in.vcf)) {
tumor.id.in.vcf <- .getTumorIdInVcf(vcf)
}
Expand All @@ -55,14 +56,14 @@ setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001,
tmp <- prior.somatic
prior.somatic <- ifelse(info(vcf)[[DB.info.flag]],
prior.somatic[2], prior.somatic[1])
if (!is.null(info(vcf)$Cosmic.CNT)) {
if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
flog.info("Found COSMIC annotation in VCF. Requiring %i hits.",
min.cosmic.cnt)
flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and dbSNP.",
tmp[5], tmp[6])

prior.somatic[which(info(vcf)$Cosmic.CNT >= min.cosmic.cnt)] <- tmp[5]
prior.somatic[which(info(vcf)$Cosmic.CNT >= min.cosmic.cnt &
prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5]
prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt &
info(vcf)[[DB.info.flag]])] <- tmp[6]
} else {
flog.info("Setting somatic prior probabilities for dbSNP hits to %f or to %f otherwise.",
Expand Down
12 changes: 10 additions & 2 deletions inst/extdata/PureCN.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ option_list <- list(
make_option(c("--popaf-info-field"), action = "store", type = "character",
default = formals(PureCN::runAbsoluteCN)$POPAF.info.field,
help = "VCF Filter: VCF INFO field providing population allele frequency [default %default]"),
make_option(c("--cosmic-cnt-info-field"), action = "store", type = "character",
default = formals(PureCN::runAbsoluteCN)$Cosmic.CNT.info.field,
help = "VCF Filter: VCF INFO field providing counts in the Cosmic database [default %default]"),
make_option(c("--cosmic-vcf-file"), action = "store", type = "character", default = NULL,
help = "VCF Filter: Adds a Cosmic.CNT INFO annotation using a Cosmic VCF. Added for convenience, we recommend adding annotations upstream [default %default]"),
make_option(c("--min-cosmic-cnt"), action = "store", type = "integer",
default = formals(PureCN::setPriorVcf)$min.cosmic.cnt,
help = "VCF Filter: Min number of COSMIC hits [default %default]"),
Expand Down Expand Up @@ -96,7 +101,7 @@ option_list <- list(
help = "Post-optimization [default %default]"),
make_option(c("--bootstrap-n"), action = "store", type = "integer", default = 0,
help = "Number of bootstrap replicates [default %default]"),
make_option(c("--speedupheuristics"), action = "store", type = "integer",
make_option(c("--speedup-heuristics"), action = "store", type = "integer",
default = max(eval(formals(PureCN::runAbsoluteCN)$speedup.heuristics)),
help = "Tries to avoid spending computation time on unlikely local optima [default %default]"),
make_option(c("--model-homozygous"), action = "store_true", default = FALSE,
Expand Down Expand Up @@ -161,6 +166,7 @@ alias_list <- list(
"logratiocalibration" = "log-ratio-calibration",
"maxnonclonal" = "max-non-clonal",
"maxhomozygousloss" = "max-homozygous-loss",
"speedupheuristics" = "speedup-heuristics",
"outvcf" = "out-vcf"
)
replace_alias <- function(x, deprecated = TRUE) {
Expand Down Expand Up @@ -398,12 +404,14 @@ if (file.exists(file.rds) && !opt$force) {
min.logr.sdev = opt$min_logr_sdev,
error = opt$error, DB.info.flag = opt$db_info_flag,
POPAF.info.field = opt$popaf_info_field,
Cosmic.CNT.info.field = opt$cosmic_cnt_info_field,
log.ratio.calibration = opt$log_ratio_calibration,
max.non.clonal = opt$max_non_clonal,
max.homozygous.loss = as.numeric(strsplit(opt$max_homozygous_loss, ",")[[1]]),
post.optimize = opt$post_optimize,
speedup.heuristics = opt$speedupheuristics,
speedup.heuristics = opt$speedup_heuristics,
vcf.field.prefix = "PureCN.",
cosmic.vcf.file = opt$cosmic_vcf_file,
BPPARAM = BPPARAM)
# free memory
vcf <- NULL
Expand Down
3 changes: 3 additions & 0 deletions man/runAbsoluteCN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/setPriorVcf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 10 additions & 8 deletions vignettes/Quick.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ and newer are supported.
scripts:

```{r path}
system.file("extdata", package="PureCN")
system.file("extdata", package = "PureCN")
```

- Exit R and store this path in an environment variable, for example in
Expand All @@ -126,7 +126,7 @@ Usage: /path/to/PureCN/inst/extdata/PureCN.R [options] ...
$ export OUT_REF="reference_files"
$ Rscript $PURECN/IntervalFile.R --infile baits_hg19.bed \
--fasta hg19.fa --out-file $OUT_REF/baits_hg19_intervals.txt \
--offtarget --genome hg19 \
--off-target --genome hg19 \
--export $OUT_REF/baits_optimized_hg19.bed \
--mappability wgEncodeCrgMapabilityAlign100mer.bigWig \
--reptiming wgEncodeUwRepliSeqK562WaveSignalRep1.bigWig
Expand All @@ -143,7 +143,7 @@ sampling variance slightly. Note that we do however strongly recommend running t
variant caller with a padding of at least 50bp to increase the number of
informative SNPs, see below in the VCF section.

The `--offtarget` flag will include off-target reads. Including them is
The `--off-target` flag will include off-target reads. Including them is
recommended except for Amplicon data. For whole-exome data, the benefit is
usually also limited unless the assay is inefficient with a high fraction of
off-target reads (>10-15%).
Expand Down Expand Up @@ -176,7 +176,7 @@ Similarly, the `--reptiming` argument takes a replication timing score in the
same format. If provided, GC-normalized and log-transformed coverage is tested
for a linear relationship with this score and normalized accordingly. This is
optional and provides only a minor benefit for coverage normalization, but can
identify samples with high proliferation. Requires `--offtarget` to be useful.
identify samples with high proliferation. Requires `--off-target` to be useful.


# Create VCF files
Expand Down Expand Up @@ -254,11 +254,11 @@ $ Rscript $PURECN/Coverage.R --out-dir $OUT/normals \

Important recommendations:

- Only provide `--keepduplicates` or `--removemapq0` if you know what you are
- Only provide `--keep-duplicates` or `--remove-mapq0` if you know what you are
doing and always use the same command line arguments for tumor and the
normals

- It can be safe to skip the GC-normalization with `--skipgcnorm` when tumor
- It can be safe to skip the GC-normalization with `--skip-gc-norm` when tumor
and normal samples are expected to exhibit similar biases and a sufficient
number of normal samples is available. A good example would be plasma
sequencing. In contrast, old FFPE samples normalized against blood controls
Expand Down Expand Up @@ -323,9 +323,9 @@ Important recommendations:

- For ideal results, examine the `interval_weights.png` file to find good
off-target bin widths. You will need to re-run `IntervalFile.R` with the
`--offtargetwidth` parameter and re-calculate the coverages. `NormalDB.R`
`--average-off-target-width` parameter and re-calculate the coverages. `NormalDB.R`
will also give a suggestion for a good minimum width. We do not recommend
going lower than this estimate; setting `--offtargetwidth` to value larger
going lower than this estimate; setting `--average-off-target-width` to value larger
than this value can decrease noise at the cost of loss in resolution.
Setting it to 1.2-1.5x the minimum recommendation (that should be ideally <
250kb) is a good starting point.
Expand Down Expand Up @@ -771,6 +771,7 @@ Argument name | Corresponding PureCN argument | PureCN function
`--error` | `error` | `runAbsoluteCN`
`--db-info-flag` | `DB.info.flag` | `runAbsoluteCN`
`--popaf-info-field` | `POPAF.info.field` | `runAbsoluteCN`
`--cosmic-cnt-info-field` | `Cosmic.CNT.info.field` | `runAbsoluteCN`
`--min-cosmic-cnt` | `min.cosmic.cnt` | `setPriorVcf`
`--interval-padding` | `interval.padding` | `filterVcfBasic`
`--min-total-counts` | `min.total.counts` | `filterIntervals`
Expand All @@ -789,6 +790,7 @@ Argument name | Corresponding PureCN argument | PureCN function
`--max-copynumber` | `test.num.copy` | `runAbsoluteCN`
`--post-optimize` | `post.optimize` | `runAbsoluteCN`
`--bootstrap-n` | `n` | `bootstrapResults`
`--speedup-heuristics` | `speedup.heuristics` | `runAbsoluteCN`
`--model-homozygous` | `model.homozygous` | `runAbsoluteCN`
`--model` | `model` | `runAbsoluteCN`
`--log-ratio-calibration` | `log.ratio.calibration` | `runAbsoluteCN`
Expand Down

0 comments on commit 586a420

Please sign in to comment.