Make Cosmic.CNT configurable, make it possible to add Cosmic VCF in P…

…ureCN.R (https://support.bioconductor.org/p/9139446/)
ddrichel · Sep 12, 2021 · 586a420 · 586a420
1 parent a3956f3
commit 586a420
Show file tree

Hide file tree

Showing 9 changed files with 53 additions and 30 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: PureCN
 Type: Package
 Title: Copy number calling and SNV classification using
     targeted short read sequencing
-Version: 1.23.26
+Version: 1.23.27
 Date: 2021-09-11
 Authors@R: c(person("Markus", "Riester",
                     role = c("aut", "cre"),

diff --git a/NEWS b/NEWS
@@ -8,6 +8,7 @@ NEW FEATURES
     o Improved mapping bias estimates: variants with insufficient information
       for position-specific fits (default 3-6 heterozygous variants) 
       are clustered and assigned to the most similar fit  
+    o Make Cosmic.CNT INFO field name customizable  
 
 SIGNIFICANT USER-VISIBLE CHANGES
 

diff --git a/R/filterVcf.R b/R/filterVcf.R
@@ -545,8 +545,8 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
     vcf
 }
 
-.addCosmicCNT <- function(vcf, cosmic.vcf.file) {
-    if (!is.null(info(vcf)$Cosmic.CNT)) {
+.addCosmicCNT <- function(vcf, cosmic.vcf.file, Cosmic.CNT.info.field) {
+    if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
         flog.info("VCF already COSMIC annotated. Skipping.")
         return(vcf)        
     }
@@ -561,9 +561,9 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
     # look-up the variants in COSMIC
     cosmic.vcf <- readVcf(cosmic.vcf.file, genome = genome(vcfRenamedSL)[1],  
         ScanVcfParam(which = rowRanges(vcfRenamedSL),
-            info="CNT",
-            fixed="ALT",
-            geno=NA
+            info = "CNT",
+            fixed = "ALT",
+            geno = NA
         )
     )
     ov <- findOverlaps(vcfRenamedSL, cosmic.vcf, type = "equal")
@@ -580,12 +580,12 @@ function(vcf, tumor.id.in.vcf, allowed=0.05) {
     }
 
     newInfo <- DataFrame(
-        Number=1, Type="Integer",
-        Description="How many samples in Cosmic have this mutation",
-        row.names="Cosmic.CNT")
+        Number = 1, Type = "Integer",
+        Description = "How many samples in Cosmic have this mutation",
+        row.names = Cosmic.CNT.info.field)
     info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
-    info(vcf)$Cosmic.CNT <- NA
-    info(vcf)$Cosmic.CNT[queryHits(ov)] <- info(cosmic.vcf[subjectHits(ov)])$CNT
+    info(vcf)[[Cosmic.CNT.info.field]] <- NA
+    info(vcf)[[Cosmic.CNT.info.field]][queryHits(ov)] <- info(cosmic.vcf[subjectHits(ov)])$CNT
     vcf
 }
 

diff --git a/R/runAbsoluteCN.R b/R/runAbsoluteCN.R
@@ -193,6 +193,7 @@
 #' @param POPAF.info.field As alternative to a flag, use an info field that
 #' contains population allele frequencies. The \code{DB} info flag has priority
 #' over this field when both exist.
+#' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database
 #' @param min.pop.af Minimum population allele frequency in
 #' \code{POPAF.info.field} to set a high germline prior probability.
 #' @param model Use either a beta or a beta-binomial distribution for fitting
@@ -298,8 +299,11 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
     max.segments = 300, min.gof = 0.8, min.variants = 20,
     plot.cnv = TRUE,
     vcf.field.prefix = "",
-    cosmic.vcf.file = NULL, DB.info.flag = "DB",
-    POPAF.info.field = "POP_AF", min.pop.af = 0.001,
+    cosmic.vcf.file = NULL, 
+    DB.info.flag = "DB",
+    POPAF.info.field = "POP_AF",
+    Cosmic.CNT.info.field = "Cosmic.CNT",
+    min.pop.af = 0.001,
     model = c("beta", "betabin"),
     post.optimize = FALSE, speedup.heuristics = 2, BPPARAM = NULL,
     log.file = NULL, verbose = TRUE) {
@@ -534,11 +538,12 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
         vcf <- vcf.filtering$vcf
 
         if (!is.null(cosmic.vcf.file)) {
-            vcf <- .addCosmicCNT(vcf, cosmic.vcf.file)
+            vcf <- .addCosmicCNT(vcf, cosmic.vcf.file, Cosmic.CNT.info.field)
         }
 
         args.setPriorVcf <- c(list(vcf = vcf, tumor.id.in.vcf = tumor.id.in.vcf,
-            DB.info.flag = DB.info.flag), args.setPriorVcf)
+            DB.info.flag = DB.info.flag,
+            Cosmic.CNT.info.field = Cosmic.CNT.info.field), args.setPriorVcf)
         vcf <- do.call(fun.setPriorVcf,
             .checkArgs(args.setPriorVcf, "setPriorVcf"))
 

diff --git a/R/setPriorVcf.R b/R/setPriorVcf.R
@@ -23,6 +23,7 @@
 #' @param DB.info.flag Flag in INFO of VCF that marks presence in common
 #' germline databases. Defaults to \code{DB} that may contain somatic variants
 #' if it is from an unfiltered dbSNP VCF.
+#' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database
 #' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the
 #' prior probability of somatic status for each variant in the 
 #' \code{CollapsedVCF} added to the \code{INFO} field \code{PR}.
@@ -39,7 +40,7 @@
 setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001,
                                                0.995, 0.5),
                         tumor.id.in.vcf = NULL, min.cosmic.cnt = 6, 
-                        DB.info.flag = "DB") {
+                        DB.info.flag = "DB", Cosmic.CNT.info.field = "Cosmic.CNT") {
     if (is.null(tumor.id.in.vcf)) {
         tumor.id.in.vcf <- .getTumorIdInVcf(vcf)
     }
@@ -55,14 +56,14 @@ setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001,
          tmp <- prior.somatic
          prior.somatic <- ifelse(info(vcf)[[DB.info.flag]],
             prior.somatic[2], prior.somatic[1])
-         if (!is.null(info(vcf)$Cosmic.CNT)) {
+         if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
              flog.info("Found COSMIC annotation in VCF. Requiring %i hits.", 
                 min.cosmic.cnt)
              flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and dbSNP.", 
                 tmp[5], tmp[6])
 
-             prior.somatic[which(info(vcf)$Cosmic.CNT >= min.cosmic.cnt)] <- tmp[5]
-             prior.somatic[which(info(vcf)$Cosmic.CNT >= min.cosmic.cnt & 
+             prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5]
+             prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt & 
                 info(vcf)[[DB.info.flag]])] <- tmp[6]
          } else {
              flog.info("Setting somatic prior probabilities for dbSNP hits to %f or to %f otherwise.", 

diff --git a/inst/extdata/PureCN.R b/inst/extdata/PureCN.R
@@ -45,6 +45,11 @@ option_list <- list(
     make_option(c("--popaf-info-field"), action = "store", type = "character",
         default = formals(PureCN::runAbsoluteCN)$POPAF.info.field,
         help = "VCF Filter: VCF INFO field providing population allele frequency [default %default]"),
+    make_option(c("--cosmic-cnt-info-field"), action = "store", type = "character",
+        default = formals(PureCN::runAbsoluteCN)$Cosmic.CNT.info.field,
+        help = "VCF Filter: VCF INFO field providing counts in the Cosmic database [default %default]"),
+    make_option(c("--cosmic-vcf-file"), action = "store", type = "character", default = NULL,
+        help = "VCF Filter: Adds a Cosmic.CNT INFO annotation using a Cosmic VCF. Added for convenience, we recommend adding annotations upstream [default %default]"),
     make_option(c("--min-cosmic-cnt"), action = "store", type = "integer",
         default = formals(PureCN::setPriorVcf)$min.cosmic.cnt,
         help = "VCF Filter: Min number of COSMIC hits [default %default]"),
@@ -96,7 +101,7 @@ option_list <- list(
         help = "Post-optimization [default %default]"),
     make_option(c("--bootstrap-n"), action = "store", type = "integer", default = 0,
         help = "Number of bootstrap replicates [default %default]"),
-    make_option(c("--speedupheuristics"), action = "store", type = "integer",
+    make_option(c("--speedup-heuristics"), action = "store", type = "integer",
         default =  max(eval(formals(PureCN::runAbsoluteCN)$speedup.heuristics)),
         help = "Tries to avoid spending computation time on unlikely local optima [default %default]"),
     make_option(c("--model-homozygous"), action = "store_true", default = FALSE,
@@ -161,6 +166,7 @@ alias_list <- list(
     "logratiocalibration" = "log-ratio-calibration",
     "maxnonclonal" = "max-non-clonal",
     "maxhomozygousloss" = "max-homozygous-loss",
+    "speedupheuristics" = "speedup-heuristics",
     "outvcf" = "out-vcf"
 )    
 replace_alias <- function(x, deprecated = TRUE) {
@@ -398,12 +404,14 @@ if (file.exists(file.rds) && !opt$force) {
             min.logr.sdev = opt$min_logr_sdev,
             error = opt$error, DB.info.flag = opt$db_info_flag,
             POPAF.info.field = opt$popaf_info_field,
+            Cosmic.CNT.info.field = opt$cosmic_cnt_info_field,
             log.ratio.calibration = opt$log_ratio_calibration,
             max.non.clonal = opt$max_non_clonal,
             max.homozygous.loss = as.numeric(strsplit(opt$max_homozygous_loss, ",")[[1]]),
             post.optimize = opt$post_optimize,
-            speedup.heuristics = opt$speedupheuristics,
+            speedup.heuristics = opt$speedup_heuristics,
             vcf.field.prefix = "PureCN.",
+            cosmic.vcf.file = opt$cosmic_vcf_file,
             BPPARAM = BPPARAM)
     # free memory
     vcf <- NULL

diff --git a/man/runAbsoluteCN.Rd b/man/runAbsoluteCN.Rd
diff --git a/man/setPriorVcf.Rd b/man/setPriorVcf.Rd
diff --git a/vignettes/Quick.Rmd b/vignettes/Quick.Rmd
@@ -105,7 +105,7 @@ and newer are supported.
   scripts:
 
 ```{r path}
-system.file("extdata", package="PureCN")
+system.file("extdata", package = "PureCN")
 ```
 
 - Exit R and store this path in an environment variable, for example in 
@@ -126,7 +126,7 @@ Usage: /path/to/PureCN/inst/extdata/PureCN.R [options] ...
 $ export OUT_REF="reference_files"
 $ Rscript $PURECN/IntervalFile.R --infile baits_hg19.bed \ 
     --fasta hg19.fa --out-file $OUT_REF/baits_hg19_intervals.txt \
-    --offtarget --genome hg19 \
+    --off-target --genome hg19 \
     --export $OUT_REF/baits_optimized_hg19.bed \
     --mappability wgEncodeCrgMapabilityAlign100mer.bigWig \
     --reptiming wgEncodeUwRepliSeqK562WaveSignalRep1.bigWig
@@ -143,7 +143,7 @@ sampling variance slightly. Note that we do however strongly recommend running t
 variant caller with a padding of at least 50bp to increase the number of
 informative SNPs, see below in the VCF section.
 
-The `--offtarget` flag will include off-target reads. Including them is
+The `--off-target` flag will include off-target reads. Including them is
 recommended except for Amplicon data. For whole-exome data, the benefit is
 usually also limited unless the assay is inefficient with a high fraction of
 off-target reads (>10-15%).
@@ -176,7 +176,7 @@ Similarly, the `--reptiming` argument takes a replication timing score in the
 same format.  If provided, GC-normalized and log-transformed coverage is tested
 for a linear relationship with this score and normalized accordingly.  This is
 optional and provides only a minor benefit for coverage normalization, but can
-identify samples with high proliferation. Requires `--offtarget` to be useful. 
+identify samples with high proliferation. Requires `--off-target` to be useful. 
 
 
 # Create VCF files
@@ -254,11 +254,11 @@ $ Rscript $PURECN/Coverage.R --out-dir $OUT/normals \
 
 Important recommendations:
 
-- Only provide `--keepduplicates` or `--removemapq0` if you know what you are
+- Only provide `--keep-duplicates` or `--remove-mapq0` if you know what you are
   doing and always use the same command line arguments for tumor and the
   normals
 
-- It can be safe to skip the GC-normalization with `--skipgcnorm` when tumor
+- It can be safe to skip the GC-normalization with `--skip-gc-norm` when tumor
   and normal samples are expected to exhibit similar biases and a sufficient
   number of normal samples is available. A good example would be plasma
   sequencing. In contrast, old FFPE samples normalized against blood controls
@@ -323,9 +323,9 @@ Important recommendations:
 
 - For ideal results, examine the `interval_weights.png` file to find good
   off-target bin widths. You will need to re-run `IntervalFile.R` with the
-  `--offtargetwidth` parameter and re-calculate the coverages. `NormalDB.R`
+  `--average-off-target-width` parameter and re-calculate the coverages. `NormalDB.R`
   will also give a suggestion for a good minimum width. We do not recommend
-  going lower than this estimate; setting `--offtargetwidth` to value larger
+  going lower than this estimate; setting `--average-off-target-width` to value larger
   than this value can decrease noise at the cost of loss in resolution.
   Setting it to 1.2-1.5x the minimum recommendation (that should be ideally <
   250kb) is a good starting point.
@@ -771,6 +771,7 @@ Argument name          | Corresponding PureCN argument | PureCN function
 `--error`            | `error` | `runAbsoluteCN`     
 `--db-info-flag`       | `DB.info.flag` | `runAbsoluteCN`     
 `--popaf-info-field`   | `POPAF.info.field` | `runAbsoluteCN`     
+`--cosmic-cnt-info-field`   | `Cosmic.CNT.info.field` | `runAbsoluteCN`     
 `--min-cosmic-cnt`     | `min.cosmic.cnt` | `setPriorVcf`     
 `--interval-padding`          | `interval.padding` | `filterVcfBasic`  
 `--min-total-counts`   | `min.total.counts` | `filterIntervals`  
@@ -789,6 +790,7 @@ Argument name          | Corresponding PureCN argument | PureCN function
 `--max-copynumber`   | `test.num.copy` | `runAbsoluteCN` 
 `--post-optimize`     | `post.optimize` | `runAbsoluteCN`     
 `--bootstrap-n`       | `n` | `bootstrapResults` 
+`--speedup-heuristics` | `speedup.heuristics` | `runAbsoluteCN`     
 `--model-homozygous`  | `model.homozygous` | `runAbsoluteCN`  
 `--model`            | `model` | `runAbsoluteCN`  
 `--log-ratio-calibration` | `log.ratio.calibration` | `runAbsoluteCN`