Merge pull request #157 from CCBR/chipseeker-plots

refactor chipseeker plots
CCBR · Nov 21, 2023 · d349971 · d349971
2 parents 1bc3c7f + bd5bd5b
commit d349971
Show file tree

Hide file tree

Showing 12 changed files with 185 additions and 68 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,7 @@
 - Support multiple replicates per sample and call consensus peaks. (#129)
 - Find motifs in the genome with Homer. (#142)
 - Run motif enrichment analysis with MEME. (#142)
-- Annotate peaks with chipseeker. (#142,#147)
+- Annotate peaks with chipseeker. (#142,#147,#157)
 - Add preseq complexity curve and fastq screen to multiqc report. (#147)
 - Fix deepTools plots (#144):
   - Per sample fingerprint plots instead of per replicate

diff --git a/bin/chipseeker_annotate.R b/bin/chipseeker_annotate.R
@@ -12,35 +12,21 @@ parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default
 parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
 parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output")
 parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
-parser$add_argument("-g", "--genome", required = TRUE, help = "hg38/hg19/mm10/mm9/mmul10/bosTau9/sacCer3")
+parser$add_argument("--genome-txdb", dest = "txdb", required = TRUE, help = "BioConductor TxDb package, e.g. TxDb.Hsapiens.UCSC.hg38.knownGene")
+parser$add_argument("--genome-annot", dest = "adb", required = TRUE, help = "BioConductor annotation package, e.g. org.Hs.eg.db")
 
 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults,
 args <- parser$parse_args()
 outfile_prefix <- args$outfile_prefix
-
-genomes <- tibble::tribble(
-  ~ref_genome, ~adb, ~txdb,
-  "mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene",
-  "mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene",
-  "hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene",
-  "hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
-  "mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
-  "bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene",
-  "sacCer3", "org.Sc.sgd.db", "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene"
-)
-adb <- genomes %>%
-  filter(ref_genome == args$genome) %>%
-  pull(adb)
+adb <- args$adb
 load_package(adb)
-txdb_str <- genomes %>%
-  filter(ref_genome == args$genome) %>%
-  pull(txdb)
-load_package(txdb_str)
-txdb <- txdb_str %>%
+load_package(args$txdb)
+txdb <- args$txdb %>%
   rlang::sym() %>%
   eval()
 
+# parse peak file
 np <- read.table(args$peak, sep = "\t")
 peak_colnames <- c(
   "chrom",
@@ -89,7 +75,7 @@ saveRDS(annot, file = glue("{outfile_prefix}.annotation.Rds"))
 padf <- as.data.frame(annot)
 padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
 merged <- dplyr::full_join(padf, np, by = "peakID") %>%
-  dplyr::select(all_of(c(
+  dplyr::select(any_of(c(
     "peakID",
     "chrom",
     "chromStart",
@@ -127,17 +113,20 @@ write(l, annotated_outfile, append = TRUE)
 
 
 # get promoter genes
+outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
 # ... all lines with annotation starting with "Promoter"
 promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation))
 # ... all lines with annotation is "5' UTR"
 promoters2 <- merged[merged$annotation == "5' UTR", ]
 promoters <- rbind(promoters1, promoters2)
 promoters <- promoters[order(-promoters$qValue), ]
 promoters <- head(promoters, n = args$toppromoterpeaks)
-promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
-colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
-outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
-write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
+# ENSEMBL and SYMBOL may not be in the merged df columns, depending on adb/txdb
+if (length(intersect(c("ENSEMBL", "SYMBOL"), colnames(promoters))) == 2) {
+  promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
+  colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
+  write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
+}
 l <- paste("# Median peak width : ", median(promoters$width), sep = "")
 write(l, outfile_genelist, append = TRUE)
 l <- paste("# Median pValue : ", median(promoters$pValue), sep = "")
@@ -177,27 +166,9 @@ for (ann in c("Exon", "Intron")) {
   write(l, outfile_summary, append = TRUE)
 }
 
-# plots for individual peak file
-peaks <- GenomicRanges::GRanges(
-  seqnames = np$chrom,
-  ranges = IRanges(np$chromStart, np$chromEnd),
-  qValue = np$qValue
+# upset plot
+ggsave(
+  filename = glue("{outfile_prefix}_upsetplot.png"),
+  plot = upsetplot(annot, vennpie = TRUE),
+  device = "png"
 )
-plots <- list(
-  covplot = covplot(peaks, weightCol = "qValue"),
-  plotPeakProf2 = plotPeakProf2(
-    peak = peaks, upstream = rel(0.2), downstream = rel(0.2),
-    conf = 0.95, by = "gene", type = "body", nbin = 800,
-    TxDb = txdb, weightCol = "qValue", ignore_strand = F
-  ),
-  upsetplot = upsetplot(annot, vennpie = TRUE)
-)
-
-names(plots) %>%
-  lapply(function(plot_name) {
-    ggsave(
-      filename = glue("{outfile_prefix}_{plot_name}.png"),
-      plot = plots[[plot_name]],
-      device = "png"
-    )
-  })
diff --git a/bin/chipseeker_peakplot.R b/bin/chipseeker_peakplot.R
@@ -0,0 +1,68 @@
+#!/usr/bin/env Rscript
+load_package <- function(x) {
+  suppressPackageStartupMessages(library(x, character.only = TRUE))
+  invisible(x)
+}
+messages <- lapply(c("ChIPseeker", "dplyr", "glue", "ggplot2"), load_package)
+
+parser <- argparse::ArgumentParser()
+parser$add_argument("-p", "--peak", required = TRUE, help = "peak file")
+parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
+parser$add_argument("--genome-txdb", dest = "txdb", required = TRUE, help = "BioConductor TxDb package, e.g. TxDb.Hsapiens.UCSC.hg38.knownGene")
+parser$add_argument("--genome-annot", dest = "adb", required = TRUE, help = "BioConductor annotation package, e.g. org.Hs.eg.db")
+
+# get command line options, if help option encountered print help and exit,
+# otherwise if options not found on command line then set defaults,
+args <- parser$parse_args()
+outfile_prefix <- args$outfile_prefix
+adb <- args$adb
+load_package(adb)
+load_package(args$txdb)
+txdb <- args$txdb %>%
+  rlang::sym() %>%
+  eval()
+
+np <- read.table(args$peak, sep = "\t")
+peak_colnames <- c(
+  "chrom",
+  "chromStart",
+  "chromEnd",
+  "name",
+  "score",
+  "strand",
+  "signalValue",
+  "pValue",
+  "qValue"
+)
+num_columns <- ncol(np)
+if (num_columns == 9) {
+  colnames(np) <- peak_colnames
+  np <- np %>% mutate(peak = NA)
+} else if (num_columns == 10) {
+  colnames(np) <- c(peak_colnames, "peak")
+} else {
+  stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
+}
+# plots for individual peak file
+peaks <- GenomicRanges::GRanges(
+  seqnames = np$chrom,
+  ranges = IRanges(np$chromStart, np$chromEnd),
+  qValue = np$qValue
+)
+plots <- list(
+  covplot = covplot(peaks, weightCol = "qValue"),
+  plotPeakProf2 = plotPeakProf2(
+    peak = peaks, upstream = rel(0.2), downstream = rel(0.2),
+    conf = 0.95, by = "gene", type = "body", nbin = 800,
+    TxDb = txdb, weightCol = "qValue", ignore_strand = F
+  )
+)
+
+names(plots) %>%
+  lapply(function(plot_name) {
+    ggsave(
+      filename = glue("{outfile_prefix}_{plot_name}.png"),
+      plot = plots[[plot_name]],
+      device = "png"
+    )
+  })
diff --git a/conf/genomes.config b/conf/genomes.config
@@ -10,6 +10,8 @@ params {
             gene_info = "${params.index_dir}/hg38_basic/geneinfo.bed"
             effective_genome_size = 2700000000
             meme_motifs = 'assets/HOCOMOCOv11_core_HUMAN_mono_meme_format.tar.gz' // source https://github.com/CCBR/ASPEN/raw/55f909d76500c3502c1c397ef3000908649b0284/resources/motif/HOCOMOCOv11_core_HUMAN_mono_meme_format.tar.gz
+            bioc_txdb = 'TxDb.Hsapiens.UCSC.hg38.knownGene'
+            bioc_annot = 'org.Hs.eg.db'
         }
         'mm10' {
             fasta = "${params.index_dir}/mm10_basic/mm10_basic.fa"
@@ -21,6 +23,8 @@ params {
             gene_info = "${params.index_dir}/mm10_basic/geneinfo.bed"
             effective_genome_size = 2400000000
             meme_motifs = 'assets/HOCOMOCOv11_core_MOUSE_mono_meme_format.tar.gz' // source https://github.com/CCBR/ASPEN/raw/55f909d76500c3502c1c397ef3000908649b0284/resources/motif/HOCOMOCOv11_core_MOUSE_mono_meme_format.tar.gz
+            bioc_txdb = 'TxDb.Mmusculus.UCSC.mm10.knownGene'
+            bioc_annot = 'org.Mmu.eg.db'
         }
     }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -13,6 +13,9 @@ params {
     genes_gtf   = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genes.gtf'
     blacklist = '/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/indexes/hg38.blacklist_v3.chrM.chr_rDNA.fa'
     rename_contigs = 'assets/R64-1-1_ensembl2UCSC.txt'
+    meme_motifs = null
+    bioc_txdb = 'TxDb.Scerevisiae.UCSC.sacCer3.sgdGene'
+    bioc_annot = 'org.Sc.sgd.db'
 
     deeptools.bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
     deeptools.excluded_chroms = 'chrM'
@@ -26,7 +29,7 @@ params {
         macs_broad = true
         macs_narrow = true
         chipseeker = true
-        homer = false
+        homer = true
     }
     sicer {
         species = "sacCer1"

diff --git a/main.nf b/main.nf
@@ -93,7 +93,10 @@ workflow CHIPSEQ {
                    deduped_bam,
                    frag_lengths,
                    effective_genome_size,
-                   PREPARE_GENOME.out.fasta
+                   PREPARE_GENOME.out.fasta,
+                   PREPARE_GENOME.out.meme_motifs,
+                   PREPARE_GENOME.out.bioc_txdb,
+                   PREPARE_GENOME.out.bioc_annot
                    )
         ch_multiqc = ch_multiqc.mix(CALL_PEAKS.out.plots)
     }

diff --git a/modules/local/chipseeker/annotate/main.nf b/modules/local/chipseeker/annotate/main.nf
@@ -1,12 +1,14 @@
 process CHIPSEEKER_ANNOTATE {
-    tag "${meta.id}.${group}"
+    tag "${meta.id}.${meta.group}"
     label 'peaks'
     label 'process_medium'
 
     container 'nciccbr/ccbr_chipseeker:1.1.2'
 
     input:
-        tuple val(meta), path(bed), val(group)
+        tuple val(meta), path(bed)
+        val(txdb)
+        val(annot_db)
 
     output:
         tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
@@ -17,8 +19,9 @@ process CHIPSEEKER_ANNOTATE {
     """
     chipseeker_annotate.R \\
         --peak ${bed} \\
-        --outfile-prefix ${meta.id}.${group} \\
-        --genome ${params.genome} \\
+        --outfile-prefix ${meta.id}.${meta.group} \\
+        --genome-txdb ${txdb} \\
+        --genome-annot ${annot_db} \\
         --uptss 2000 \\
         --downtss 2000 \\
         --toppromoterpeaks 1000
@@ -28,7 +31,7 @@ process CHIPSEEKER_ANNOTATE {
     """
     for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
     do
-        touch ${meta.id}.${group}.\${ftype}
+        touch ${meta.id}.${meta.group}.\${ftype}
     done
     """
 }
diff --git a/modules/local/chipseeker/peakplot/main.nf b/modules/local/chipseeker/peakplot/main.nf
@@ -0,0 +1,32 @@
+process CHIPSEEKER_PEAKPLOT {
+    tag "${meta.id}.${group}"
+    label 'peaks'
+    label 'process_medium'
+
+    container 'nciccbr/ccbr_chipseeker:1.1.2'
+
+    input:
+        tuple val(meta), path(bed), val(group)
+        val(txdb)
+        val(annot_db)
+
+    output:
+        tuple val(meta), path("*.png"), emit: plots
+
+    script:
+    """
+    chipseeker_peakplot.R \\
+        --peak ${bed} \\
+        --outfile-prefix ${meta.id}.${group} \\
+        --genome-txdb ${txdb} \\
+        --genome-annot ${annot_db}
+    """
+
+    stub:
+    """
+    for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
+    do
+        touch ${meta.id}.${group}.\${ftype}
+    done
+    """
+}
diff --git a/modules/local/prepare_genome.nf b/modules/local/prepare_genome.nf
@@ -139,6 +139,9 @@ process WRITE_GENOME_CONFIG {
         path(chrom_dir)
         path(gene_info)
         val(effective_genome_size)
+        val(meme_motifs)
+        val(bioc_txdb)
+        val(bioc_annot)
 
     output:
         path("*.config"), emit: conf
@@ -151,6 +154,7 @@ process WRITE_GENOME_CONFIG {
     import os
     import pprint
     import shutil
+    print("${meme_motifs}")
     os.makedirs("${genome_name}/")
     for subdir, filelist in (('reference/', "${reference_index}"), ('blacklist', "${blacklist_index}")):
         dirpath = f"${{genome_name}}/{subdir}"
@@ -169,7 +173,9 @@ process WRITE_GENOME_CONFIG {
                   chrom_sizes = '"\${params.index_dir}/${genome_name}/${chrom_sizes}"',
                   gene_info = '"\${params.index_dir}/${genome_name}/${gene_info}"',
                   effective_genome_size = "${effective_genome_size}",
-                  meme_motifs = "null"
+                  meme_motifs = "${meme_motifs}",
+                  bioc_txdb = "${bioc_txdb}",
+                  bioc_annot = "${bioc_annot}"
     )
     pprint.pprint(genome)
     with open('${genome_name}.config', 'w') as conf_file:

diff --git a/nextflow.config b/nextflow.config
@@ -79,6 +79,7 @@ params {
         macs_narrow = true
         chipseeker = true
         homer = true
+        meme = true
     }
 }
 

diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf
@@ -21,6 +21,7 @@ include { HOMER_MOTIFS        } from "../../modules/local/homer"
 include { MEME_AME            } from "../../modules/local/meme"
 include { CHIPSEEKER_ANNOTATE } from "../../modules/local/chipseeker/annotate"
 include { CHIPSEEKER_PLOTLIST } from "../../modules/local/chipseeker/plotlist"
+include { CHIPSEEKER_PEAKPLOT } from "../../modules/local/chipseeker/peakplot"
 
 workflow CALL_PEAKS {
     take:
@@ -31,6 +32,9 @@ workflow CALL_PEAKS {
         frag_lengths
         effective_genome_size
         genome_fasta
+        meme_motifs
+        bioc_txdb
+        bioc_annot
 
     main:
         genome_frac = CALC_GENOME_FRAC(chrom_sizes, effective_genome_size)
@@ -150,23 +154,26 @@ workflow CALL_PEAKS {
                 peaks_grouped
             }
         peaks_grouped | CONSENSUS_PEAKS
+        ch_consensus_peaks = CONSENSUS_PEAKS.out.peaks
 
-        if (params.run.chipseeker) {
-            // TODO: change consensus peak method to keep p-value, q-value, etc for use in chipseeker
-            ch_peaks | CHIPSEEKER_ANNOTATE
+        if (params.run.chipseeker && bioc_txdb && bioc_annot) {
+            // TODO: change consensus peak method to keep p-value, q-value, etc for use in chipseeker peakplots
+            CHIPSEEKER_PEAKPLOT( ch_peaks, bioc_txdb, bioc_annot  )
+            CHIPSEEKER_ANNOTATE( ch_consensus_peaks, bioc_txdb, bioc_annot )
             CHIPSEEKER_ANNOTATE.out.annot.collect() | CHIPSEEKER_PLOTLIST
             ch_plots = ch_plots.mix(
                 CHIPSEEKER_PLOTLIST.out.plots
-                )
+            )
         }
         if (params.run.homer) {
-            HOMER_MOTIFS( CONSENSUS_PEAKS.out.peaks.combine(genome_fasta),
-                        params.homer.de_novo,
-                        file(params.homer.jaspar_db, checkIfExists: true)
+            HOMER_MOTIFS(ch_consensus_peaks.combine(genome_fasta),
+                         params.homer.de_novo,
+                         file(params.homer.jaspar_db, checkIfExists: true)
                         )
-            if (params.genomes[ params.genome ].meme_motifs) {
-                MEME_AME( HOMER_MOTIFS.out.ame,
-                        file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)
+
+            if (params.run.meme && meme_motifs) {
+                MEME_AME(HOMER_MOTIFS.out.ame,
+                         meme_motifs
                         )
             }
         }