Skip to content

Commit

Permalink
Merge pull request #157 from CCBR/chipseeker-plots
Browse files Browse the repository at this point in the history
refactor chipseeker plots
  • Loading branch information
kelly-sovacool authored Nov 21, 2023
2 parents 1bc3c7f + bd5bd5b commit d349971
Show file tree
Hide file tree
Showing 12 changed files with 185 additions and 68 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
- Support multiple replicates per sample and call consensus peaks. (#129)
- Find motifs in the genome with Homer. (#142)
- Run motif enrichment analysis with MEME. (#142)
- Annotate peaks with chipseeker. (#142,#147)
- Annotate peaks with chipseeker. (#142,#147,#157)
- Add preseq complexity curve and fastq screen to multiqc report. (#147)
- Fix deepTools plots (#144):
- Per sample fingerprint plots instead of per replicate
Expand Down
67 changes: 19 additions & 48 deletions bin/chipseeker_annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,21 @@ parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default
parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS")
parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output")
parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
parser$add_argument("-g", "--genome", required = TRUE, help = "hg38/hg19/mm10/mm9/mmul10/bosTau9/sacCer3")
parser$add_argument("--genome-txdb", dest = "txdb", required = TRUE, help = "BioConductor TxDb package, e.g. TxDb.Hsapiens.UCSC.hg38.knownGene")
parser$add_argument("--genome-annot", dest = "adb", required = TRUE, help = "BioConductor annotation package, e.g. org.Hs.eg.db")

# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args <- parser$parse_args()
outfile_prefix <- args$outfile_prefix

genomes <- tibble::tribble(
~ref_genome, ~adb, ~txdb,
"mm9", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm9.knownGene",
"mm10", "org.Mm.eg.db", "TxDb.Mmusculus.UCSC.mm10.knownGene",
"hg19", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg19.knownGene",
"hg38", "org.Hs.eg.db", "TxDb.Hsapiens.UCSC.hg38.knownGene",
"mmul10", "org.Mmu.eg.db", "TxDb.Mmulatta.UCSC.rheMac10.refGene",
"bosTau9", "org.Bt.eg.db", "TxDb.Btaurus.UCSC.bosTau9.refGene",
"sacCer3", "org.Sc.sgd.db", "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene"
)
adb <- genomes %>%
filter(ref_genome == args$genome) %>%
pull(adb)
adb <- args$adb
load_package(adb)
txdb_str <- genomes %>%
filter(ref_genome == args$genome) %>%
pull(txdb)
load_package(txdb_str)
txdb <- txdb_str %>%
load_package(args$txdb)
txdb <- args$txdb %>%
rlang::sym() %>%
eval()

# parse peak file
np <- read.table(args$peak, sep = "\t")
peak_colnames <- c(
"chrom",
Expand Down Expand Up @@ -89,7 +75,7 @@ saveRDS(annot, file = glue("{outfile_prefix}.annotation.Rds"))
padf <- as.data.frame(annot)
padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
merged <- dplyr::full_join(padf, np, by = "peakID") %>%
dplyr::select(all_of(c(
dplyr::select(any_of(c(
"peakID",
"chrom",
"chromStart",
Expand Down Expand Up @@ -127,17 +113,20 @@ write(l, annotated_outfile, append = TRUE)


# get promoter genes
outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
# ... all lines with annotation starting with "Promoter"
promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation))
# ... all lines with annotation is "5' UTR"
promoters2 <- merged[merged$annotation == "5' UTR", ]
promoters <- rbind(promoters1, promoters2)
promoters <- promoters[order(-promoters$qValue), ]
promoters <- head(promoters, n = args$toppromoterpeaks)
promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
# ENSEMBL and SYMBOL may not be in the merged df columns, depending on adb/txdb
if (length(intersect(c("ENSEMBL", "SYMBOL"), colnames(promoters))) == 2) {
promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")])
colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL")
write.table(promoter_genes, outfile_genelist, sep = "\t", quote = FALSE, row.names = FALSE)
}
l <- paste("# Median peak width : ", median(promoters$width), sep = "")
write(l, outfile_genelist, append = TRUE)
l <- paste("# Median pValue : ", median(promoters$pValue), sep = "")
Expand Down Expand Up @@ -177,27 +166,9 @@ for (ann in c("Exon", "Intron")) {
write(l, outfile_summary, append = TRUE)
}

# plots for individual peak file
peaks <- GenomicRanges::GRanges(
seqnames = np$chrom,
ranges = IRanges(np$chromStart, np$chromEnd),
qValue = np$qValue
# upset plot
ggsave(
filename = glue("{outfile_prefix}_upsetplot.png"),
plot = upsetplot(annot, vennpie = TRUE),
device = "png"
)
plots <- list(
covplot = covplot(peaks, weightCol = "qValue"),
plotPeakProf2 = plotPeakProf2(
peak = peaks, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body", nbin = 800,
TxDb = txdb, weightCol = "qValue", ignore_strand = F
),
upsetplot = upsetplot(annot, vennpie = TRUE)
)

names(plots) %>%
lapply(function(plot_name) {
ggsave(
filename = glue("{outfile_prefix}_{plot_name}.png"),
plot = plots[[plot_name]],
device = "png"
)
})
68 changes: 68 additions & 0 deletions bin/chipseeker_peakplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env Rscript
load_package <- function(x) {
suppressPackageStartupMessages(library(x, character.only = TRUE))
invisible(x)
}
messages <- lapply(c("ChIPseeker", "dplyr", "glue", "ggplot2"), load_package)

parser <- argparse::ArgumentParser()
parser$add_argument("-p", "--peak", required = TRUE, help = "peak file")
parser$add_argument("-o", "--outfile-prefix", required = TRUE, type = "character", dest = "outfile_prefix", help = "prefix for output filenames")
parser$add_argument("--genome-txdb", dest = "txdb", required = TRUE, help = "BioConductor TxDb package, e.g. TxDb.Hsapiens.UCSC.hg38.knownGene")
parser$add_argument("--genome-annot", dest = "adb", required = TRUE, help = "BioConductor annotation package, e.g. org.Hs.eg.db")

# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args <- parser$parse_args()
outfile_prefix <- args$outfile_prefix
adb <- args$adb
load_package(adb)
load_package(args$txdb)
txdb <- args$txdb %>%
rlang::sym() %>%
eval()

np <- read.table(args$peak, sep = "\t")
peak_colnames <- c(
"chrom",
"chromStart",
"chromEnd",
"name",
"score",
"strand",
"signalValue",
"pValue",
"qValue"
)
num_columns <- ncol(np)
if (num_columns == 9) {
colnames(np) <- peak_colnames
np <- np %>% mutate(peak = NA)
} else if (num_columns == 10) {
colnames(np) <- c(peak_colnames, "peak")
} else {
stop(paste("Expected 9 or 10 columns in peak file, but", num_columns, "given"))
}
# plots for individual peak file
peaks <- GenomicRanges::GRanges(
seqnames = np$chrom,
ranges = IRanges(np$chromStart, np$chromEnd),
qValue = np$qValue
)
plots <- list(
covplot = covplot(peaks, weightCol = "qValue"),
plotPeakProf2 = plotPeakProf2(
peak = peaks, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body", nbin = 800,
TxDb = txdb, weightCol = "qValue", ignore_strand = F
)
)

names(plots) %>%
lapply(function(plot_name) {
ggsave(
filename = glue("{outfile_prefix}_{plot_name}.png"),
plot = plots[[plot_name]],
device = "png"
)
})
4 changes: 4 additions & 0 deletions conf/genomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ params {
gene_info = "${params.index_dir}/hg38_basic/geneinfo.bed"
effective_genome_size = 2700000000
meme_motifs = 'assets/HOCOMOCOv11_core_HUMAN_mono_meme_format.tar.gz' // source https://github.com/CCBR/ASPEN/raw/55f909d76500c3502c1c397ef3000908649b0284/resources/motif/HOCOMOCOv11_core_HUMAN_mono_meme_format.tar.gz
bioc_txdb = 'TxDb.Hsapiens.UCSC.hg38.knownGene'
bioc_annot = 'org.Hs.eg.db'
}
'mm10' {
fasta = "${params.index_dir}/mm10_basic/mm10_basic.fa"
Expand All @@ -21,6 +23,8 @@ params {
gene_info = "${params.index_dir}/mm10_basic/geneinfo.bed"
effective_genome_size = 2400000000
meme_motifs = 'assets/HOCOMOCOv11_core_MOUSE_mono_meme_format.tar.gz' // source https://github.com/CCBR/ASPEN/raw/55f909d76500c3502c1c397ef3000908649b0284/resources/motif/HOCOMOCOv11_core_MOUSE_mono_meme_format.tar.gz
bioc_txdb = 'TxDb.Mmusculus.UCSC.mm10.knownGene'
bioc_annot = 'org.Mmu.eg.db'
}
}
}
5 changes: 4 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ params {
genes_gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/atacseq/reference/genes.gtf'
blacklist = '/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/indexes/hg38.blacklist_v3.chrM.chr_rDNA.fa'
rename_contigs = 'assets/R64-1-1_ensembl2UCSC.txt'
meme_motifs = null
bioc_txdb = 'TxDb.Scerevisiae.UCSC.sacCer3.sgdGene'
bioc_annot = 'org.Sc.sgd.db'

deeptools.bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
deeptools.excluded_chroms = 'chrM'
Expand All @@ -26,7 +29,7 @@ params {
macs_broad = true
macs_narrow = true
chipseeker = true
homer = false
homer = true
}
sicer {
species = "sacCer1"
Expand Down
5 changes: 4 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ workflow CHIPSEQ {
deduped_bam,
frag_lengths,
effective_genome_size,
PREPARE_GENOME.out.fasta
PREPARE_GENOME.out.fasta,
PREPARE_GENOME.out.meme_motifs,
PREPARE_GENOME.out.bioc_txdb,
PREPARE_GENOME.out.bioc_annot
)
ch_multiqc = ch_multiqc.mix(CALL_PEAKS.out.plots)
}
Expand Down
13 changes: 8 additions & 5 deletions modules/local/chipseeker/annotate/main.nf
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
process CHIPSEEKER_ANNOTATE {
tag "${meta.id}.${group}"
tag "${meta.id}.${meta.group}"
label 'peaks'
label 'process_medium'

container 'nciccbr/ccbr_chipseeker:1.1.2'

input:
tuple val(meta), path(bed), val(group)
tuple val(meta), path(bed)
val(txdb)
val(annot_db)

output:
tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
Expand All @@ -17,8 +19,9 @@ process CHIPSEEKER_ANNOTATE {
"""
chipseeker_annotate.R \\
--peak ${bed} \\
--outfile-prefix ${meta.id}.${group} \\
--genome ${params.genome} \\
--outfile-prefix ${meta.id}.${meta.group} \\
--genome-txdb ${txdb} \\
--genome-annot ${annot_db} \\
--uptss 2000 \\
--downtss 2000 \\
--toppromoterpeaks 1000
Expand All @@ -28,7 +31,7 @@ process CHIPSEEKER_ANNOTATE {
"""
for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
do
touch ${meta.id}.${group}.\${ftype}
touch ${meta.id}.${meta.group}.\${ftype}
done
"""
}
32 changes: 32 additions & 0 deletions modules/local/chipseeker/peakplot/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
process CHIPSEEKER_PEAKPLOT {
tag "${meta.id}.${group}"
label 'peaks'
label 'process_medium'

container 'nciccbr/ccbr_chipseeker:1.1.2'

input:
tuple val(meta), path(bed), val(group)
val(txdb)
val(annot_db)

output:
tuple val(meta), path("*.png"), emit: plots

script:
"""
chipseeker_peakplot.R \\
--peak ${bed} \\
--outfile-prefix ${meta.id}.${group} \\
--genome-txdb ${txdb} \\
--genome-annot ${annot_db}
"""

stub:
"""
for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
do
touch ${meta.id}.${group}.\${ftype}
done
"""
}
8 changes: 7 additions & 1 deletion modules/local/prepare_genome.nf
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ process WRITE_GENOME_CONFIG {
path(chrom_dir)
path(gene_info)
val(effective_genome_size)
val(meme_motifs)
val(bioc_txdb)
val(bioc_annot)

output:
path("*.config"), emit: conf
Expand All @@ -151,6 +154,7 @@ process WRITE_GENOME_CONFIG {
import os
import pprint
import shutil
print("${meme_motifs}")
os.makedirs("${genome_name}/")
for subdir, filelist in (('reference/', "${reference_index}"), ('blacklist', "${blacklist_index}")):
dirpath = f"${{genome_name}}/{subdir}"
Expand All @@ -169,7 +173,9 @@ process WRITE_GENOME_CONFIG {
chrom_sizes = '"\${params.index_dir}/${genome_name}/${chrom_sizes}"',
gene_info = '"\${params.index_dir}/${genome_name}/${gene_info}"',
effective_genome_size = "${effective_genome_size}",
meme_motifs = "null"
meme_motifs = "${meme_motifs}",
bioc_txdb = "${bioc_txdb}",
bioc_annot = "${bioc_annot}"
)
pprint.pprint(genome)
with open('${genome_name}.config', 'w') as conf_file:
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ params {
macs_narrow = true
chipseeker = true
homer = true
meme = true
}
}

Expand Down
27 changes: 17 additions & 10 deletions subworkflows/local/peaks.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ include { HOMER_MOTIFS } from "../../modules/local/homer"
include { MEME_AME } from "../../modules/local/meme"
include { CHIPSEEKER_ANNOTATE } from "../../modules/local/chipseeker/annotate"
include { CHIPSEEKER_PLOTLIST } from "../../modules/local/chipseeker/plotlist"
include { CHIPSEEKER_PEAKPLOT } from "../../modules/local/chipseeker/peakplot"

workflow CALL_PEAKS {
take:
Expand All @@ -31,6 +32,9 @@ workflow CALL_PEAKS {
frag_lengths
effective_genome_size
genome_fasta
meme_motifs
bioc_txdb
bioc_annot

main:
genome_frac = CALC_GENOME_FRAC(chrom_sizes, effective_genome_size)
Expand Down Expand Up @@ -150,23 +154,26 @@ workflow CALL_PEAKS {
peaks_grouped
}
peaks_grouped | CONSENSUS_PEAKS
ch_consensus_peaks = CONSENSUS_PEAKS.out.peaks

if (params.run.chipseeker) {
// TODO: change consensus peak method to keep p-value, q-value, etc for use in chipseeker
ch_peaks | CHIPSEEKER_ANNOTATE
if (params.run.chipseeker && bioc_txdb && bioc_annot) {
// TODO: change consensus peak method to keep p-value, q-value, etc for use in chipseeker peakplots
CHIPSEEKER_PEAKPLOT( ch_peaks, bioc_txdb, bioc_annot )
CHIPSEEKER_ANNOTATE( ch_consensus_peaks, bioc_txdb, bioc_annot )
CHIPSEEKER_ANNOTATE.out.annot.collect() | CHIPSEEKER_PLOTLIST
ch_plots = ch_plots.mix(
CHIPSEEKER_PLOTLIST.out.plots
)
)
}
if (params.run.homer) {
HOMER_MOTIFS( CONSENSUS_PEAKS.out.peaks.combine(genome_fasta),
params.homer.de_novo,
file(params.homer.jaspar_db, checkIfExists: true)
HOMER_MOTIFS(ch_consensus_peaks.combine(genome_fasta),
params.homer.de_novo,
file(params.homer.jaspar_db, checkIfExists: true)
)
if (params.genomes[ params.genome ].meme_motifs) {
MEME_AME( HOMER_MOTIFS.out.ame,
file(params.genomes[ params.genome ].meme_motifs, checkIfExists: true)

if (params.run.meme && meme_motifs) {
MEME_AME(HOMER_MOTIFS.out.ame,
meme_motifs
)
}
}
Expand Down
Loading

0 comments on commit d349971

Please sign in to comment.