Skip to content

Commit

Permalink
Merge pull request #192 from CCBR/iss-190
Browse files Browse the repository at this point in the history
Improvements to chipseeker, contrast & input checks, docs
  • Loading branch information
kopardev authored Jul 8, 2024
2 parents 61fe191 + 3846a5e commit a2bfd64
Show file tree
Hide file tree
Showing 14 changed files with 230 additions and 61 deletions.
11 changes: 6 additions & 5 deletions bin/chipseeker_annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ load_package(args$txdb)
txdb <- args$txdb %>%
rlang::sym() %>%
eval()
dir.create(file.path(outfile_prefix), showWarnings = FALSE)

# parse peak file
np <- read.table(args$peak, sep = "\t")
Expand Down Expand Up @@ -71,7 +72,7 @@ annot <- annotatePeak(
ignoreDownstream = FALSE,
overlap = "TSS"
)
saveRDS(annot, file = glue("{outfile_prefix}.annotation.Rds"))
saveRDS(annot, file = glue("{outfile_prefix}/{outfile_prefix}.annotation.Rds"))

padf <- as.data.frame(annot)
padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
Expand Down Expand Up @@ -103,7 +104,7 @@ merged <- dplyr::full_join(padf, np, by = "peakID") %>%
dplyr::rename("#peakID" = "peakID") %>%
dplyr::arrange(dplyr::desc(qValue))

annotated_outfile <- glue("{outfile_prefix}.annotated.txt")
annotated_outfile <- glue("{outfile_prefix}/{outfile_prefix}.annotated.txt")
write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE)
l <- paste("# Median peak width : ", median(merged$width), sep = "")
write(l, annotated_outfile, append = TRUE)
Expand All @@ -114,7 +115,7 @@ write(l, annotated_outfile, append = TRUE)


# get promoter genes
outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
outfile_genelist <- glue("{outfile_prefix}/{outfile_prefix}.genelist.txt")
# ... all lines with annotation starting with "Promoter"
promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation))
# ... all lines with annotation is "5' UTR"
Expand All @@ -138,7 +139,7 @@ write(l, outfile_genelist, append = TRUE)
# annotation type frequency table

l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t")
outfile_summary <- glue("{outfile_prefix}.summary.txt")
outfile_summary <- glue("{outfile_prefix}/{outfile_prefix}.summary.txt")
write(l, outfile_summary)
atypes <- c(
"3' UTR",
Expand Down Expand Up @@ -169,7 +170,7 @@ for (ann in c("Exon", "Intron")) {

# upset plot
ggsave(
filename = glue("{outfile_prefix}_upsetplot.png"),
filename = glue("{outfile_prefix}/{outfile_prefix}_upsetplot.png"),
plot = upsetplot(annot, vennpie = TRUE),
device = "png"
)
3 changes: 2 additions & 1 deletion conf/full_mm10.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ params {
genome = 'mm10'
outdir = "results/full_mm10"
input = "${projectDir}/assets/samplesheet_full_mm10.csv"
contrasts = "${projectDir}/assets/contrasts_full_mm10.yml"
contrasts = 'true'
contrastsheet = "${projectDir}/assets/contrasts_full_mm10.yml"
sicer {
species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
}
Expand Down
2 changes: 1 addition & 1 deletion conf/test_human.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ params {
genome = 'hg38'
outdir = "results/human"
input = "assets/samplesheet_human.csv"
contrasts = null //'assets/contrasts_human.yml'
contrasts = false //'assets/contrasts_human.yml'
//read_length = 50
sicer.species = "${params.genome}" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py

Expand Down
3 changes: 2 additions & 1 deletion conf/test_mm10.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ params {
genome = 'mm10'
outdir = "results/test_mm10"
input = "${projectDir}/assets/samplesheet_test_mm10.csv"
contrasts = "${projectDir}/assets/contrasts_test_mm10.yml"
contrasts = 'true'
contrastsheet = "${projectDir}/assets/contrasts_test_mm10.yml"
read_length = 50
sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py

Expand Down
30 changes: 30 additions & 0 deletions docs/manifests.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# TODO Jotting notes here
## Samplemanifest
The following columns are required:

- sample: sampleID; does not need to be a unique column
- rep: replicateID of sampleID; does not need to be a unique column
- fastq_1: absolute path to R1 of sampleID
- fastq_2: absolute path to R1 of sampleID
- antibody: -c sampleID for mac2; this must match a unique {sample}_{rep} format
- control:

Example antibody / control format for a single-end project:

```
sample,rep,fastq_1,fastq_2,antibody,control
sample,1,/path/to/sample_1.R1.fastq.gz,,input_1,input_1
sample,2,/path/to/sample_2.R1.fastq.gz,,input_1,input_1
input,1,/path/to/sample1.R1.fastq.gz,,,
input,2,/path/to/sample1.R1.fastq.gz,,,
```

Example antibody / control format for a paired-end project:

```
sample,rep,fastq_1,fastq_2,antibody,control
sample,1,/path/to/sample_1.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1
sample,2,/path/to/sample_2.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1
input,1,/path/to/input_1.R1.fastq.gz,/path/to/input_1.R2.fastq.gz,,
input,2,/path/to/input_2.R1.fastq.gz,/path/to/input_2.R2.fastq.gz,,
```
131 changes: 131 additions & 0 deletions docs/workflow.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
## Process workflow

TODO add images to show workflow

### Pipeline Checks
- Input files are checked that they meet standard formatting; some file access is reviewed

- Processes include:

- INPUT_CHECK:SAMPLESHEET_CHECK
- INPUT_CHECK:CHECK_CONTRASTS

- Output directories include:

- check_contrasts

## Pre-alignment
- Adaptors are trimmed, if blacklists are included, filtering occurs

- Processes include:

- CUTADAPT
- FILTER_BLACKLIST:BWA_MEM
- FILTER_BLACKLIST:SAMTOOLS_FILTERALIGNED
- FILTER_BLACKLIST:PICARD_SAMTOFASTQ
- FILTER_BLACKLIST:CUSTOM_COUNTFASTQ

- Output directories include:

- cutadapt

## Alignment
- Samples are aligned using BWA; alignment stats are generated; samples are sorted and filtered

- Processes include:

- ALIGN_GENOME:BWA_MEM
- ALIGN_GENOME:SAMTOOLS_FLAGSTAT_ALIGN
- ALIGN_GENOME:FILTER_QUALITY
- ALIGN_GENOME:SAMTOOLS_SORT
- ALIGN_GENOME:SAMTOOLS_FLAGSTAT_FILTER

- Output directories include:

- bwa_mem
- samtools_flagstat_align
- samtools_filteraligned
- samtools_sort
- samtools_flagstat_filter

## Deduplicate
- Processes include:

- DEDUPLICATE:MACS2_DEDUP
- DEDUPLICATE:INDEX_SINGLE
- DEDUPLICATE:PICARD_DEDUP
- DEDUPLICATE:INDEX_PAIRED

- Output directories include:

## Quality Control
- Processes include:

- PPQT_PROCESS
- QC:FASTQC_RAW
- QC:FASTQC_TRIMMED
- QC:FASTQ_SCREE
- QC:PRESEQ
- QC:HANDLE_PRESEQ_ERROR
- QC:PARSE_PRESEQ_LOG
- QC:QC_STATS
- QC:QC_TABLE

## Deeptools analysis
- Processes include:

- QC:DEEPTOOLS:BAM_COVERAGE
- QC:DEEPTOOLS:BIGWIG_SUM
- QC:DEEPTOOLS:PLOT_CORRELATION
- QC:DEEPTOOLS:PLOT_PCA
- QC:DEEPTOOLS:NORMALIZE_INPUT
- QC:DEEPTOOLS:BED_PROTEIN_CODING
- QC:DEEPTOOLS:COMPUTE_MATRIX
- QC:DEEPTOOLS:PLOT_HEATMAP
- QC:DEEPTOOLS:PLOT_PROFILE
- QC:DEEPTOOLS:PLOT_FINGERPRINT

## Peak calling
- Processes include:

- PHANTOM_PEAKS
- CALL_PEAKS:CALC_GENOME_FRAC
- CALL_PEAKS:BAM_TO_BED
- CALL_PEAKS:MACS_BROAD
- CALL_PEAKS:MACS_NARROW
- CALL_PEAKS:SICER
- CALL_PEAKS:CONVERT_SICER
- CALL_PEAKS:GEM
- CALL_PEAKS:FILTER_GEM
- CALL_PEAKS:FRACTION_IN_PEAKS
- CALL_PEAKS:CONCAT_FRIPS
- CALL_PEAKS:PLOT_FRIP
- CALL_PEAKS:GET_PEAK_META
- CALL_PEAKS:CONCAT_PEAK_META
- CALL_PEAKS:PLOT_PEAK_WIDTHS

## Consensus Peaks
- Processes include:

- CONSENSUS_PEAKS:CAT_CAT
- CONSENSUS_PEAKS:SORT_BED
- CONSENSUS_PEAKS:BEDTOOLS_MERGE
- CONSENSUS_PEAKS:- CONSENSUS_PEAKS:_OUT

## Annotate
- Processes include:

- ANNOTATE:CHIPSEEKER_PEAKPLOT
- ANNOTATE:CHIPSEEKER_ANNOTATE
- ANNOTATE:CHIPSEEKER_PLOTLIST
- ANNOTATE:HOMER_MOTIFS
- ANNOTATE:MEME_AME

## Differential Analysis
- If there are more than 2 replicates per group then `diffbind` is performed; otherwise `manorm` pairewise analysis is performed

- Processes include:

- DIFF:DIFFBIND:PREP_DIFFBIND
- DIFF:DIFFBIND:DIFFBIND_RMD
- DIFF:MANORM:MANORM_PAIRWISE
9 changes: 5 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ include { PHANTOM_PEAKS
PPQT_PROCESS
MULTIQC } from "./modules/local/qc.nf"


contrastsheet = params.contrastsheet ?: "/assets/contrast_test.ymls"

workflow.onComplete {
if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) {
def message = Utils.spooker(workflow)
Expand Down Expand Up @@ -64,7 +67,7 @@ workflow {
}

workflow CHIPSEQ {
INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center)
INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center, file(contrastsheet))

INPUT_CHECK.out.reads.set { raw_fastqs }
raw_fastqs | CUTADAPT
Expand Down Expand Up @@ -129,7 +132,6 @@ workflow CHIPSEQ {
}
.set{ ch_consensus_peaks }
if (params.contrasts) {
contrasts = file(params.contrasts, checkIfExists: true)
// TODO use consensus peaks for regions of interest in diffbind
CALL_PEAKS.out.bam_peaks
.combine(deduped_bam)
Expand All @@ -145,8 +147,7 @@ workflow CHIPSEQ {
.set{ tagalign_peaks }
DIFF( bam_peaks,
tagalign_peaks,
INPUT_CHECK.out.csv,
contrasts
INPUT_CHECK.out.contrasts
)

}
Expand Down
2 changes: 2 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ nav:
- Home: index.md
- Usage:
- Getting Started: getting-started.md
- Creating Manifests: manifests.md
- Workflow Overview: workflow.md
- Developer Guide:
- How to contribute: contributing.md
- Release guide: release-guide.md
Expand Down
12 changes: 6 additions & 6 deletions modules/local/chipseeker/annotate/main.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
process CHIPSEEKER_ANNOTATE {
tag "${meta.id}.${meta.group}"
tag "${meta.id}"
label 'peaks'
label 'process_high'

Expand All @@ -11,15 +11,15 @@ process CHIPSEEKER_ANNOTATE {
val(annot_db)

output:
tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
path("*.annotation.Rds"), emit: annot
path("*.png"), emit: plots
tuple val(meta), path("${meta.id}/*.annotated.txt"), path("${meta.id}/*.summary.txt"), path("${meta.id}/*.genelist.txt"), emit: txt
path("${meta.id}/*.annotation.Rds"), emit: annot
path("${meta.id}/*.png"), emit: plots

script:
"""
chipseeker_annotate.R \\
--peak ${bed} \\
--outfile-prefix ${meta.id}.${meta.group} \\
--outfile-prefix ${meta.id} \\
--genome-txdb ${txdb} \\
--genome-annot ${annot_db} \\
--uptss 2000 \\
Expand All @@ -32,7 +32,7 @@ process CHIPSEEKER_ANNOTATE {
"""
for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
do
touch ${meta.id}.${meta.group}.\${ftype}
touch ${meta.id}/${meta.id}.\${ftype}
done
"""
}
2 changes: 1 addition & 1 deletion modules/local/chipseeker/peakplot/main.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
process CHIPSEEKER_PEAKPLOT {
tag "${meta.id}"
label 'peaks'
label 'process_medium'
label 'process_high'

container 'nciccbr/ccbr_chipseeker:1.1.2'

Expand Down
4 changes: 2 additions & 2 deletions modules/local/deeptools.nf
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ process NORMALIZE_INPUT {
process BIGWIG_SUM {
label 'qc'
label 'deeptools'
label 'process_high'
label 'process_high_memory'

container "${params.containers.deeptools}"

Expand Down Expand Up @@ -307,7 +307,7 @@ process PLOT_HEATMAP {
process PLOT_PROFILE {
label 'qc'
label 'deeptools'
label 'process_single'
label 'process_low'

container "${params.containers.deeptools}"

Expand Down
5 changes: 4 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@ nextflow.enable.dsl = 2

params {
input = null
contrasts = null
contrasts = false
seq_center = null
read_length = null
genome = null

// manifest for contrasts
contrastsheet = null

// custom genome options
genome_fasta = null
genes_gtf = null
Expand Down
Loading

0 comments on commit a2bfd64

Please sign in to comment.