Merge pull request #192 from CCBR/iss-190

Improvements to chipseeker, contrast & input checks, docs
CCBR · Jul 8, 2024 · a2bfd64 · a2bfd64
2 parents 61fe191 + 3846a5e
commit a2bfd64
Show file tree

Hide file tree

Showing 14 changed files with 230 additions and 61 deletions.
diff --git a/bin/chipseeker_annotate.R b/bin/chipseeker_annotate.R
@@ -26,6 +26,7 @@ load_package(args$txdb)
 txdb <- args$txdb %>%
   rlang::sym() %>%
   eval()
+dir.create(file.path(outfile_prefix), showWarnings = FALSE)
 
 # parse peak file
 np <- read.table(args$peak, sep = "\t")
@@ -71,7 +72,7 @@ annot <- annotatePeak(
   ignoreDownstream = FALSE,
   overlap = "TSS"
 )
-saveRDS(annot, file = glue("{outfile_prefix}.annotation.Rds"))
+saveRDS(annot, file = glue("{outfile_prefix}/{outfile_prefix}.annotation.Rds"))
 
 padf <- as.data.frame(annot)
 padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "")
@@ -103,7 +104,7 @@ merged <- dplyr::full_join(padf, np, by = "peakID") %>%
   dplyr::rename("#peakID" = "peakID") %>%
   dplyr::arrange(dplyr::desc(qValue))
 
-annotated_outfile <- glue("{outfile_prefix}.annotated.txt")
+annotated_outfile <- glue("{outfile_prefix}/{outfile_prefix}.annotated.txt")
 write.table(merged, annotated_outfile, sep = "\t", quote = FALSE, row.names = FALSE)
 l <- paste("# Median peak width : ", median(merged$width), sep = "")
 write(l, annotated_outfile, append = TRUE)
@@ -114,7 +115,7 @@ write(l, annotated_outfile, append = TRUE)
 
 
 # get promoter genes
-outfile_genelist <- glue("{outfile_prefix}.genelist.txt")
+outfile_genelist <- glue("{outfile_prefix}/{outfile_prefix}.genelist.txt")
 # ... all lines with annotation starting with "Promoter"
 promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation))
 # ... all lines with annotation is "5' UTR"
@@ -138,7 +139,7 @@ write(l, outfile_genelist, append = TRUE)
 # annotation type frequency table
 
 l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t")
-outfile_summary <- glue("{outfile_prefix}.summary.txt")
+outfile_summary <- glue("{outfile_prefix}/{outfile_prefix}.summary.txt")
 write(l, outfile_summary)
 atypes <- c(
   "3' UTR",
@@ -169,7 +170,7 @@ for (ann in c("Exon", "Intron")) {
 
 # upset plot
 ggsave(
-  filename = glue("{outfile_prefix}_upsetplot.png"),
+  filename = glue("{outfile_prefix}/{outfile_prefix}_upsetplot.png"),
   plot = upsetplot(annot, vennpie = TRUE),
   device = "png"
 )
diff --git a/conf/full_mm10.config b/conf/full_mm10.config
@@ -5,7 +5,8 @@ params {
     genome = 'mm10'
     outdir = "results/full_mm10"
     input = "${projectDir}/assets/samplesheet_full_mm10.csv"
-    contrasts = "${projectDir}/assets/contrasts_full_mm10.yml"
+    contrasts = 'true'
+    contrastsheet = "${projectDir}/assets/contrasts_full_mm10.yml"
     sicer {
         species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
     }

diff --git a/conf/test_human.config b/conf/test_human.config
@@ -5,7 +5,7 @@ params {
     genome = 'hg38'
     outdir = "results/human"
     input = "assets/samplesheet_human.csv"
-    contrasts = null //'assets/contrasts_human.yml'
+    contrasts = false //'assets/contrasts_human.yml'
     //read_length = 50
     sicer.species = "${params.genome}" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
 

diff --git a/conf/test_mm10.config b/conf/test_mm10.config
@@ -5,7 +5,8 @@ params {
     genome = 'mm10'
     outdir = "results/test_mm10"
     input = "${projectDir}/assets/samplesheet_test_mm10.csv"
-    contrasts = "${projectDir}/assets/contrasts_test_mm10.yml"
+    contrasts = 'true'
+    contrastsheet = "${projectDir}/assets/contrasts_test_mm10.yml"
     read_length = 50
     sicer.species = "mm10" // supported species https://github.com/zanglab/SICER2/blob/master/sicer/lib/GenomeData.py
 

diff --git a/docs/manifests.md b/docs/manifests.md
@@ -0,0 +1,30 @@
+# TODO Jotting notes here
+## Samplemanifest
+The following columns are required:
+
+- sample: sampleID; does not need to be a unique column
+- rep: replicateID of sampleID; does not need to be a unique column
+- fastq_1: absolute path to R1 of sampleID
+- fastq_2: absolute path to R1 of sampleID
+- antibody: -c sampleID for mac2; this must match a unique {sample}_{rep} format
+- control: 
+
+Example antibody / control format for a single-end project:
+
+```
+sample,rep,fastq_1,fastq_2,antibody,control
+sample,1,/path/to/sample_1.R1.fastq.gz,,input_1,input_1
+sample,2,/path/to/sample_2.R1.fastq.gz,,input_1,input_1
+input,1,/path/to/sample1.R1.fastq.gz,,,
+input,2,/path/to/sample1.R1.fastq.gz,,,
+```
+
+Example antibody / control format for a paired-end project:
+
+```
+sample,rep,fastq_1,fastq_2,antibody,control
+sample,1,/path/to/sample_1.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1
+sample,2,/path/to/sample_2.R1.fastq.gz,/path/to/sample_1.R2.fastq.gz,input_1,input_1
+input,1,/path/to/input_1.R1.fastq.gz,/path/to/input_1.R2.fastq.gz,,
+input,2,/path/to/input_2.R1.fastq.gz,/path/to/input_2.R2.fastq.gz,,
+```
diff --git a/docs/workflow.md b/docs/workflow.md
@@ -0,0 +1,131 @@
+## Process workflow
+
+TODO add images to show workflow 
+
+### Pipeline Checks
+- Input files are checked that they meet standard formatting; some file access is reviewed
+
+- Processes include:
+
+    - INPUT_CHECK:SAMPLESHEET_CHECK 
+    - INPUT_CHECK:CHECK_CONTRASTS
+
+- Output directories include:
+
+    - check_contrasts
+
+## Pre-alignment
+- Adaptors are trimmed, if blacklists are included, filtering occurs
+
+- Processes include:
+
+    - CUTADAPT
+    - FILTER_BLACKLIST:BWA_MEM
+    - FILTER_BLACKLIST:SAMTOOLS_FILTERALIGNED
+    - FILTER_BLACKLIST:PICARD_SAMTOFASTQ
+    - FILTER_BLACKLIST:CUSTOM_COUNTFASTQ
+
+- Output directories include:
+
+    - cutadapt
+
+## Alignment
+- Samples are aligned using BWA; alignment stats are generated; samples are sorted and filtered
+
+- Processes include:
+
+    - ALIGN_GENOME:BWA_MEM
+    - ALIGN_GENOME:SAMTOOLS_FLAGSTAT_ALIGN
+    - ALIGN_GENOME:FILTER_QUALITY
+    - ALIGN_GENOME:SAMTOOLS_SORT
+    - ALIGN_GENOME:SAMTOOLS_FLAGSTAT_FILTER
+
+- Output directories include:
+
+    - bwa_mem
+    - samtools_flagstat_align
+    - samtools_filteraligned
+    - samtools_sort
+    - samtools_flagstat_filter
+
+## Deduplicate
+- Processes include:
+
+    - DEDUPLICATE:MACS2_DEDUP
+    - DEDUPLICATE:INDEX_SINGLE
+    - DEDUPLICATE:PICARD_DEDUP
+    - DEDUPLICATE:INDEX_PAIRED
+
+- Output directories include:
+
+## Quality Control
+- Processes include:
+
+    - PPQT_PROCESS
+    - QC:FASTQC_RAW
+    - QC:FASTQC_TRIMMED
+    - QC:FASTQ_SCREE
+    - QC:PRESEQ
+    - QC:HANDLE_PRESEQ_ERROR
+    - QC:PARSE_PRESEQ_LOG 
+    - QC:QC_STATS
+    - QC:QC_TABLE
+
+## Deeptools analysis
+- Processes include:
+
+    - QC:DEEPTOOLS:BAM_COVERAGE
+    - QC:DEEPTOOLS:BIGWIG_SUM
+    - QC:DEEPTOOLS:PLOT_CORRELATION
+    - QC:DEEPTOOLS:PLOT_PCA
+    - QC:DEEPTOOLS:NORMALIZE_INPUT
+    - QC:DEEPTOOLS:BED_PROTEIN_CODING
+    - QC:DEEPTOOLS:COMPUTE_MATRIX
+    - QC:DEEPTOOLS:PLOT_HEATMAP
+    - QC:DEEPTOOLS:PLOT_PROFILE
+    - QC:DEEPTOOLS:PLOT_FINGERPRINT
+
+## Peak calling
+- Processes include:
+
+    - PHANTOM_PEAKS
+    - CALL_PEAKS:CALC_GENOME_FRAC
+    - CALL_PEAKS:BAM_TO_BED
+    - CALL_PEAKS:MACS_BROAD
+    - CALL_PEAKS:MACS_NARROW
+    - CALL_PEAKS:SICER
+    - CALL_PEAKS:CONVERT_SICER
+    - CALL_PEAKS:GEM
+    - CALL_PEAKS:FILTER_GEM
+    - CALL_PEAKS:FRACTION_IN_PEAKS
+    - CALL_PEAKS:CONCAT_FRIPS
+    - CALL_PEAKS:PLOT_FRIP
+    - CALL_PEAKS:GET_PEAK_META
+    - CALL_PEAKS:CONCAT_PEAK_META
+    - CALL_PEAKS:PLOT_PEAK_WIDTHS
+
+## Consensus Peaks
+- Processes include:
+
+    - CONSENSUS_PEAKS:CAT_CAT
+    - CONSENSUS_PEAKS:SORT_BED
+    - CONSENSUS_PEAKS:BEDTOOLS_MERGE
+    - CONSENSUS_PEAKS:- CONSENSUS_PEAKS:_OUT
+
+## Annotate
+- Processes include:
+
+    - ANNOTATE:CHIPSEEKER_PEAKPLOT
+    - ANNOTATE:CHIPSEEKER_ANNOTATE
+    - ANNOTATE:CHIPSEEKER_PLOTLIST
+    - ANNOTATE:HOMER_MOTIFS
+    - ANNOTATE:MEME_AME
+
+## Differential Analysis
+- If there are more than 2 replicates per group then `diffbind` is performed; otherwise `manorm` pairewise analysis is performed
+
+- Processes include:
+
+    - DIFF:DIFFBIND:PREP_DIFFBIND
+    - DIFF:DIFFBIND:DIFFBIND_RMD
+    - DIFF:MANORM:MANORM_PAIRWISE
diff --git a/main.nf b/main.nf
@@ -36,6 +36,9 @@ include { PHANTOM_PEAKS
           PPQT_PROCESS
           MULTIQC                  } from "./modules/local/qc.nf"
 
+
+contrastsheet = params.contrastsheet ?: "/assets/contrast_test.ymls"
+
 workflow.onComplete {
     if (!workflow.stubRun && !workflow.commandLine.contains('-preview')) {
         def message = Utils.spooker(workflow)
@@ -64,7 +67,7 @@ workflow {
 }
 
 workflow CHIPSEQ {
-    INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center)
+    INPUT_CHECK(file(params.input, checkIfExists: true), params.seq_center, file(contrastsheet))
 
     INPUT_CHECK.out.reads.set { raw_fastqs }
     raw_fastqs | CUTADAPT
@@ -129,7 +132,6 @@ workflow CHIPSEQ {
             }
             .set{ ch_consensus_peaks }
         if (params.contrasts) {
-            contrasts = file(params.contrasts, checkIfExists: true)
             // TODO use consensus peaks for regions of interest in diffbind
             CALL_PEAKS.out.bam_peaks
                 .combine(deduped_bam)
@@ -145,8 +147,7 @@ workflow CHIPSEQ {
                 .set{ tagalign_peaks }
             DIFF( bam_peaks,
                   tagalign_peaks,
-                  INPUT_CHECK.out.csv,
-                  contrasts
+                  INPUT_CHECK.out.contrasts
                 )
 
         }

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -96,6 +96,8 @@ nav:
   - Home: index.md
   - Usage:
       - Getting Started: getting-started.md
+      - Creating Manifests: manifests.md
+      - Workflow Overview: workflow.md
   - Developer Guide:
       - How to contribute: contributing.md
       - Release guide: release-guide.md

diff --git a/modules/local/chipseeker/annotate/main.nf b/modules/local/chipseeker/annotate/main.nf
@@ -1,5 +1,5 @@
 process CHIPSEEKER_ANNOTATE {
-    tag "${meta.id}.${meta.group}"
+    tag "${meta.id}"
     label 'peaks'
     label 'process_high'
 
@@ -11,15 +11,15 @@ process CHIPSEEKER_ANNOTATE {
         val(annot_db)
 
     output:
-        tuple val(meta), path("*.annotated.txt"), path("*.summary.txt"), path("*.genelist.txt"), emit: txt
-        path("*.annotation.Rds"),                                                                emit: annot
-        path("*.png"),                                                                           emit: plots
+        tuple val(meta), path("${meta.id}/*.annotated.txt"), path("${meta.id}/*.summary.txt"), path("${meta.id}/*.genelist.txt"), emit: txt
+        path("${meta.id}/*.annotation.Rds"),                                                                emit: annot
+        path("${meta.id}/*.png"),                                                                           emit: plots
 
     script:
     """
     chipseeker_annotate.R \\
         --peak ${bed} \\
-        --outfile-prefix ${meta.id}.${meta.group} \\
+        --outfile-prefix ${meta.id} \\
         --genome-txdb ${txdb} \\
         --genome-annot ${annot_db} \\
         --uptss 2000 \\
@@ -32,7 +32,7 @@ process CHIPSEEKER_ANNOTATE {
     """
     for ftype in annotated.txt summary.txt genelist.txt annotation.Rds .png
     do
-        touch ${meta.id}.${meta.group}.\${ftype}
+        touch ${meta.id}/${meta.id}.\${ftype}
     done
     """
 }
diff --git a/modules/local/chipseeker/peakplot/main.nf b/modules/local/chipseeker/peakplot/main.nf
@@ -1,7 +1,7 @@
 process CHIPSEEKER_PEAKPLOT {
     tag "${meta.id}"
     label 'peaks'
-    label 'process_medium'
+    label 'process_high'
 
     container 'nciccbr/ccbr_chipseeker:1.1.2'
 

diff --git a/modules/local/deeptools.nf b/modules/local/deeptools.nf
@@ -70,7 +70,7 @@ process NORMALIZE_INPUT {
 process BIGWIG_SUM {
     label 'qc'
     label 'deeptools'
-    label 'process_high'
+    label 'process_high_memory'
 
     container "${params.containers.deeptools}"
 
@@ -307,7 +307,7 @@ process PLOT_HEATMAP {
 process PLOT_PROFILE {
   label 'qc'
   label 'deeptools'
-  label 'process_single'
+  label 'process_low'
 
   container "${params.containers.deeptools}"
 

diff --git a/nextflow.config b/nextflow.config
@@ -2,11 +2,14 @@ nextflow.enable.dsl = 2
 
 params {
     input = null
-    contrasts = null
+    contrasts = false
     seq_center = null
     read_length = null
     genome = null
 
+    // manifest for contrasts
+    contrastsheet = null
+
     // custom genome options
     genome_fasta = null
     genes_gtf   = null