Merge pull request #51 from CCBR/qc

Implement main QC steps
CCBR · Sep 6, 2023 · 8a2b703 · 8a2b703
2 parents 1df9d6a + 6d0475f
commit 8a2b703
Show file tree

Hide file tree

Showing 24 changed files with 8,274 additions and 131 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -4,3 +4,4 @@ modules/nf-core/** linguist-generated
 subworkflows/nf-core/** linguist-generated
 *.config linguist-language=nextflow
 assets/** linguist-generated
+docs/**.html linguist-generated
diff --git a/.github/pull_request_template.md → .github/PULL_REQUEST_TEMPLATE.md b/.github/pull_request_template.md → .github/PULL_REQUEST_TEMPLATE.md
@@ -1,11 +1,3 @@
----
-name: Pull Request
-about: Create a Pull Request
-title: ""
-labels: ""
-assignees: ""
----
-
 ## Changes
 
 <!--
@@ -29,6 +21,6 @@ when referring to the issue.
 (~Strikethrough~ any points that are not applicable.)
 
 - [ ] This comment contains a description of changes with justifications, with any relevant issues linked.
-- [ ] Write unit tests for any new features, bug fixes, or other code changes.
+- ~[ ] Write unit tests for any new features, bug fixes, or other code changes.~ _testing framework not yet implemented_
 - [ ] Update docs if there are any API changes.
 - [ ] Update `CHANGELOG.md` with a one-line description of these changes and reference the PR number. Guidelines: https://keepachangelog.com/en/1.1.0/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,10 @@
 default_install_hook_types: [pre-commit, commit-msg]
 default_stages: [pre-commit]
-exclude: ^assets/
+exclude: |
+  (?x)(
+      ^assets/|
+      ^docs/.*.html
+   )
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v1.2.3

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,4 +2,15 @@
 
 _This project is under active development_
 
-<!--This is the first release of CHAMPAGNE 🎉-->
+## CHAMPAGNE 0.1.0
+
+### Quality control steps implemented for single-end reads
+
+- Trim raw reads, FastQC on raw and trimmed reads, and FastQ Screen on trimmed reads.
+- Exclude reads that align to blacklist regions, align remaining reads to the reference genome, and deduplicate.
+- Preseq on aligned reads.
+- Phantompeakqualtools on aligned and deduplicated reads.
+- Process reads with deepTools: bam coverage to generate bigwigs for each sample, summarize all bigwigs, and compute matrices relative to TSSs and scaled to metagene regions.
+- Generate plots with deepTools: PCA, profile, heatmap, Spearman correlation, and fingerprint plots.
+- Summarize all quality control steps in a MultiQC report.
+- Input-normalize ChIP fragments for the next stage of the pipeline.
diff --git a/assets/dag.png b/assets/dag.png
diff --git a/assets/samplesheet_mm10.csv b/assets/samplesheet_mm10.csv
diff --git a/conf/base.config b/conf/base.config
@@ -0,0 +1,66 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ccbr/sandbox Nextflow base config file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    A 'blank slate' config file, appropriate for general use on most high performance
+    compute environments. Assumes that all software is installed and available on
+    the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    // TODO nf-core: Check the defaults for all processes
+    cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
+    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+    time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 1
+    maxErrors     = '-1'
+
+    // Process-specific resource requirements
+    // NOTE - Please try and re-use the labels below as much as possible.
+    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
+    //        If possible, it would be nice to keep the same label naming convention when
+    //        adding in your local modules too.
+    // TODO nf-core: Customise requirements for specific processes.
+    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
+    withLabel:process_single {
+        cpus   = { check_max( 1                  , 'cpus'    ) }
+        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+    }
+    withLabel:process_low {
+        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+    }
+    withLabel:process_medium {
+        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
+        memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+    }
+    withLabel:process_high {
+        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
+        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+    }
+    withLabel:process_long {
+        time   = { check_max( 20.h  * task.attempt, 'time'    ) }
+    }
+    withLabel:process_high_memory {
+        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+    }
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+    /*
+    withName:CUSTOM_DUMPSOFTWAREVERSIONS {
+        cache = false
+    }*/
+}
diff --git a/conf/biowulf.config b/conf/biowulf.config
@@ -1,4 +1,51 @@
+
 params {
-    // config that only works on biowulf, but regardless of on slurm or local
-    // e.g. genome file paths
+  config_profile_description = 'Biowulf nf-core config'
+  config_profile_contact = 'staff@hpc.nih.gov'
+  config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html'
+  max_memory = '224 GB'
+  max_cpus = 32
+  max_time = '72 h'
+
+  igenomes_base = '/fdb/igenomes/'
+}
+
+executor {
+
+    $slurm {
+        queue = 'norm'
+        queueSize = 200
+        pollInterval = '2 min'
+        queueStatInterval = '5 min'
+        submitRateLimit = '6/1min'
+        retry.maxAttempts = 1
+    }
+}
+
+singularity {
+    enabled = true
+    autoMounts = true
+    cacheDir = "/data/$USER/.singularity"
+    envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
+}
+
+env {
+    SINGULARITY_CACHEDIR="/data/$USER/.singularity"
+    PYTHONNOUSERSITE = 1
+}
+
+
+process {
+            executor = 'slurm'
+            maxRetries = 1
+
+            clusterOptions = ' --gres=lscratch:200 '
+
+            scratch = '/lscratch/$SLURM_JOBID'
+
+            stageInMode = 'symlink'
+            stageOutMode = 'rsync'
+
+            // When running the pipeline in a group-shared data directory, 'lenient' caching avoids problems caused by inconsistent file timestamps.
+            cache = 'lenient'
 }
diff --git a/conf/biowulf_slurm.config b/conf/biowulf_slurm.config
diff --git a/conf/modules.conf → conf/modules.config b/conf/modules.conf → conf/modules.config
@@ -21,47 +21,59 @@ process {
     }
     withName: 'TRIM.*' {
         container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
-        cpus = 4
+        cpus = 16
+        memory = '32 G'
     }
     withName: 'FASTQC' {
         container = 'nciccbr/ccrgb_qctools:latest'
-        cpus = 4
+        cpus = 32
+        memory = '24 G'
     }
     withName: 'FASTQ_SCREEN' {
         container = 'nciccbr/ccbr_fastq_screen_0.14.1'
         containerOptions = "--bind ${params.fastq_screen.db_dir}"
-        cpus = 8
+        cpus = 24
+        memory = '24 G'
     }
     withName: 'ALIGN.*|INDEX_BAM' {
         container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
-        cpus = 8
+        cpus = 32
+        memory = '48 G'
     }
     withName: 'PRESEQ' {
         container = 'nciccbr/ccbr_preseq_v2.0:v1'
         cpus = 16
+        memory = '24 G'
     }
     withName: 'PHANTOM_PEAKS' {
         container = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1'
         cpus = 1
+        memory = '24 G'
     }
     withName: 'DEDUPLICATE' {
         container = 'nciccbr/ccbr_macs2_2.2.9.1:v1'
         cpus = 4
+        memory = '24 G'
     }
     withName: 'QC_.*|PPQT_.*' {
         container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
     }
     withName: 'NGSQC_GEN' {
         container = 'nciccbr/ccbr_ngsqc_0.31:v1'
         cpus = 1
+        memory = '120 G'
     }
     withLabel: 'deeptools' {
         container = 'nciccbr/ccbr_deeptools_3.5.3:v1'
+        cpus = 16
+        memory = '24 G'
     }
     withName: 'BAM_COVERAGE|PLOT_FINGERPRINT|COMPUTE_.*|NORMALIZE_INPUT' {
         cpus = 32
+        memory = '24 G'
     }
     withName: 'MULTIQC' {
         container = 'nciccbr/ccbr_multiqc_1.15:v1'
+        memory = '24 G'
     }
 }
diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml
@@ -43,7 +43,31 @@ custom_data:
     id: "QC_Table"
     section_name: "ChIP-specific QC metrics"
     section_href: https://www.encodeproject.org/chip-seq/
-    description: "Encode3 standards: <br>  Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment. <br>  Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M. <br>  Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M. <br>  NRF: Number of distinct mapping reads after removing duplicates/total number of reads. <br> PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely. <br> PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely <br>  Concerning(severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9 <br><br> PPQT standards: https://github.com/crazyhottommy/phantompeakqualtools <br> NSC: cross-correlation value/minimum cross-correlation <br> RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation) <br> Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical). <br> Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP, low read sequence quality, shallow sequencing depth, or few peaks. <br> QualityTag: -2: very low, 2: very high"
+    description: |
+      <h3>Encode3 standards</h3>
+      <ul>
+        <li><a href='https://www.encodeproject.org/chip-seq/histone-encode4/#standards' target='_blank'>Histone standards</a></li>
+        <li><a href='https://www.encodeproject.org/chip-seq/transcription-factor-encode4/#standards' target='_blank'>Transcription Factor standards</a></li>
+      </ul>
+      Usable fragments (A fragment corresponds to a read mapping to one location in the genome.)
+      If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment. <br>
+      Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M. <br>
+      Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M. <br>
+      <ul>
+        <li> NRF: Number of distinct mapping reads after removing duplicates/total number of reads.</li>
+        <li> PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.</li>
+        <li> PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely.</li>
+        <li> Concerning (severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9</li>
+      </ul>
+      <br>
+      <h3><a href='https://github.com/kundajelab/phantompeakqualtools' target='_blank'>phantompeakqualtools</a> standards</h3>
+      <ul>
+      <li> NSC: cross-correlation value/minimum cross-correlation <br>
+           Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).</li>
+      <li> RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation) <br>
+           Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP,
+           low read sequence quality, shallow sequencing depth, or few peaks. <br> QualityTag: -2: very low, 2: very high </li>
+      </ul>
     plot_type: "table"
     pconfig:
       id: "QC_Table"

diff --git a/conf/slurmint.config b/conf/slurmint.config
@@ -0,0 +1,15 @@
+params {
+    config_profile_name = 'Slurm interactive node'
+    max_memory = '224 GB'
+    max_cpus = 32
+    max_time = '72 h'
+}
+process {
+    scratch = '/lscratch/$SLURM_JOBID'
+}
+singularity {
+    enabled = true
+    autoMounts = true
+    cacheDir = "/data/$USER/.singularity"
+    envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
+}
diff --git a/conf/test.config b/conf/test.config
@@ -4,11 +4,27 @@ params {
 
     outdir = 'results/test'
     input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv
+    genome = 'hg38'
 
     max_cpus = 32           // for interactive node on biowulf
     max_memory = '120 GB'   // for interactive node on biowulf
 
     publish_dir_mode = "symlink"
+
+    fastq_screen {
+        conf = "${baseDir}/conf/fastq_screen.conf"
+        db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/'
+    }
+    align {
+        index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/"
+        blacklist = "${params.genome}.blacklist"
+        blacklist_files = "${params.align.index_dir}${params.align.blacklist}*"
+        reference_files = "${params.align.index_dir}${params.genome}*"
+        min_quality = 6                     // to get a min quality of 5, set this to 6
+        effective_genome_size = 2700000000  // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349
+        chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes"       // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359
+    }
+    gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed"
     deeptools {
         bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
         smooth_length = 75
@@ -17,8 +33,13 @@ params {
     }
 }
 
+dag {
+    enabled = true
+    overwrite = true
+    file = "assets/dag.png"
+}
 report {
     enabled = true
     overwrite = true
-    file = "assets/test_report.html"
+    file = "${params.outdir}/pipeline_info/execution_report.html"
 }