Skip to content

Commit

Permalink
Merge pull request #51 from CCBR/qc
Browse files Browse the repository at this point in the history
Implement main QC steps
  • Loading branch information
kelly-sovacool authored Sep 6, 2023
2 parents 1df9d6a + 6d0475f commit 8a2b703
Show file tree
Hide file tree
Showing 24 changed files with 8,274 additions and 131 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ modules/nf-core/** linguist-generated
subworkflows/nf-core/** linguist-generated
*.config linguist-language=nextflow
assets/** linguist-generated
docs/**.html linguist-generated
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
---
name: Pull Request
about: Create a Pull Request
title: ""
labels: ""
assignees: ""
---

## Changes

<!--
Expand All @@ -29,6 +21,6 @@ when referring to the issue.
(~Strikethrough~ any points that are not applicable.)

- [ ] This comment contains a description of changes with justifications, with any relevant issues linked.
- [ ] Write unit tests for any new features, bug fixes, or other code changes.
- ~[ ] Write unit tests for any new features, bug fixes, or other code changes.~ _testing framework not yet implemented_
- [ ] Update docs if there are any API changes.
- [ ] Update `CHANGELOG.md` with a one-line description of these changes and reference the PR number. Guidelines: https://keepachangelog.com/en/1.1.0/
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
default_install_hook_types: [pre-commit, commit-msg]
default_stages: [pre-commit]
exclude: ^assets/
# Paths skipped by every hook (verbose regex; whitespace inside the pattern is ignored).
# The dot before "html" is escaped and the pattern is end-anchored so that only
# files under docs/ whose names end in ".html" are excluded.
exclude: |
  (?x)(
    ^assets/|
    ^docs/.*\.html$
  )
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
Expand Down
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,15 @@

_This project is under active development_

<!--This is the first release of CHAMPAGNE 🎉-->
## CHAMPAGNE 0.1.0

### Quality control steps implemented for single-end reads

- Trim raw reads, FastQC on raw and trimmed reads, and FastQ Screen on trimmed reads.
- Exclude reads that align to blacklist regions, align remaining reads to the reference genome, and deduplicate.
- Preseq on aligned reads.
- Phantompeakqualtools on aligned and deduplicated reads.
- Process reads with deepTools: bam coverage to generate bigwigs for each sample, summarize all bigwigs, and compute matrices relative to TSSs and scaled to metagene regions.
- Generate plots with deepTools: PCA, profile, heatmap, Spearman correlation, and fingerprint plots.
- Summarize all quality control steps in a MultiQC report.
- Input-normalize ChIP fragments for the next stage of the pipeline.
Binary file modified assets/dag.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 9 additions & 0 deletions assets/samplesheet_mm10.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

66 changes: 66 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ccbr/sandbox Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
----------------------------------------------------------------------------------------
*/

// Default resource requests and error handling for every process, followed by
// per-label overrides selected with withLabel.
process {

// TODO nf-core: Check the defaults for all processes
// Defaults: each request is multiplied by task.attempt, so a retried task asks for more.
// NOTE(review): check_max is not defined in this file; presumably it caps the request
// at the matching params.max_* limit — confirm against the main nextflow.config.
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

// Retry when the task exit status is 104 or in the range 130..145;
// every other exit status selects the 'finish' strategy.
errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 1
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// TODO nf-core: Customise requirements for specific processes.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
// process_single: cpus is fixed at 1 (not scaled by task.attempt); memory and time still scale.
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
// process_low / process_medium / process_high: progressively larger cpu, memory and
// time requests, all scaled by task.attempt.
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }
}
// process_long overrides only the time request.
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
}
// process_high_memory overrides only the memory request.
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
}
// error_ignore replaces the exit-status-based strategy above with 'ignore'.
withLabel:error_ignore {
errorStrategy = 'ignore'
}
// error_retry always retries and raises maxRetries from the default of 1 to 2.
withLabel:error_retry {
errorStrategy = 'retry'
maxRetries = 2
}
// The selector below is commented out and therefore has no effect.
/*
withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}*/
}
51 changes: 49 additions & 2 deletions conf/biowulf.config
Original file line number Diff line number Diff line change
@@ -1,4 +1,51 @@

params {
// Settings that apply on Biowulf whether jobs run through SLURM or locally,
// e.g. genome file paths.
config_profile_description = 'Biowulf nf-core config'
config_profile_contact = '[email protected]'
config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html'
// Resource ceilings for this profile.
// NOTE(review): presumably these are the limits applied by check_max in conf/base.config — confirm.
max_memory = '224 GB'
max_cpus = 32
max_time = '72 h'

// Base path for iGenomes references (inferred from the parameter name; confirm where it is used).
igenomes_base = '/fdb/igenomes/'
}

// Executor settings; the $slurm sub-scope holds values for the SLURM executor.
executor {

$slurm {
// Jobs are sent to the 'norm' queue.
queue = 'norm'
queueSize = 200
// Polling, queue-status and submission-rate intervals are set to minutes here.
// NOTE(review): presumably chosen to limit load on the scheduler — confirm.
pollInterval = '2 min'
queueStatInterval = '5 min'
submitRateLimit = '6/1min'
// NOTE(review): verify this option name against the Nextflow version in use.
retry.maxAttempts = 1
}
}

// Run containers with Singularity, caching images under the user's /data directory.
singularity {
enabled = true
autoMounts = true
// Double quotes: $USER is interpolated when the config is parsed.
cacheDir = "/data/$USER/.singularity"
// Comma-separated environment variable names listed for the container allow-list.
envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
}

// Environment variables set by this profile.
env {
// Same path as singularity.cacheDir in this file.
SINGULARITY_CACHEDIR="/data/$USER/.singularity"
// NOTE(review): presumably prevents Python from loading user site-packages — confirm intent.
PYTHONNOUSERSITE = 1
}


// Process defaults for this profile: submit through SLURM and use node-local scratch.
process {
executor = 'slurm'
maxRetries = 1

// Extra submission option requesting the lscratch generic resource with value 200.
// NOTE(review): the unit is presumably GB — confirm against the cluster documentation.
clusterOptions = ' --gres=lscratch:200 '

// Single quotes keep $SLURM_JOBID literal in the config (no Groovy interpolation).
// NOTE(review): presumably expanded by the job's shell on the compute node — confirm.
scratch = '/lscratch/$SLURM_JOBID'

stageInMode = 'symlink'
stageOutMode = 'rsync'

// for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps
cache = 'lenient'
}
51 changes: 0 additions & 51 deletions conf/biowulf_slurm.config

This file was deleted.

20 changes: 16 additions & 4 deletions conf/modules.conf → conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,47 +21,59 @@ process {
}
withName: 'TRIM.*' {
container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
cpus = 4
cpus = 16
memory = '32 G'
}
withName: 'FASTQC' {
container = 'nciccbr/ccrgb_qctools:latest'
cpus = 4
cpus = 32
memory = '24 G'
}
withName: 'FASTQ_SCREEN' {
container = 'nciccbr/ccbr_fastq_screen_0.14.1'
containerOptions = "--bind ${params.fastq_screen.db_dir}"
cpus = 8
cpus = 24
memory = '24 G'
}
withName: 'ALIGN.*|INDEX_BAM' {
container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
cpus = 8
cpus = 32
memory = '48 G'
}
withName: 'PRESEQ' {
container = 'nciccbr/ccbr_preseq_v2.0:v1'
cpus = 16
memory = '24 G'
}
withName: 'PHANTOM_PEAKS' {
container = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1'
cpus = 1
memory = '24 G'
}
withName: 'DEDUPLICATE' {
container = 'nciccbr/ccbr_macs2_2.2.9.1:v1'
cpus = 4
memory = '24 G'
}
withName: 'QC_.*|PPQT_.*' {
container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
}
withName: 'NGSQC_GEN' {
container = 'nciccbr/ccbr_ngsqc_0.31:v1'
cpus = 1
memory = '120 G'
}
withLabel: 'deeptools' {
container = 'nciccbr/ccbr_deeptools_3.5.3:v1'
cpus = 16
memory = '24 G'
}
withName: 'BAM_COVERAGE|PLOT_FINGERPRINT|COMPUTE_.*|NORMALIZE_INPUT' {
cpus = 32
memory = '24 G'
}
withName: 'MULTIQC' {
container = 'nciccbr/ccbr_multiqc_1.15:v1'
memory = '24 G'
}
}
26 changes: 25 additions & 1 deletion conf/multiqc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,31 @@ custom_data:
id: "QC_Table"
section_name: "ChIP-specific QC metrics"
section_href: https://www.encodeproject.org/chip-seq/
description: "Encode3 standards: <br> Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment. <br> Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M. <br> Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M. <br> NRF: Number of distinct mapping reads after removing duplicates/total number of reads. <br> PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely. <br> PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely <br> Concerning(severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9 <br><br> PPQT standards: https://github.com/crazyhottommy/phantompeakqualtools <br> NSC: cross-correlation value/minimum cross-correlation <br> RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation) <br> Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical). <br> Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP, low read sequence quality, shallow sequencing depth, or few peaks. <br> QualityTag: -2: very low, 2: very high"
description: |
<h3>Encode3 standards</h3>
<ul>
<li><a href='https://www.encodeproject.org/chip-seq/histone-encode4/#standards' target='_blank'>Histone standards</a></li>
<li><a href='https://www.encodeproject.org/chip-seq/transcription-factor-encode4/#standards' target='_blank'>Transcription Factor standards</a></li>
</ul>
Usable fragments (A fragment corresponds to a read mapping to one location in the genome.)
If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment. <br>
Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M. <br>
Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M. <br>
<ul>
<li> NRF: Number of distinct mapping reads after removing duplicates/total number of reads.</li>
<li> PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.</li>
<li> PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely</li>
<li> Concerning (severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Compliant: 0.8 < NRF < 0.9, Ideal (no bottlenecking): NRF > 0.9</li>
</ul>
<br>
<h3><a href='https://github.com/kundajelab/phantompeakqualtools' target='_blank'>phantompeakqualtools</a> standards</h3>
<ul>
<li> NSC: cross-correlation value/minimum cross-correlation <br>
Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).</li>
<li> RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation) <br>
Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP,
low read sequence quality, shallow sequencing depth, or few peaks. <br> QualityTag: -2: very low, 2: very high </li>
</ul>
plot_type: "table"
pconfig:
id: "QC_Table"
Expand Down
15 changes: 15 additions & 0 deletions conf/slurmint.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Profile named 'Slurm interactive node' (see config_profile_name below).
params {
config_profile_name = 'Slurm interactive node'
// Resource ceilings for this profile.
// NOTE(review): presumably these are the limits applied by check_max in conf/base.config — confirm.
max_memory = '224 GB'
max_cpus = 32
max_time = '72 h'
}
process {
// Single quotes keep $SLURM_JOBID literal in the config (no Groovy interpolation).
// NOTE(review): presumably expanded by the task's shell at run time — confirm.
scratch = '/lscratch/$SLURM_JOBID'
}
// Run containers with Singularity, caching images under the user's /data directory.
singularity {
enabled = true
autoMounts = true
// Double quotes: $USER is interpolated when the config is parsed.
cacheDir = "/data/$USER/.singularity"
envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
}
23 changes: 22 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,27 @@ params {

outdir = 'results/test'
input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv
genome = 'hg38'

max_cpus = 32 // for interactive node on biowulf
max_memory = '120 GB' // for interactive node on biowulf

publish_dir_mode = "symlink"

fastq_screen {
conf = "${baseDir}/conf/fastq_screen.conf"
db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/'
}
align {
index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/"
blacklist = "${params.genome}.blacklist"
blacklist_files = "${params.align.index_dir}${params.align.blacklist}*"
reference_files = "${params.align.index_dir}${params.genome}*"
min_quality = 6 // to get a min quality of 5, set this to 6
effective_genome_size = 2700000000 // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349
chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359
}
gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed"
deeptools {
bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
smooth_length = 75
Expand All @@ -17,8 +33,13 @@ params {
}
}

// Write the workflow DAG rendering to assets/dag.png, overwriting any existing file.
dag {
enabled = true
overwrite = true
file = "assets/dag.png"
}
report {
enabled = true
overwrite = true
file = "assets/test_report.html"
file = "${params.outdir}/pipeline_info/execution_report.html"
}
Loading

0 comments on commit 8a2b703

Please sign in to comment.