diff --git a/.gitattributes b/.gitattributes
index 96d42102..0b9f4afd 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -4,3 +4,4 @@ modules/nf-core/** linguist-generated
subworkflows/nf-core/** linguist-generated
*.config linguist-language=nextflow
assets/** linguist-generated
+docs/**/*.html linguist-generated
diff --git a/.github/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE.md
similarity index 83%
rename from .github/pull_request_template.md
rename to .github/PULL_REQUEST_TEMPLATE.md
index b275088f..d33b3c9b 100644
--- a/.github/pull_request_template.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,11 +1,3 @@
----
-name: Pull Request
-about: Create a Pull Request
-title: ""
-labels: ""
-assignees: ""
----
-
## Changes
+## CHAMPAGNE 0.1.0
+
+### Quality control steps implemented for single-end reads
+
+- Trim raw reads, FastQC on raw and trimmed reads, and FastQ Screen on trimmed reads.
+- Exclude reads that align to blacklist regions, align remaining reads to the reference genome, and deduplicate.
+- Preseq on aligned reads.
+- Phantompeakqualtools on aligned and deduplicated reads.
+- Process reads with deepTools: bam coverage to generate bigwigs for each sample, summarize all bigwigs, and compute matrices relative to TSSs and scaled to metagene regions.
+- Generate plots with deepTools: PCA, profile, heatmap, Spearman correlation, and fingerprint plots.
+- Summarize all quality control steps in a MultiQC report.
+- Input-normalize ChIP fragments for the next stage of the pipeline.
diff --git a/assets/dag.png b/assets/dag.png
index b4b4adf8..f9639645 100644
Binary files a/assets/dag.png and b/assets/dag.png differ
diff --git a/assets/samplesheet_mm10.csv b/assets/samplesheet_mm10.csv
new file mode 100644
index 00000000..72418af1
--- /dev/null
+++ b/assets/samplesheet_mm10.csv
@@ -0,0 +1,9 @@
+sample,fastq_1,fastq_2,antibody,control
+CTCF_ChIP_macrophage_p20_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20
+CTCF_ChIP_macrophage_p20_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081749_1.fastq.gz,,CTCF,WCE_p20
+CTCF_ChIP_macrophage_p3_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081750_1.fastq.gz,,CTCF,WCE_p3
+CTCF_ChIP_macrophage_p3_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081751_1.fastq.gz,,CTCF,WCE_p3
+CTCF_ChIP_MEF_p20_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20
+CTCF_ChIP_MEF_p20_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081753_1.fastq.gz,,CTCF,WCE_p20
+WCE_p3,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081772_1.fastq.gz,,,
+WCE_p20,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,,
diff --git a/conf/base.config b/conf/base.config
new file mode 100644
index 00000000..e0efddd9
--- /dev/null
+++ b/conf/base.config
@@ -0,0 +1,66 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ccbr/sandbox Nextflow base config file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ A 'blank slate' config file, appropriate for general use on most high performance
+ compute environments. Assumes that all software is installed and available on
+ the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+ // TODO nf-core: Check the defaults for all processes
+ cpus = { check_max( 1 * task.attempt, 'cpus' ) }
+ memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+ time = { check_max( 4.h * task.attempt, 'time' ) }
+
+ errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+ maxRetries = 1
+ maxErrors = '-1'
+
+ // Process-specific resource requirements
+ // NOTE - Please try and re-use the labels below as much as possible.
+ // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
+ // If possible, it would be nice to keep the same label naming convention when
+ // adding in your local modules too.
+ // TODO nf-core: Customise requirements for specific processes.
+ // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
+ withLabel:process_single {
+ cpus = { check_max( 1 , 'cpus' ) }
+ memory = { check_max( 6.GB * task.attempt, 'memory' ) }
+ time = { check_max( 4.h * task.attempt, 'time' ) }
+ }
+ withLabel:process_low {
+ cpus = { check_max( 2 * task.attempt, 'cpus' ) }
+ memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+ time = { check_max( 4.h * task.attempt, 'time' ) }
+ }
+ withLabel:process_medium {
+ cpus = { check_max( 6 * task.attempt, 'cpus' ) }
+ memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+ time = { check_max( 8.h * task.attempt, 'time' ) }
+ }
+ withLabel:process_high {
+ cpus = { check_max( 12 * task.attempt, 'cpus' ) }
+ memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+ time = { check_max( 16.h * task.attempt, 'time' ) }
+ }
+ withLabel:process_long {
+ time = { check_max( 20.h * task.attempt, 'time' ) }
+ }
+ withLabel:process_high_memory {
+ memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+ }
+ withLabel:error_ignore {
+ errorStrategy = 'ignore'
+ }
+ withLabel:error_retry {
+ errorStrategy = 'retry'
+ maxRetries = 2
+ }
+ /*
+ withName:CUSTOM_DUMPSOFTWAREVERSIONS {
+ cache = false
+ }*/
+}
diff --git a/conf/biowulf.config b/conf/biowulf.config
index 769288e0..f3faf793 100644
--- a/conf/biowulf.config
+++ b/conf/biowulf.config
@@ -1,4 +1,51 @@
+
params {
- // config that only works on biowulf, but regardless of on slurm or local
- // e.g. genome file paths
+ config_profile_description = 'Biowulf nf-core config'
+ config_profile_contact = 'staff@hpc.nih.gov'
+ config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html'
+ max_memory = '224 GB'
+ max_cpus = 32
+ max_time = '72 h'
+
+ igenomes_base = '/fdb/igenomes/'
+}
+
+executor {
+
+ $slurm {
+ queue = 'norm'
+ queueSize = 200
+ pollInterval = '2 min'
+ queueStatInterval = '5 min'
+ submitRateLimit = '6/1min'
+ retry.maxAttempts = 1
+ }
+}
+
+singularity {
+ enabled = true
+ autoMounts = true
+ cacheDir = "/data/$USER/.singularity"
+ envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
+}
+
+env {
+ SINGULARITY_CACHEDIR="/data/$USER/.singularity"
+ PYTHONNOUSERSITE = 1
+}
+
+
+process {
+ executor = 'slurm'
+ maxRetries = 1
+
+ clusterOptions = ' --gres=lscratch:200 '
+
+ scratch = '/lscratch/$SLURM_JOBID'
+
+ stageInMode = 'symlink'
+ stageOutMode = 'rsync'
+
+ // when running the pipeline in a group-shared data directory, lenient caching avoids needless re-runs caused by inconsistent file timestamps
+ cache = 'lenient'
}
diff --git a/conf/biowulf_slurm.config b/conf/biowulf_slurm.config
deleted file mode 100644
index a4f967b6..00000000
--- a/conf/biowulf_slurm.config
+++ /dev/null
@@ -1,51 +0,0 @@
-
-params {
- config_profile_description = 'Biowulf nf-core config'
- config_profile_contact = 'staff@hpc.nih.gov'
- config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html'
- max_memory = '224 GB'
- max_cpus = 32
- max_time = '72 h'
-
- igenomes_base = '/fdb/igenomes/'
-}
-
-executor {
-
- $slurm {
- queue = 'norm'
- queueSize = 200
- pollInterval = '2 min'
- queueStatInterval = '5 min'
- submitRateLimit = '6/1min'
- retry.maxAttempts = 1
- }
-}
-
-singularity {
- enabled = true
- autoMounts = true
- cacheDir = "/data/$USER/singularity"
- envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
-}
-
-env {
- SINGULARITY_CACHEDIR="/data/$USER/singularity"
- PYTHONNOUSERSITE = 1
-}
-
-
-process {
- executor = 'slurm'
- maxRetries = 1
-
- clusterOptions = ' --gres=lscratch:200 '
-
- scratch = '/lscratch/$SLURM_JOBID'
-
- stageInMode = 'symlink'
- stageOutMode = 'rsync'
-
- // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps
- cache = 'lenient'
-}
diff --git a/conf/modules.conf b/conf/modules.config
similarity index 85%
rename from conf/modules.conf
rename to conf/modules.config
index bbbbde37..2c2cb335 100644
--- a/conf/modules.conf
+++ b/conf/modules.config
@@ -21,32 +21,39 @@ process {
}
withName: 'TRIM.*' {
container = 'nciccbr/ncigb_cutadapt_v1.18:latest'
- cpus = 4
+ cpus = 16
+ memory = '32 G'
}
withName: 'FASTQC' {
container = 'nciccbr/ccrgb_qctools:latest'
- cpus = 4
+ cpus = 32
+ memory = '24 G'
}
withName: 'FASTQ_SCREEN' {
container = 'nciccbr/ccbr_fastq_screen_0.14.1'
containerOptions = "--bind ${params.fastq_screen.db_dir}"
- cpus = 8
+ cpus = 24
+ memory = '24 G'
}
withName: 'ALIGN.*|INDEX_BAM' {
container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
- cpus = 8
+ cpus = 32
+ memory = '48 G'
}
withName: 'PRESEQ' {
container = 'nciccbr/ccbr_preseq_v2.0:v1'
cpus = 16
+ memory = '24 G'
}
withName: 'PHANTOM_PEAKS' {
container = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1'
cpus = 1
+ memory = '24 G'
}
withName: 'DEDUPLICATE' {
container = 'nciccbr/ccbr_macs2_2.2.9.1:v1'
cpus = 4
+ memory = '24 G'
}
withName: 'QC_.*|PPQT_.*' {
container = 'nciccbr/ccbr_ubuntu_base_20.04:latest'
@@ -54,14 +61,19 @@ process {
withName: 'NGSQC_GEN' {
container = 'nciccbr/ccbr_ngsqc_0.31:v1'
cpus = 1
+ memory = '120 G'
}
withLabel: 'deeptools' {
container = 'nciccbr/ccbr_deeptools_3.5.3:v1'
+ cpus = 16
+ memory = '24 G'
}
withName: 'BAM_COVERAGE|PLOT_FINGERPRINT|COMPUTE_.*|NORMALIZE_INPUT' {
cpus = 32
+ memory = '24 G'
}
withName: 'MULTIQC' {
container = 'nciccbr/ccbr_multiqc_1.15:v1'
+ memory = '24 G'
}
}
diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml
index 7b202d27..ca624294 100755
--- a/conf/multiqc_config.yaml
+++ b/conf/multiqc_config.yaml
@@ -43,7 +43,31 @@ custom_data:
id: "QC_Table"
section_name: "ChIP-specific QC metrics"
section_href: https://www.encodeproject.org/chip-seq/
- description: "Encode3 standards: Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment. Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M. Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M. NRF: Number of distinct mapping reads after removing duplicates/total number of reads. PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely. PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely Concerning(severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9
PPQT standards: https://github.com/crazyhottommy/phantompeakqualtools NSC: cross-correlation value/minimum cross-correlation RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation) Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical). Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP, low read sequence quality, shallow sequencing depth, or few peaks. QualityTag: -2: very low, 2: very high"
+ description: |
+
+ Usable fragments (A fragment corresponds to a read mapping to one location in the genome.)
+ If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment.
+ Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M.
+ Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M.
+
+
NRF: Number of distinct mapping reads after removing duplicates/total number of reads.
+
PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.
+
PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely
NSC: cross-correlation value/minimum cross-correlation
+ Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).
+
RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation)
+ Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP,
+ low read sequence quality, shallow sequencing depth, or few peaks. QualityTag: -2: very low, 2: very high
+
plot_type: "table"
pconfig:
id: "QC_Table"
diff --git a/conf/slurmint.config b/conf/slurmint.config
new file mode 100644
index 00000000..685797e4
--- /dev/null
+++ b/conf/slurmint.config
@@ -0,0 +1,15 @@
+params {
+ config_profile_name = 'Slurm interactive node'
+ max_memory = '224 GB'
+ max_cpus = 32
+ max_time = '72 h'
+}
+process {
+ scratch = '/lscratch/$SLURM_JOBID'
+}
+singularity {
+ enabled = true
+ autoMounts = true
+ cacheDir = "/data/$USER/.singularity"
+ envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH'
+}
diff --git a/conf/test.config b/conf/test.config
index 5491a2d0..02b2b925 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -4,11 +4,27 @@ params {
outdir = 'results/test'
input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv
+ genome = 'hg38'
max_cpus = 32 // for interactive node on biowulf
max_memory = '120 GB' // for interactive node on biowulf
publish_dir_mode = "symlink"
+
+ fastq_screen {
+ conf = "${baseDir}/conf/fastq_screen.conf"
+ db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/'
+ }
+ align {
+ index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/"
+ blacklist = "${params.genome}.blacklist"
+ blacklist_files = "${params.align.index_dir}${params.align.blacklist}*"
+ reference_files = "${params.align.index_dir}${params.genome}*"
+ min_quality = 6 // to get a min quality of 5, set this to 6
+ effective_genome_size = 2700000000 // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349
+ chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359
+ }
+ gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed"
deeptools {
bin_size = 10000 // this value is only to make bamCoverage run faster. use smaller value for real data.
smooth_length = 75
@@ -17,8 +33,13 @@ params {
}
}
+dag {
+ enabled = true
+ overwrite = true
+ file = "assets/dag.png"
+}
report {
enabled = true
overwrite = true
- file = "assets/test_report.html"
+ file = "${params.outdir}/pipeline_info/execution_report.html"
}
diff --git a/conf/test_mm10.config b/conf/test_mm10.config
new file mode 100644
index 00000000..8f345b11
--- /dev/null
+++ b/conf/test_mm10.config
@@ -0,0 +1,20 @@
+params {
+ config_profile_name = 'Test single-end mouse dataset'
+ config_profile_description = 'Minimal mouse dataset to check pipeline function'
+
+ genome = 'mm10' // TODO this doesn't replace genome value elsewhere in the configfile
+ outdir = "results/${params.genome}"
+ input = "assets/samplesheet_${params.genome}.csv"
+
+
+ align {
+ index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/"
+ blacklist = "${params.genome}.blacklist"
+ blacklist_files = "${params.align.index_dir}${params.align.blacklist}*"
+ reference_files = "${params.align.index_dir}${params.genome}*"
+ min_quality = 6 // to get a min quality of 5, set this to 6
+ effective_genome_size = 2652783500 // GRCm38/mm10 value from the deepTools effective genome size table; the previous 2700000000 was the hg38 value copied from Pipeliner's hg38.json
+ chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359
+ }
+ gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed"
+}
diff --git a/conf/test_paired.config b/conf/test_paired.config
index a08b3a3e..8c4c2dee 100644
--- a/conf/test_paired.config
+++ b/conf/test_paired.config
@@ -1,6 +1,7 @@
params {
config_profile_name = 'Test paired-end'
config_profile_description = 'Minimal paired-end test dataset to check pipeline function'
- outdir = 'results/test'
+
+ outdir = 'results/test_paired'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv'
}
diff --git a/assets/test_report.html b/docs/user-guide/examples/mm10_report.html
similarity index 93%
rename from assets/test_report.html
rename to docs/user-guide/examples/mm10_report.html
index 483d381e..97095649 100644
--- a/assets/test_report.html
+++ b/docs/user-guide/examples/mm10_report.html
@@ -18,11 +18,11 @@
-
+
- [distracted_kalam] Nextflow Workflow Report
+ [silly_wescoff] Nextflow Workflow Report
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ MultiQC: Summarize analysis results for multiple tools and samples in a single report
+ Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller
+ Bioinformatics (2016)
+ doi: 10.1093/bioinformatics/btw354
+ PMID: 27312411
+
+Usable fragments (A fragment corresponds to a read mapping to one location in the genome.)
+If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment.
+Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M.
+Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M.
+
+
NRF: Number of distinct mapping reads after removing duplicates/total number of reads.
+
PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.
+
PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely
NSC: cross-correlation value/minimum cross-correlation
+ Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).
+
RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation)
+ Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP,
+ low read sequence quality, shallow sequencing depth, or few peaks. QualityTag: -2: very low, 2: very high
Uncheck the tick box to hide columns. Click and drag the handle on the left to change order. Table ID: QC_Table-1
+
+
+
+
+
+
+
+
Sort
+
Visible
+
Group
+
Column
+
Description
+
ID
+
Scale
+
+
+
+
+
+
||
+
+
+
+
QC Table
+
NReads
+
The number of reads sequenced
+
NReads
+
None
+
+
+
||
+
+
+
+
QC Table
+
NMappedReads
+
The number of reads mapped
+
NMappedReads
+
None
+
+
+
||
+
+
+
+
QC Table
+
NUniqMappedReads
+
The number of reads remaining after deduplication
+
NUniqMappedReads
+
None
+
+
+
||
+
+
+
+
QC Table
+
NRF
+
Non-Redundant fraction
+
NRF
+
None
+
+
+
||
+
+
+
+
QC Table
+
PBC1
+
PCR Bottlenecking Coefficient 1
+
PBC1
+
None
+
+
+
||
+
+
+
+
QC Table
+
PBC2
+
PCR Bottlenecking Coefficient 2
+
PBC2
+
None
+
+
+
||
+
+
+
+
QC Table
+
FragmentLength
+
Predicted fragment length by cross-correlation
+
FragmentLength
+
None
+
+
+
||
+
+
+
+
QC Table
+
NSC
+
Normalized strand cross-correlation coefficient
+
NSC
+
None
+
+
+
||
+
+
+
+
QC Table
+
RSC
+
Relative strand cross-correlation coefficient
+
RSC
+
None
+
+
+
||
+
+
+
+
QC Table
+
Qtag
+
Quality tag based on thresholded RSC
+
Qtag
+
None
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
FastQC
+
FastQC is a quality control tool for high throughput sequence data, written by Simon Andrews at the Babraham Institute in Cambridge.
+
+
+
+
+
+
+
+ Sequence Counts
+
+
+
+
+
+
Sequence counts for each sample. Duplicate read counts are an estimate only.
+
+
+
+
This plot show the total number of reads, broken down into unique and duplicate
+if possible (only more recent versions of FastQC give duplicate info).
+
You can read more about duplicate calculation in the
+FastQC documentation.
+A small part has been copied here for convenience:
+
Only sequences which first appear in the first 100,000 sequences
+in each file are analysed. This should be enough to get a good impression
+for the duplication levels in the whole file. Each sequence is tracked to
+the end of the file to give a representative count of the overall duplication level.
+
The duplication detection requires an exact sequence match over the whole length of
+the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.
+
+
+
+
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Sequence Quality Histograms
+
+
+
+
+
+
The mean quality value across each base position in the read.
+
+
+
+
To enable multiple samples to be plotted on the same graph, only the mean quality
+scores are plotted (unlike the box plots seen in FastQC reports).
The y-axis on the graph shows the quality scores. The higher the score, the better
+the base call. The background of the graph divides the y axis into very good quality
+calls (green), calls of reasonable quality (orange), and calls of poor quality (red).
+The quality of calls on most platforms will degrade as the run progresses, so it is
+common to see base calls falling into the orange area towards the end of a read.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Per Sequence Quality Scores
+
+
+
+
+
+
The number of reads with average quality scores. Shows if a subset of reads has poor quality.
The per sequence quality score report allows you to see if a subset of your
+sequences have universally low quality values. It is often the case that a
+subset of sequences will have universally poor quality, however these should
+represent only a small percentage of the total sequences.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Per Base Sequence Content
+
+
+
+
+
+
The proportion of each base position for which each of the four normal DNA bases has been called.
+
+
+
+
To enable multiple samples to be shown in a single plot, the base composition data
+is shown as a heatmap. The colours represent the balance between the four bases:
+an even distribution should give an even muddy brown colour. Hover over the plot
+to see the percentage of the four bases under the cursor.
+
To see the data as a line plot, as in the original FastQC graph, click on a sample track.
Per Base Sequence Content plots out the proportion of each base position in a
+file for which each of the four normal DNA bases has been called.
+
In a random library you would expect that there would be little to no difference
+between the different bases of a sequence run, so the lines in this plot should
+run parallel with each other. The relative amount of each base should reflect
+the overall amount of these bases in your genome, but in any case they should
+not be hugely imbalanced from each other.
+
It's worth noting that some types of library will always produce biased sequence
+composition, normally at the start of the read. Libraries produced by priming
+using random hexamers (including nearly all RNA-Seq libraries) and those which
+were fragmented using transposases inherit an intrinsic bias in the positions
+at which reads start. This bias does not concern an absolute sequence, but instead
+provides enrichement of a number of different K-mers at the 5' end of the reads.
+Whilst this is a true technical bias, it isn't something which can be corrected
+by trimming and in most cases doesn't seem to adversely affect the downstream
+analysis.
+
+
+
+
+
+
+ Click a sample row to see a line plot for that dataset.
+
+
Rollover for sample name
+
+
+ Position: -
+
%T: -
+
%C: -
+
%A: -
+
%G: -
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Per Sequence GC Content
+
+
+
+
+
+
The average GC content of reads. Normal random library typically have a
+ roughly normal distribution of GC content.
This module measures the GC content across the whole length of each sequence
+in a file and compares it to a modelled normal distribution of GC content.
+
In a normal random library you would expect to see a roughly normal distribution
+of GC content where the central peak corresponds to the overall GC content of
+the underlying genome. Since we don't know the the GC content of the genome the
+modal GC content is calculated from the observed data and used to build a
+reference distribution.
+
An unusually shaped distribution could indicate a contaminated library or
+some other kinds of biased subset. A normal distribution which is shifted
+indicates some systematic bias which is independent of base position. If there
+is a systematic bias which creates a shifted normal distribution then this won't
+be flagged as an error by the module since it doesn't know what your genome's
+GC content should be.
+
+
+
+
+
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Per Base N Content
+
+
+
+
+
+
The percentage of base calls at each position for which an N was called.
If a sequencer is unable to make a base call with sufficient confidence then it will
+normally substitute an N rather than a conventional base call. This graph shows the
+percentage of base calls at each position for which an N was called.
+
It's not unusual to see a very low proportion of Ns appearing in a sequence, especially
+nearer the end of a sequence. However, if this proportion rises above a few percent
+it suggests that the analysis pipeline was unable to interpret the data well enough to
+make valid base calls.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Sequence Length Distribution
+
+
+
+
The distribution of fragment sizes (read lengths) found.
+ See the FastQC help
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Sequence Duplication Levels
+
+
+
+
+
+
The relative level of duplication found for every sequence.
In a diverse library most sequences will occur only once in the final set.
+A low level of duplication may indicate a very high level of coverage of the
+target sequence, but a high level of duplication is more likely to indicate
+some kind of enrichment bias (eg PCR over amplification). This graph shows
+the degree of duplication for every sequence in a library: the relative
+number of sequences with different degrees of duplication.
+
Only sequences which first appear in the first 100,000 sequences
+in each file are analysed. This should be enough to get a good impression
+for the duplication levels in the whole file. Each sequence is tracked to
+the end of the file to give a representative count of the overall duplication level.
+
The duplication detection requires an exact sequence match over the whole length of
+the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.
+
In a properly diverse library most sequences should fall into the far left of the
+plot in both the red and blue lines. A general level of enrichment, indicating broad
+oversequencing in the library will tend to flatten the lines, lowering the low end
+and generally raising other categories. More specific enrichments of subsets, or
+the presence of low complexity contaminants will tend to produce spikes towards the
+right of the plot.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Overrepresented sequences
+
+
+
+
+
+
The total amount of overrepresented sequences found in each library.
+
+
+
+
FastQC calculates and lists overrepresented sequences in FastQ files. It would not be
+possible to show this for all samples in a MultiQC report, so instead this plot shows
+the number of sequences categorized as over represented.
+
Sometimes, a single sequence may account for a large number of reads in a dataset.
+To show this, the bars are split into two: the first shows the overrepresented reads
+that come from the single most common sequence. The second shows the total count
+from all remaining overrepresented sequences.
A normal high-throughput library will contain a diverse set of sequences, with no
+individual sequence making up a tiny fraction of the whole. Finding that a single
+sequence is very overrepresented in the set either means that it is highly biologically
+significant, or indicates that the library is contaminated, or not as diverse as you expected.
+
FastQC lists all of the sequences which make up more than 0.1% of the total.
+To conserve memory only sequences which appear in the first 100,000 sequences are tracked
+to the end of the file. It is therefore possible that a sequence which is overrepresented
+but doesn't appear at the start of the file for some reason could be missed by this module.
+
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Adapter Content
+
+
+
+
+
+
The cumulative percentage count of the proportion of your
+ library which has seen each of the adapter sequences at each position.
+
+
+
+
Note that only samples with ≥ 0.1% adapter contamination are shown.
+
There may be several lines per sample, as one is shown for each adapter
+detected in the file.
The plot shows a cumulative percentage count of the proportion
+of your library which has seen each of the adapter sequences at each position.
+Once a sequence has been seen in a read it is counted as being present
+right through to the end of the read so the percentages you see will only
+increase as the read length goes on.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Status Checks
+
+
+
+
+
+
Status for each FastQC section showing whether results seem entirely normal (green),
+slightly abnormal (orange) or very unusual (red).
+
+
+
+
FastQC assigns a status for each section of the report.
+These give a quick evaluation of whether the results of the analysis seem
+entirely normal (green), slightly abnormal (orange) or very unusual (red).
+
It is important to stress that although the analysis results appear to give a pass/fail result,
+these evaluations must be taken in the context of what you expect from your library.
+A 'normal' sample as far as FastQC is concerned is random and diverse.
+Some experiments may be expected to produce libraries which are biased in particular ways.
+You should treat the summary evaluations therefore as pointers to where you should concentrate
+your attention and understand why your library may not look random and diverse.
+
Specific guidance on how to interpret the output of each module can be found in the relevant
+report section, or in the FastQC help.
+
In this heatmap, we summarise all of these into a single heatmap for a quick overview.
+Note that not all FastQC sections have plots in MultiQC reports, but all status checks
+are shown in this heatmap.
This module parses the output from samtools flagstat. All numbers in millions.
+
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ XY counts
+
+
+
+
+
+
+
+
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Mapped reads per contig
+
+
+
+
The samtools idxstats tool counts the number of mapped reads per chromosome / contig. Chromosomes with < 0.1% of the total aligned reads are omitted from this plot.
Pairwise correlations of samples based on distribution of sequence reads
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ PCA plot
+
+
+
+
PCA plot with the top two principal components calculated based on genome-wide distribution of sequence reads
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Fingerprint plot
+
+
+
+
Signal fingerprint according to plotFingerprint
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Fingerprint quality metrics
+
+
+
+
Various quality metrics returned by plotFingerprint
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Read Distribution Profile after Annotation
+
+
+
+
Accumulated view of the distribution of sequence reads related to the closest annotated gene.
+All annotated genes have been normalized to the same size.
+
+
+
loading..
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Plot Table Data
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Regex Help
+
+
+
Toolbox search strings can behave as regular expressions (regexes). Click a button below to see an example of it in action. Try modifying them yourself in the text box.