diff --git a/.gitattributes b/.gitattributes index 96d42102..0b9f4afd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ modules/nf-core/** linguist-generated subworkflows/nf-core/** linguist-generated *.config linguist-language=nextflow assets/** linguist-generated +docs/**.html linguist-generated diff --git a/.github/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE.md similarity index 83% rename from .github/pull_request_template.md rename to .github/PULL_REQUEST_TEMPLATE.md index b275088f..d33b3c9b 100644 --- a/.github/pull_request_template.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,11 +1,3 @@ ---- -name: Pull Request -about: Create a Pull Request -title: "" -labels: "" -assignees: "" ---- - ## Changes +## CHAMPAGNE 0.1.0 + +### Quality control steps implemented for single-end reads + +- Trim raw reads, FastQC on raw and trimmed reads, and FastQ Screen on trimmed reads. +- Exclude reads that align to blacklist regions, align remaining reads to the reference genome, and deduplicate. +- Preseq on aligned reads. +- Phantompeakqualtools on aligned and deduplicated reads. +- Process reads with deepTools: bam coverage to generate bigwigs for each sample, summarize all bigwigs, and compute matrices relative to TSSs and scaled to metagene regions. +- Generate plots with deepTools: PCA, profile, heatmap, spearman correlation, and fingerprint plots. +- Summarize all quality control steps in a MultiQC report. +- Input-normalize ChIP fragments for the next stage of the pipeline. 
diff --git a/assets/dag.png b/assets/dag.png index b4b4adf8..f9639645 100644 Binary files a/assets/dag.png and b/assets/dag.png differ diff --git a/assets/samplesheet_mm10.csv b/assets/samplesheet_mm10.csv new file mode 100644 index 00000000..72418af1 --- /dev/null +++ b/assets/samplesheet_mm10.csv @@ -0,0 +1,9 @@ +sample,fastq_1,fastq_2,antibody,control +CTCF_ChIP_macrophage_p20_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081748_1.fastq.gz,,CTCF,WCE_p20 +CTCF_ChIP_macrophage_p20_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081749_1.fastq.gz,,CTCF,WCE_p20 +CTCF_ChIP_macrophage_p3_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081750_1.fastq.gz,,CTCF,WCE_p3 +CTCF_ChIP_macrophage_p3_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081751_1.fastq.gz,,CTCF,WCE_p3 +CTCF_ChIP_MEF_p20_1,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081752_1.fastq.gz,,CTCF,WCE_p20 +CTCF_ChIP_MEF_p20_2,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081753_1.fastq.gz,,CTCF,WCE_p20 +WCE_p3,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081772_1.fastq.gz,,, +WCE_p20,/data/CCBR_Pipeliner/testdata/chipseq/SRR3081773_1.fastq.gz,,, diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 00000000..e0efddd9 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,66 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ccbr/sandbox Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. 
+---------------------------------------------------------------------------------------- +*/ + +process { + + // TODO nf-core: Check the defaults for all processes + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + // Process-specific resource requirements + // NOTE - Please try and re-use the labels below as much as possible. + // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. + // If possible, it would be nice to keep the same label naming convention when + // adding in your local modules too. + // TODO nf-core: Customise requirements for specific processes. + // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 
2 + } + /* + withName:CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false + }*/ +} diff --git a/conf/biowulf.config b/conf/biowulf.config index 769288e0..f3faf793 100644 --- a/conf/biowulf.config +++ b/conf/biowulf.config @@ -1,4 +1,51 @@ + params { - // config that only works on biowulf, but regardless of on slurm or local - // e.g. genome file paths + config_profile_description = 'Biowulf nf-core config' + config_profile_contact = 'staff@hpc.nih.gov' + config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html' + max_memory = '224 GB' + max_cpus = 32 + max_time = '72 h' + + igenomes_base = '/fdb/igenomes/' +} + +executor { + + $slurm { + queue = 'norm' + queueSize = 200 + pollInterval = '2 min' + queueStatInterval = '5 min' + submitRateLimit = '6/1min' + retry.maxAttempts = 1 + } +} + +singularity { + enabled = true + autoMounts = true + cacheDir = "/data/$USER/.singularity" + envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' +} + +env { + SINGULARITY_CACHEDIR="/data/$USER/.singularity" + PYTHONNOUSERSITE = 1 +} + + +process { + executor = 'slurm' + maxRetries = 1 + + clusterOptions = ' --gres=lscratch:200 ' + + scratch = '/lscratch/$SLURM_JOBID' + + stageInMode = 'symlink' + stageOutMode = 'rsync' + + // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps + cache = 'lenient' } diff --git a/conf/biowulf_slurm.config b/conf/biowulf_slurm.config deleted file mode 100644 index a4f967b6..00000000 --- a/conf/biowulf_slurm.config +++ /dev/null @@ -1,51 +0,0 @@ - -params { - config_profile_description = 'Biowulf nf-core config' - config_profile_contact = 'staff@hpc.nih.gov' - config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html' - max_memory = '224 GB' - max_cpus = 32 - max_time = '72 h' - - igenomes_base = '/fdb/igenomes/' -} - -executor { - - $slurm { - queue = 'norm' - queueSize = 200 - pollInterval = '2 min' - queueStatInterval = '5 min' - submitRateLimit = '6/1min' - 
retry.maxAttempts = 1 - } -} - -singularity { - enabled = true - autoMounts = true - cacheDir = "/data/$USER/singularity" - envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' -} - -env { - SINGULARITY_CACHEDIR="/data/$USER/singularity" - PYTHONNOUSERSITE = 1 -} - - -process { - executor = 'slurm' - maxRetries = 1 - - clusterOptions = ' --gres=lscratch:200 ' - - scratch = '/lscratch/$SLURM_JOBID' - - stageInMode = 'symlink' - stageOutMode = 'rsync' - - // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps - cache = 'lenient' -} diff --git a/conf/modules.conf b/conf/modules.config similarity index 85% rename from conf/modules.conf rename to conf/modules.config index bbbbde37..2c2cb335 100644 --- a/conf/modules.conf +++ b/conf/modules.config @@ -21,32 +21,39 @@ process { } withName: 'TRIM.*' { container = 'nciccbr/ncigb_cutadapt_v1.18:latest' - cpus = 4 + cpus = 16 + memory = '32 G' } withName: 'FASTQC' { container = 'nciccbr/ccrgb_qctools:latest' - cpus = 4 + cpus = 32 + memory = '24 G' } withName: 'FASTQ_SCREEN' { container = 'nciccbr/ccbr_fastq_screen_0.14.1' containerOptions = "--bind ${params.fastq_screen.db_dir}" - cpus = 8 + cpus = 24 + memory = '24 G' } withName: 'ALIGN.*|INDEX_BAM' { container = 'nciccbr/ccbr_ubuntu_base_20.04:latest' - cpus = 8 + cpus = 32 + memory = '48 G' } withName: 'PRESEQ' { container = 'nciccbr/ccbr_preseq_v2.0:v1' cpus = 16 + memory = '24 G' } withName: 'PHANTOM_PEAKS' { container = 'quay.io/biocontainers/phantompeakqualtools:1.2.2--hdfd78af_1' cpus = 1 + memory = '24 G' } withName: 'DEDUPLICATE' { container = 'nciccbr/ccbr_macs2_2.2.9.1:v1' cpus = 4 + memory = '24 G' } withName: 'QC_.*|PPQT_.*' { container = 'nciccbr/ccbr_ubuntu_base_20.04:latest' @@ -54,14 +61,19 @@ process { withName: 'NGSQC_GEN' { container = 'nciccbr/ccbr_ngsqc_0.31:v1' cpus = 1 + memory = '120 G' } withLabel: 'deeptools' { container = 'nciccbr/ccbr_deeptools_3.5.3:v1' + cpus = 
16 + memory = '24 G' } withName: 'BAM_COVERAGE|PLOT_FINGERPRINT|COMPUTE_.*|NORMALIZE_INPUT' { cpus = 32 + memory = '24 G' } withName: 'MULTIQC' { container = 'nciccbr/ccbr_multiqc_1.15:v1' + memory = '24 G' } } diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml index 7b202d27..ca624294 100755 --- a/conf/multiqc_config.yaml +++ b/conf/multiqc_config.yaml @@ -43,7 +43,31 @@ custom_data: id: "QC_Table" section_name: "ChIP-specific QC metrics" section_href: https://www.encodeproject.org/chip-seq/ - description: "Encode3 standards:
Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment.
Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M.
Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M.
NRF: Number of distinct mapping reads after removing duplicates/total number of reads.
PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.
PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely
Concerning(severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9

PPQT standards: https://github.com/crazyhottommy/phantompeakqualtools
NSC: cross-correlation value/minimum cross-correlation
RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation)
Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).
Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP, low read sequence quality, shallow sequencing depth, or few peaks.
QualityTag: -2: very low, 2: very high" + description: | +

Encode3 standards

+ + Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) + If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment.
+ Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M.
+ Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M.
+ +
+

phantompeakqualtools standards

+ plot_type: "table" pconfig: id: "QC_Table" diff --git a/conf/slurmint.config b/conf/slurmint.config new file mode 100644 index 00000000..685797e4 --- /dev/null +++ b/conf/slurmint.config @@ -0,0 +1,15 @@ +params { + config_profile_name = 'Slurm interactive node' + max_memory = '224 GB' + max_cpus = 32 + max_time = '72 h' +} +process { + scratch = '/lscratch/$SLURM_JOBID' +} +singularity { + enabled = true + autoMounts = true + cacheDir = "/data/$USER/.singularity" + envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' +} diff --git a/conf/test.config b/conf/test.config index 5491a2d0..02b2b925 100644 --- a/conf/test.config +++ b/conf/test.config @@ -4,11 +4,27 @@ params { outdir = 'results/test' input = 'assets/samplesheet_test.csv' // adapted from https://github.com/nf-core/test-datasets/blob/chipseq/samplesheet/v2.0/samplesheet_test.csv + genome = 'hg38' max_cpus = 32 // for interactive node on biowulf max_memory = '120 GB' // for interactive node on biowulf publish_dir_mode = "symlink" + + fastq_screen { + conf = "${baseDir}/conf/fastq_screen.conf" + db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/' + } + align { + index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/" + blacklist = "${params.genome}.blacklist" + blacklist_files = "${params.align.index_dir}${params.align.blacklist}*" + reference_files = "${params.align.index_dir}${params.genome}*" + min_quality = 6 // to get a min quality of 5, set this to 6 + effective_genome_size = 2700000000 // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349 + chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359 + } + gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed" deeptools { bin_size = 10000 // this value is only to make bamCoverage 
run faster. use smaller value for real data. smooth_length = 75 @@ -17,8 +33,13 @@ params { } } +dag { + enabled = true + overwrite = true + file = "assets/dag.png" +} report { enabled = true overwrite = true - file = "assets/test_report.html" + file = "${params.outdir}/pipeline_info/execution_report.html" } diff --git a/conf/test_mm10.config b/conf/test_mm10.config new file mode 100644 index 00000000..8f345b11 --- /dev/null +++ b/conf/test_mm10.config @@ -0,0 +1,20 @@ +params { + config_profile_name = 'Test single-end mouse dataset' + config_profile_description = 'Minimal mouse dataset to check pipeline function' + + genome = 'mm10' // TODO this doesn't replace genome value elsewhere in the configfile + outdir = "results/${params.genome}" + input = "assets/samplesheet_${params.genome}.csv" + + + align { + index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/" + blacklist = "${params.genome}.blacklist" + blacklist_files = "${params.align.index_dir}${params.align.blacklist}*" + reference_files = "${params.align.index_dir}${params.genome}*" + min_quality = 6 // to get a min quality of 5, set this to 6 + effective_genome_size = 2700000000 // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349 + chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359 + } + gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed" +} diff --git a/conf/test_paired.config b/conf/test_paired.config index a08b3a3e..8c4c2dee 100644 --- a/conf/test_paired.config +++ b/conf/test_paired.config @@ -1,6 +1,7 @@ params { config_profile_name = 'Test paired-end' config_profile_description = 'Minimal paired-end test dataset to check pipeline function' - outdir = 'results/test' + + outdir = 'results/test_paired' input = 
'https://raw.githubusercontent.com/nf-core/test-datasets/chipseq/samplesheet/v2.0/samplesheet_test.csv' } diff --git a/assets/test_report.html b/docs/user-guide/examples/mm10_report.html similarity index 93% rename from assets/test_report.html rename to docs/user-guide/examples/mm10_report.html index 483d381e..97095649 100644 --- a/assets/test_report.html +++ b/docs/user-guide/examples/mm10_report.html @@ -18,11 +18,11 @@ - + - [distracted_kalam] Nextflow Workflow Report + [silly_wescoff] Nextflow Workflow Report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ + + + + + +

+ +

Loading report..

+ +
+ +
+
+ + + +
+ + + + +
+ + + + +
+

+ + Highlight Samples +

+ +
+ + + +
+

+ Regex mode off + + +

+
    +
    + + +
    +

    + + Rename Samples +

    + +
    + + + +
    +

    Click here for bulk input.

    +
    +

    Paste two columns of a tab-delimited table here (eg. from Excel).

    +

    First column should be the old name, second column the new name.

    +
    + + +
    +
    +

    + Regex mode off + + +

    +
      +
      + + +
      +

      + + Show / Hide Samples +

      + +
      +
      + +
      +
      + +
      +
      + + +
      +
      + +

      + Regex mode off + + +

      +
        +
        + + +
        +

        Export Plots

        +
        + +
        +
        +
        +
        +
        + + px +
        +
        +
        +
        + + px +
        +
        +
        +
        +
        + +
        +
        + +
        +
        +
        +
        + +
        +
        +
        + + X +
        +
        +
        +
        + +
        +

        Download the raw data used to create the plots in this report below:

        +
        +
        + +
        +
        + +
        +
        + +

        Note that additional data was saved in multiqc_data when this report was generated.

        + +
        +
        +
        + +
        +
        Choose Plots
        + + +
        + +
        + +

        If you use plots from MultiQC in a publication or presentation, please cite:

        +
        + MultiQC: Summarize analysis results for multiple tools and samples in a single report
        + Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller
        + Bioinformatics (2016)
        + doi: 10.1093/bioinformatics/btw354
        + PMID: 27312411 +
        +
        +
        + + +
        +

        Save Settings

        +

        You can save the toolbox settings for this report to the browser.

        +
        + + +
        +
        + +

        Load Settings

        +

        Choose a saved report profile from the dropdown box below:

        +
        +
        + +
        +
        + + + + +
        +
        +
        + + +
        +

        Tool Citations

        +

        Please remember to cite the tools that you use in your analysis.

        +

        To help with this, you can download publication details of the tools mentioned in this report:

        +

        +

        +
        + + +
        +

        About MultiQC

        +

        This report was generated using MultiQC, version 1.15

        +

        You can see a YouTube video describing how to use MultiQC reports here: + https://youtu.be/qPbIlO_KWN0

        +

        For more information about MultiQC, including other videos and + extensive documentation, please visit http://multiqc.info

        +

        You can report bugs, suggest improvements and find the source code for MultiQC on GitHub: + https://github.com/ewels/MultiQC

        +

        MultiQC is published in Bioinformatics:

        +
        + MultiQC: Summarize analysis results for multiple tools and samples in a single report
        + Philip Ewels, Måns Magnusson, Sverker Lundin and Max Käller
        + Bioinformatics (2016)
        + doi: 10.1093/bioinformatics/btw354
        + PMID: 27312411 +
        +
        + +
        + +
        + + +
        + + + +

        + + + + +

        + + + +

        + A modular tool to aggregate results from bioinformatics analyses across many samples into a single report. +

        + + + +
        This report has been generated by CCBR/CHAMPAGNE. For information about how to interpret these results, please see the documentation. +
        + + + + + + + + + +
        +

        Report + + generated on 2023-09-06, 17:28 EDT + + + based on data in: + + /lscratch/7526425/nxf.FBQXPcOVYD

        + + +
        + + + + + + + +
        + + + + + + + + +
        +

        General Statistics

        + + + + + + + + + + + Showing 32/32 rows and 7/12 columns. + +
        +
        + +
        Sample Name% Dups% GCM SeqsM Reads MappedFrag LengthNSCRSC
        CTCF_ChIP_MEF_p20_1.aligned.filtered.dedup.bam.flagstat
        32.9
        CTCF_ChIP_MEF_p20_1.spp.out
        135
        1.73
        2.84
        CTCF_ChIP_MEF_p20_1.trimmed.fastq.gz
        19.1%
        44%
        48.5
        CTCF_ChIP_MEF_p20_2.aligned.filtered.dedup.bam.flagstat
        13.6
        CTCF_ChIP_MEF_p20_2.spp.out
        110
        3.39
        2.45
        CTCF_ChIP_MEF_p20_2.trimmed.fastq.gz
        23.5%
        51%
        36.6
        CTCF_ChIP_macrophage_p20_1.aligned.filtered.dedup.bam.flagstat
        38.1
        CTCF_ChIP_macrophage_p20_1.spp.out
        130
        1.51
        2.66
        CTCF_ChIP_macrophage_p20_1.trimmed.fastq.gz
        19.3%
        45%
        59.8
        CTCF_ChIP_macrophage_p20_2.aligned.filtered.dedup.bam.flagstat
        10.6
        CTCF_ChIP_macrophage_p20_2.spp.out
        115
        2.08
        2.12
        CTCF_ChIP_macrophage_p20_2.trimmed.fastq.gz
        9.3%
        45%
        22.3
        CTCF_ChIP_macrophage_p3_1.aligned.filtered.dedup.bam.flagstat
        31.1
        CTCF_ChIP_macrophage_p3_1.spp.out
        135
        1.53
        2.65
        CTCF_ChIP_macrophage_p3_1.trimmed.fastq.gz
        17.2%
        43%
        50.2
        CTCF_ChIP_macrophage_p3_2.aligned.filtered.dedup.bam.flagstat
        16.0
        CTCF_ChIP_macrophage_p3_2.spp.out
        105
        1.93
        2.57
        CTCF_ChIP_macrophage_p3_2.trimmed.fastq.gz
        26.0%
        51%
        44.4
        SRR3081748_1.fastq.gz
        20.2%
        45%
        60.4
        SRR3081749_1.fastq.gz
        10.0%
        45%
        22.7
        SRR3081750_1.fastq.gz
        17.6%
        44%
        50.5
        SRR3081751_1.fastq.gz
        36.2%
        51%
        51.4
        SRR3081752_1.fastq.gz
        19.4%
        44%
        48.7
        SRR3081753_1.fastq.gz
        24.5%
        51%
        36.9
        SRR3081772_1.fastq.gz
        14.0%
        39%
        27.6
        SRR3081773_1.fastq.gz
        16.9%
        39%
        38.2
        WCE_p20.aligned.filtered.dedup.bam.flagstat
        27.6
        WCE_p20.spp.out
        0
        1.01
        1.00
        WCE_p20.trimmed.fastq.gz
        16.5%
        39%
        38.0
        WCE_p3.aligned.filtered.dedup.bam.flagstat
        20.9
        WCE_p3.spp.out
        145
        1.01
        1.10
        WCE_p3.trimmed.fastq.gz
        13.6%
        39%
        27.5
        + + +
        + + + + + + +
        + + +
        +

        ChIP-specific QC metrics

        +

        Encode3 standards

        + +Usable fragments (A fragment corresponds to a read mapping to one location in the genome.) +If single-end, one read is considered a fragment. If paired-end, one pair is considered a fragment.
        +Usable fragments for TF/narrow histones: concerning <10M, acceptable 10-20M, ideal >20M.
        +Usable fragments for broad histones: concerning <20M, acceptable 20-40M, ideal >40M.
        +
          +
        • NRF: Number of distinct mapping reads after removing duplicates/total number of reads.
        • +
        • PBC1: Number of genomic locations where exactly one read maps uniquely/number of distinct genomic locations to which one read maps uniquely.
        • +
        • PBC2: Number of genomic locations where only one read maps uniquely/number of genomic locations where two reads map uniquely
        • +
        • Concerning(severe bottlenecking): NRF < 0.5, Acceptable: 0.5 < NRF < 0.8, Ideal (no bottlenecking): NRF > 0.9
        • +
        +
        +

        phantompeakqualtools standards

        +
          +
        • NSC: cross-correlation value/minimum cross-correlation
          + Optimal NSC: > 1.1, Okay NSC: 1.05 - 1.1. Low NSC means low signal to noise or very few peaks (biological or technical).
        • +
        • RSC: (cross-correlation value - minimum cross-correlation) / (correlation at phantom peak - minimum cross-correlation)
          + Optimal RSC: > 1, Okay RSC: 0.8-1. Low RSC means low signal to noise caused by: poor quality ChIP, + low read sequence quality, shallow sequencing depth, or few peaks.
          QualityTag: -2: very low, 2: very high
        • +
        +.

        + + + + +
        + + + + +
        + + + + + + + + + Showing 8/8 rows and 8/10 columns. + +
        +
        + +
        SampleNameNUniqMappedReadsNRFPBC1PBC2FragmentLengthNSCRSCQtag
        CTCF_ChIP_MEF_p20_2
        31099296
        0.8
        0.9
        7.0
        200.0
        1.93
        2.6
        2.0
        CTCF_ChIP_MEF_p20_1
        15983324
        0.8
        0.8
        4.4
        200.0
        2.08
        2.1
        2.0
        CTCF_ChIP_macrophage_p20_2
        38105036
        0.9
        0.9
        12.9
        200.0
        1.53
        2.7
        2.0
        CTCF_ChIP_macrophage_p20_1
        32917124
        0.8
        0.9
        7.9
        200.0
        3.39
        2.4
        2.0
        CTCF_ChIP_macrophage_p3_1
        10554895
        0.9
        0.9
        8.2
        200.0
        1.73
        2.8
        2.0
        WCE_p20
        13607337
        0.9
        0.9
        18.0
        200.0
        1.51
        2.7
        2.0
        CTCF_ChIP_macrophage_p3_2
        27568678
        0.7
        0.8
        4.2
        200.0
        1.01
        1.0
        1.0
        WCE_p3
        20856684
        NA
        NA
        NA
        200.0
        1.01
        1.1
        1.0
        + +
        + + + +
        + + +
        + + +
        +
        + + + +
        + + +
        +

        FastQC

        +

        FastQC is a quality control tool for high throughput sequence data, written by Simon Andrews at the Babraham Institute in Cambridge.

        + + + + +
        + +

        + Sequence Counts + + + +

        + +

        Sequence counts for each sample. Duplicate read counts are an estimate only.

        + + +
        +

        This plot shows the total number of reads, broken down into unique and duplicate +if possible (only more recent versions of FastQC give duplicate info).

        +

        You can read more about duplicate calculation in the +FastQC documentation. +A small part has been copied here for convenience:

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +
        + +
        + + +
        +
        loading..
        +
        + + +
        +
        + + +
        + + + + + +
        + +

        + Sequence Quality Histograms + + + +

        + +

        The mean quality value across each base position in the read.

        + + +
        +

        To enable multiple samples to be plotted on the same graph, only the mean quality +scores are plotted (unlike the box plots seen in FastQC reports).

        +

        Taken from the FastQC help:

        +

        The y-axis on the graph shows the quality scores. The higher the score, the better +the base call. The background of the graph divides the y axis into very good quality +calls (green), calls of reasonable quality (orange), and calls of poor quality (red). +The quality of calls on most platforms will degrade as the run progresses, so it is +common to see base calls falling into the orange area towards the end of a read.

        +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Per Sequence Quality Scores + + + +

        + +

        The number of reads with average quality scores. Shows if a subset of reads has poor quality.

        + + +
        +

        From the FastQC help:

        +

        The per sequence quality score report allows you to see if a subset of your +sequences have universally low quality values. It is often the case that a +subset of sequences will have universally poor quality, however these should +represent only a small percentage of the total sequences.

        +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Per Base Sequence Content + + + +

        + +

        The proportion of each base position for which each of the four normal DNA bases has been called.

        + + +
        +

        To enable multiple samples to be shown in a single plot, the base composition data +is shown as a heatmap. The colours represent the balance between the four bases: +an even distribution should give an even muddy brown colour. Hover over the plot +to see the percentage of the four bases under the cursor.

        +

        To see the data as a line plot, as in the original FastQC graph, click on a sample track.

        +

        From the FastQC help:

        +

        Per Base Sequence Content plots out the proportion of each base position in a +file for which each of the four normal DNA bases has been called.

        +

        In a random library you would expect that there would be little to no difference +between the different bases of a sequence run, so the lines in this plot should +run parallel with each other. The relative amount of each base should reflect +the overall amount of these bases in your genome, but in any case they should +not be hugely imbalanced from each other.

        +

        It's worth noting that some types of library will always produce biased sequence +composition, normally at the start of the read. Libraries produced by priming +using random hexamers (including nearly all RNA-Seq libraries) and those which +were fragmented using transposases inherit an intrinsic bias in the positions +at which reads start. This bias does not concern an absolute sequence, but instead +provides enrichment of a number of different K-mers at the 5' end of the reads. +Whilst this is a true technical bias, it isn't something which can be corrected +by trimming and in most cases doesn't seem to adversely affect the downstream +analysis.

        +
        + +
        +
        +
        + + Click a sample row to see a line plot for that dataset. +
        +
        Rollover for sample name
        + +
        + Position: - +
        %T: -
        +
        %C: -
        +
        %A: -
        +
        %G: -
        +
        +
        +
        + +
        +
        +
        +
        + + + +
        +
        + + + + + + +
        + +

        + Per Sequence GC Content + + + +

        + +

        The average GC content of reads. Normal random libraries typically have a + roughly normal distribution of GC content.

        + + +
        +

        From the FastQC help:

        +

        This module measures the GC content across the whole length of each sequence +in a file and compares it to a modelled normal distribution of GC content.

        +

        In a normal random library you would expect to see a roughly normal distribution +of GC content where the central peak corresponds to the overall GC content of +the underlying genome. Since we don't know the GC content of the genome the +modal GC content is calculated from the observed data and used to build a +reference distribution.

        +

        An unusually shaped distribution could indicate a contaminated library or +some other kinds of biased subset. A normal distribution which is shifted +indicates some systematic bias which is independent of base position. If there +is a systematic bias which creates a shifted normal distribution then this won't +be flagged as an error by the module since it doesn't know what your genome's +GC content should be.

        +
        + +
        + + +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Per Base N Content + + + +

        + +

        The percentage of base calls at each position for which an N was called.

        + + +
        +

        From the FastQC help:

        +

        If a sequencer is unable to make a base call with sufficient confidence then it will +normally substitute an N rather than a conventional base call. This graph shows the +percentage of base calls at each position for which an N was called.

        +

        It's not unusual to see a very low proportion of Ns appearing in a sequence, especially +nearer the end of a sequence. However, if this proportion rises above a few percent +it suggests that the analysis pipeline was unable to interpret the data well enough to +make valid base calls.

        +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Sequence Length Distribution + +

        + +

        The distribution of fragment sizes (read lengths) found. + See the FastQC help

        + + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Sequence Duplication Levels + + + +

        + +

        The relative level of duplication found for every sequence.

        + + +
        +

        From the FastQC Help:

        +

        In a diverse library most sequences will occur only once in the final set. +A low level of duplication may indicate a very high level of coverage of the +target sequence, but a high level of duplication is more likely to indicate +some kind of enrichment bias (eg PCR over amplification). This graph shows +the degree of duplication for every sequence in a library: the relative +number of sequences with different degrees of duplication.

        +

        Only sequences which first appear in the first 100,000 sequences +in each file are analysed. This should be enough to get a good impression +for the duplication levels in the whole file. Each sequence is tracked to +the end of the file to give a representative count of the overall duplication level.

        +

        The duplication detection requires an exact sequence match over the whole length of +the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis.

        +

        In a properly diverse library most sequences should fall into the far left of the +plot in both the red and blue lines. A general level of enrichment, indicating broad +oversequencing in the library will tend to flatten the lines, lowering the low end +and generally raising other categories. More specific enrichments of subsets, or +the presence of low complexity contaminants will tend to produce spikes towards the +right of the plot.

        +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Overrepresented sequences + + + +

        + +

        The total amount of overrepresented sequences found in each library.

        + + +
        +

        FastQC calculates and lists overrepresented sequences in FastQ files. It would not be +possible to show this for all samples in a MultiQC report, so instead this plot shows +the number of sequences categorized as overrepresented.

        +

        Sometimes, a single sequence may account for a large number of reads in a dataset. +To show this, the bars are split into two: the first shows the overrepresented reads +that come from the single most common sequence. The second shows the total count +from all remaining overrepresented sequences.

        +

        From the FastQC Help:

        +

        A normal high-throughput library will contain a diverse set of sequences, with no +individual sequence making up more than a tiny fraction of the whole. Finding that a single +sequence is very overrepresented in the set either means that it is highly biologically +significant, or indicates that the library is contaminated, or not as diverse as you expected.

        +

        FastQC lists all of the sequences which make up more than 0.1% of the total. +To conserve memory only sequences which appear in the first 100,000 sequences are tracked +to the end of the file. It is therefore possible that a sequence which is overrepresented +but doesn't appear at the start of the file for some reason could be missed by this module.

        +
        + +
        +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Adapter Content + + + +

        + +

        The cumulative percentage count of the proportion of your + library which has seen each of the adapter sequences at each position.

        + + +
        +

        Note that only samples with ≥ 0.1% adapter contamination are shown.

        +

        There may be several lines per sample, as one is shown for each adapter +detected in the file.

        +

        From the FastQC Help:

        +

        The plot shows a cumulative percentage count of the proportion +of your library which has seen each of the adapter sequences at each position. +Once a sequence has been seen in a read it is counted as being present +right through to the end of the read so the percentages you see will only +increase as the read length goes on.

        +
        + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Status Checks + + + +

        + +

        Status for each FastQC section showing whether results seem entirely normal (green), +slightly abnormal (orange) or very unusual (red).

        + + +
        +

        FastQC assigns a status for each section of the report. +These give a quick evaluation of whether the results of the analysis seem +entirely normal (green), slightly abnormal (orange) or very unusual (red).

        +

        It is important to stress that although the analysis results appear to give a pass/fail result, +these evaluations must be taken in the context of what you expect from your library. +A 'normal' sample as far as FastQC is concerned is random and diverse. +Some experiments may be expected to produce libraries which are biased in particular ways. +You should treat the summary evaluations therefore as pointers to where you should concentrate +your attention and understand why your library may not look random and diverse.

        +

        Specific guidance on how to interpret the output of each module can be found in the relevant +report section, or in the FastQC help.

        +

        Here, we summarise all of these into a single heatmap for a quick overview. +Note that not all FastQC sections have plots in MultiQC reports, but all status checks +are shown in this heatmap.

        +
        + +
        +
        +
        + +
        +
        +
        + + + +
        +
        + + + +
        +
        +
        +
        + loading.. +
        +
        +
        +
        + + + +
        + + + +
        +
        + + + +
        + + +
        +

        Samtools

        +

        Samtools is a suite of programs for interacting with high-throughput sequencing data. DOI: 10.1093/bioinformatics/btp352.

        + + + + +
        + +

        + Samtools Flagstat + +

        + +

        This module parses the output from samtools flagstat. All numbers in millions.

        + + +
        +
        loading..
        +
        + + +
        +
        + + +
        + + + + + +
        + +

        + XY counts + +

        + + + + +
        + + +
        +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Mapped reads per contig + +

        + +

        The samtools idxstats tool counts the number of mapped reads per chromosome / contig. Chromosomes with < 0.1% of the total aligned reads are omitted from this plot.

        + + +
        + + +
           
        + + + +
        + +
        loading..
        +
        + + + +
        + + + +
        +
        + + + +
        + + +
        +

        deepTools

        +

        deepTools is a suite of tools to process and analyze deep sequencing data. DOI: 10.1093/nar/gkw257.

        + + + + +
        + +

        + Correlation heatmap + +

        + +

        Pairwise correlations of samples based on distribution of sequence reads

        + + +
        +
        +
        + +
        +
        +
        + + + +
        +
        + + + +
        +
        +
        +
        + loading.. +
        +
        +
        +
        + + +
        +
        + + +
        + + + + + +
        + +

        + PCA plot + +

        + +

        PCA plot with the top two principal components calculated based on genome-wide distribution of sequence reads

        + + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Fingerprint plot + +

        + +

        Signal fingerprint according to plotFingerprint

        + + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Fingerprint quality metrics + +

        + +

        Various quality metrics returned by plotFingerprint

        + + +
        loading..
        +
        + + +
        +
        + + + + + + +
        + +

        + Read Distribution Profile after Annotation + +

        + +

        Accumulated view of the distribution of sequence reads related to the closest annotated gene. +All annotated genes have been normalized to the same size.

        + + +
        loading..
        +
        + + + +
        + + + +
        +
        + + + + + +
        + + + + + + + + + + + + + + + + diff --git a/main.nf b/main.nf index a228c10c..32118702 100644 --- a/main.nf +++ b/main.nf @@ -27,6 +27,8 @@ include { FASTQC as FASTQC_TRIMMED } from "./modules/local/qc.nf" include { FASTQ_SCREEN } from "./modules/local/qc.nf" include { DEDUPLICATE } from "./modules/local/qc.nf" include { PRESEQ } from "./modules/local/qc.nf" +include { HANDLE_PRESEQ_ERROR } from "./modules/local/qc.nf" +include { PARSE_PRESEQ_LOG } from "./modules/local/qc.nf" include { PHANTOM_PEAKS } from "./modules/local/qc.nf" include { PPQT_PROCESS } from "./modules/local/qc.nf" include { NGSQC_GEN } from "./modules/local/qc.nf" @@ -64,25 +66,46 @@ workflow { trimmed_fastqs.combine(Channel.value("trimmed")) | FASTQC_TRIMMED trimmed_fastqs.combine(Channel.fromPath(params.fastq_screen.conf)) | FASTQ_SCREEN blacklist_files = Channel - .fromPath("${params.align.index_dir}${params.align.blacklist}*") + .fromPath(params.align.blacklist_files) .collect() ALIGN_BLACKLIST(trimmed_fastqs, blacklist_files) reference_files = Channel - .fromPath("${params.align.index_dir}${params.align.genome}*") + .fromPath(params.align.reference_files) .collect() - ALIGN_GENOME(ALIGN_BLACKLIST.out, reference_files) + ALIGN_GENOME(ALIGN_BLACKLIST.out.reads, reference_files) + PRESEQ(ALIGN_GENOME.out.bam) - chrom_sizes = Channel.fromPath("${params.align.index_dir}${params.align.chrom_sizes}") + // when preseq fails, write NAs for the stats that are calculated from its log + PRESEQ.out.log + .join(ALIGN_GENOME.out.bam, remainder: true) + .branch { meta, preseq_log, bam_tuple -> + failed: preseq_log == null + return (tuple(meta, "nopresqlog")) + succeeded: true + return (tuple(meta, preseq_log)) + }.set{ preseq_logs } + preseq_logs.failed | HANDLE_PRESEQ_ERROR + preseq_logs.succeeded | PARSE_PRESEQ_LOG + PARSE_PRESEQ_LOG.out.nrf + .concat(HANDLE_PRESEQ_ERROR.out.nrf) + .set{ preseq_nrf } + + + chrom_sizes = Channel.fromPath(params.align.chrom_sizes) + print params.align.chrom_sizes 
ALIGN_GENOME.out.bam.combine(chrom_sizes) | DEDUPLICATE DEDUPLICATE.out.bam | INDEX_BAM + + // NGSQC is seg faulting, see https://github.com/CCBR/CHAMPAGNE/issues/13 //DEDUPLICATE.out.tag_align.combine(chrom_sizes) | NGSQC_GEN + INDEX_BAM.out.bam | PHANTOM_PEAKS PPQT_PROCESS(PHANTOM_PEAKS.out.fraglen) QC_STATS( raw_fastqs, ALIGN_GENOME.out.flagstat, DEDUPLICATE.out.flagstat, - PRESEQ.out.nrf, + preseq_nrf, PHANTOM_PEAKS.out.spp, PPQT_PROCESS.out.fraglen ) @@ -94,7 +117,7 @@ workflow { BIGWIG_SUM.out.array.combine(Channel.from('heatmap', 'scatterplot')) | PLOT_CORRELATION BIGWIG_SUM.out.array | PLOT_PCA - // Create channels: [ meta, [ ip_bam, control_bam ] [ ip_bai, control_bai ] ] + // Create channel: [ meta, [ ip_bam, control_bam ] [ ip_bai, control_bai ] ] ch_genome_bam_bai = INDEX_BAM.out.bam ch_genome_bam_bai .combine(ch_genome_bam_bai) @@ -130,7 +153,6 @@ workflow { FASTQC_RAW.out.zip.collect(), FASTQC_TRIMMED.out.zip.collect(), FASTQ_SCREEN.out.screen.collect(), - //PRESEQ.out.files.collect(), //NGSQC_GEN DEDUPLICATE.out.flagstat.collect(), PHANTOM_PEAKS.out.spp.collect(), @@ -141,4 +163,5 @@ workflow { PLOT_PCA.out.tab.collect(), PLOT_PROFILE.out.tab.collect() ) + } diff --git a/modules/local/align.nf b/modules/local/align.nf index ca759e11..41e43a29 100644 --- a/modules/local/align.nf +++ b/modules/local/align.nf @@ -13,8 +13,12 @@ process ALIGN_BLACKLIST { script: // TODO use samtools -f4 for single-end and -f12 for paired to get unmapped reads https://broadinstitute.github.io/picard/explain-flags.html def prefix = task.ext.prefix ?: "${meta.id}" """ - bwa mem -t $task.cpus $params.align.blacklist $fastq |\ - samtools view -@ $task.cpus -f4 -b |\ + bwa mem -t $task.cpus $params.align.blacklist $fastq > ${prefix}.sam + samtools view \\ + -@ ${task.cpus} \\ + -f4 \\ + -b \\ + ${prefix}.sam |\ samtools bam2fq |\ pigz -p $task.cpus > ${prefix}.no_blacklist.fastq.gz """ @@ -40,15 +44,27 @@ process ALIGN_GENOME { script: def prefix = task.ext.prefix ?: 
"${meta.id}" """ - bwa mem -t ${task.cpus} ${params.align.genome} $fastq |\ - samtools sort -@ ${task.cpus} |\ - samtools view -b -q ${params.align.min_quality} -o ${prefix}.aligned.filtered.bam + # current working directory is a tmpdir when 'scratch' is set + tmp=tmp/ + trap 'rm -rf "\$tmp"' EXIT + + bwa mem -t ${task.cpus} ${params.genome} ${fastq} > ${prefix}.bam + samtools sort \\ + -@ ${task.cpus} \\ + -m 2G \\ + -T \$tmp \\ + ${prefix}.bam > ${prefix}.sorted.bam + samtools view \\ + -@ ${task.cpus} \\ + -q ${params.align.min_quality} \\ + -b \\ + ${prefix}.sorted.bam > ${prefix}.aligned.filtered.bam samtools flagstat ${prefix}.aligned.filtered.bam > ${prefix}.aligned.filtered.bam.flagstat """ stub: """ - touch ${meta.id}.aligned.filtered.bam + touch ${meta.id}.aligned.filtered.bam ${meta.id}.aligned.filtered.bam.flagstat """ } diff --git a/modules/local/deeptools.nf b/modules/local/deeptools.nf index d6595093..cccffded 100644 --- a/modules/local/deeptools.nf +++ b/modules/local/deeptools.nf @@ -161,12 +161,12 @@ process BED_PROTEIN_CODING { script: """ - grep --line-buffered 'protein_coding' ${bed} | awk -v OFS='\t' -F'\t' '{{print \$1, \$2, \$3, \$5, \".\", \$4}}' > ${params.align.genome}.protein_coding.bed + grep --line-buffered 'protein_coding' ${bed} | awk -v OFS='\t' -F'\t' '{{print \$1, \$2, \$3, \$5, \".\", \$4}}' > ${params.genome}.protein_coding.bed """ stub: """ - touch ${params.align.genome}.protein_coding.bed + touch ${params.genome}.protein_coding.bed """ } diff --git a/modules/local/peaks.nf b/modules/local/peaks.nf deleted file mode 100644 index 634ec01c..00000000 --- a/modules/local/peaks.nf +++ /dev/null @@ -1,6 +0,0 @@ - -process SICER { - label 'peaks' - - -} diff --git a/modules/local/qc.nf b/modules/local/qc.nf index 14dd6fe4..1bc83bac 100644 --- a/modules/local/qc.nf +++ b/modules/local/qc.nf @@ -20,7 +20,7 @@ process FASTQC { stub: """ - touch ${fastq..getBaseName(2)}_fastqc.html ${fastq..getBaseName(2)}_fastqc.zip + touch 
${fastq.getBaseName(2)}_fastqc.html ${fastq.getBaseName(2)}_fastqc.zip """ } @@ -48,7 +48,7 @@ process FASTQ_SCREEN { process PRESEQ { """ - Calls preseq c_curve and lc_extrap, and calls bin/parse_preseq_log.py to get NRF statistics from the log. + Calls preseq c_curve and lc_extrap """ tag { meta.id } label 'qc' @@ -63,8 +63,7 @@ process PRESEQ { output: path("*.c_curve"), emit: c_curve path("*.lc_extrap.txt"), emit: preseq - path("*.preseq.log"), emit: log - path("*nrf.txt"), emit: nrf + tuple val(meta), path("*.preseq.log"), emit: log script: // TODO handle paired: https://github.com/nf-core/rnaseq/blob/3bec2331cac2b5ff88a1dc71a21fab6529b57a0f/modules/nf-core/preseq/lcextrap/main.nf#L25 @@ -72,12 +71,47 @@ process PRESEQ { """ preseq c_curve -B -o ${prefix}.c_curve ${bam} preseq lc_extrap -B -D -o ${prefix}.lc_extrap.txt ${bam} -seed 12345 -v -l 100000000000 2> ${prefix}.preseq.log + """ + + stub: + """ + touch ${meta.id}.c_curve ${meta.id}.lc_extrap.txt ${meta.id}.preseq.log + """ +} + +process HANDLE_PRESEQ_ERROR { + input: + tuple val(meta), val(log) + + output: + path("*nrf.txt"), emit: nrf + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "NA\tNA\tNA\n" > ${prefix}.preseq.nrf.txt + """ +} + +process PARSE_PRESEQ_LOG { + """ + Calls bin/parse_preseq_log.py to get NRF statistics from the preseq log. 
+ """ + input: + tuple val(meta), path(log) + + output: + path("*nrf.txt"), emit: nrf + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ parse_preseq_log.py ${prefix}.preseq.log > ${prefix}.preseq.nrf.txt """ stub: """ - touch ${meta.id}.c_curve ${meta.id}.preseq ${meta.id}.preseqlog ${meta.id}.preseqlog.nrf.txt + touch ${meta.id}.preseqlog.nrf.txt """ } @@ -92,7 +126,7 @@ process PHANTOM_PEAKS { output: path("${meta.id}.ppqt.pdf"), emit: pdf path("${meta.id}.spp.out"), emit: spp - path("${meta.id}.fraglen.txt"), emit: fraglen + tuple val(meta), path("${meta.id}.fraglen.txt"), emit: fraglen script: // TODO: for PE, just use first read of each pair def prefix = task.ext.prefix ?: "${meta.id}" @@ -115,7 +149,7 @@ process PPQT_PROCESS { // refactor of https://github.com/CCBR/Pipeliner/blob/86c label 'qc' input: - path(fraglen) + tuple val(meta), path(fraglen) output: path("${fraglen.baseName}.process.txt"), emit: fraglen diff --git a/nextflow.config b/nextflow.config index 8332cde1..ecc2cfc9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,9 +2,12 @@ nextflow.enable.dsl = 2 params { input = null + outdir = 'results' + genome = null seq_center = null enable_conda = false publish_dir_mode = "copy" + cutadapt { adapters = '/opt2/TruSeq_and_nextera_adapters.consolidated.fa' minlen = 20 @@ -12,34 +15,30 @@ params { trailingquality = 10 } fastq_screen { - conf = '$baseDir/conf/fastq_screen.conf' + conf = "${baseDir}/conf/fastq_screen.conf" db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/' } align { - index_dir = '/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/indexes/' - genome = 'hg38' - blacklist = 'hg38.blacklist' // TODO: blacklist depends on the genome. set a genome param. 
+ index_dir = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/indexes/" + blacklist = "${params.genome}.blacklist" + blacklist_files = "${params.align.index_dir}${params.align.blacklist}*" + reference_files = "${params.align.index_dir}${params.genome}*" min_quality = 6 // to get a min quality of 5, set this to 6 effective_genome_size = 2700000000 // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L349 - chrom_sizes = 'hg38.fa.sizes' // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359 + chrom_sizes = "${params.align.index_dir}${params.genome}.fa.sizes" // source: https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/hg38.json#L359 } deeptools { - bin_size = 25 // TODO don't do nested params structure, it breaks on overriding single parameters in a nested block. + bin_size = 25 smooth_length = 75 normalize_using = "RPGC" excluded_chroms = "chrM chrX chrY" } - gene_info = '/data/CCBR_Pipeliner/db/PipeDB/Indices/hg38_basic/geneinfo.bed' - multiqc_config = '$baseDir/conf/multiqc_config.yaml' + gene_info = "/data/CCBR_Pipeliner/db/PipeDB/Indices/${params.genome}_basic/geneinfo.bed" + multiqc_config = "${baseDir}/conf/multiqc_config.yaml" min_fragment_length = 200 // https://github.com/CCBR/Pipeliner/blob/86c6ccaa3d58381a0ffd696bbf9c047e4f991f9e/Rules/InitialChIPseqQC.snakefile#L539 } -def timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') -dag { - enabled = true - overwrite = true - file = "assets/dag.png" -} +includeConfig 'conf/base.config' profiles { debug { process.beforeScript = 'echo $HOSTNAME' } @@ -55,13 +54,46 @@ profiles { singularity.enabled = true singularity.autoMounts = true singularity.cacheDir = "/data/$USER/.singularity" // TODO this may be a different default on other (non biowulf) platforms + envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' + } + biowulf { + 
includeConfig "conf/biowulf.config" + } + slurmint { + includeConfig "conf/slurmint.config" } test { includeConfig "conf/test.config" } + test_mm10 { + includeConfig "conf/test_mm10.config" + } +} + +// Export these variables to prevent local Python/R libraries from conflicting with those in the container +// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. +// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. +env { + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" +} + +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] + +def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss') +dag { + enabled = true + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } -includeConfig 'conf/modules.conf' +includeConfig 'conf/modules.config' manifest { name = "CCBR/CHAMPAGNE" diff --git a/pyproject.toml b/pyproject.toml index 563fd852..923d521d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ Changelog = "https://github.com/CCBR/CHAMPAGNE/blob/main/docs/CHANGELOG.md" champagne = "." [tool.setuptools.package-data] -"*" = ["CITATION.cff", "LICENSE", "VERSION", "main.nf", "nextflow.conf", "bin/", "conf/", "modules/*/*", "submodules/*/*"] +"*" = ["CITATION.cff", "LICENSE", "VERSION", "main.nf", "nextflow.conf", "assets/", "bin/", "conf/", "modules/*/*", "submodules/*/*"] [tool.setuptools.dynamic] version = {file = "src/VERSION"}