Merge pull request galaxyproject#5791 from npinter/add-peakzilla

Add Peakzilla
pavanvidem · Feb 26, 2024 · defa185 · defa185
2 parents f25b49b + 0d8f296
commit defa185
Show file tree

Hide file tree

Showing 7 changed files with 19,710 additions and 0 deletions.
diff --git a/tools/peakzilla/.shed.yml b/tools/peakzilla/.shed.yml
@@ -0,0 +1,10 @@
+name: peakzilla
+owner: iuc
+categories:
+  - ChIP-seq
+description: Peakzilla identifies sites of enrichment and transcription factor binding sites from ChIP-seq and ChIP-exo experiments.
+long_description: |
+  Peakzilla is a tool for identifying sites of enrichment and transcription factor binding sites from transcription factor ChIP-seq and ChIP-exo experiments at high accuracy and resolution. It is designed to work with data from any species and is suitable for both single and paired end data from any sequencing platform.
+homepage_url: https://github.com/steinmann/peakzilla
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/peakzilla
+type: unrestricted
diff --git a/tools/peakzilla/peakzilla.xml b/tools/peakzilla/peakzilla.xml
@@ -0,0 +1,160 @@
+<tool id="peakzilla" name="Peakzilla" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+    <description>Identify transcription factor binding sites from ChIP-seq and ChIP-exo experiments</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.0</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">peakzilla</requirement>
+    </requirements>
+    <stdio>
+      <exit_code range="1:" level="fatal" description="Generic error"/>
+      <regex match="ValueError: cannot convert float NaN to integer"
+             level="fatal"
+             description="No peaks detected or input data error"/>
+    </stdio>
+    <command>
+        <![CDATA[
+            peakzilla.py
+            #if $options.model_peaks
+                -m '$options.model_peaks'
+            #end if
+            #if $options.enrichment_cutoff
+                -c '$options.enrichment_cutoff'
+            #end if
+            #if $options.score_cutoff
+                -s '$options.score_cutoff'
+            #end if
+            #if $options.fragment_size
+                -f '$options.fragment_size'
+            #end if
+            #if $options.gaussian
+                -e
+            #end if
+            #if $options.bedpe
+                -p
+            #end if
+            #if $outputs.negative
+                -n
+            #end if
+            #if $outputs.log
+                -l log.txt
+            #end if
+            '$chip_bed' '$input_bed' > '$results'
+        ]]>
+    </command>
+    <inputs>
+        <param name="chip_bed" type="data" format="bed" label="ChIP Dataset in BED format"/>
+        <param name="input_bed" type="data" format="bed" label="Input Dataset in BED format"/>
+        <section name="options" title="Optional Parameters" expanded="false">
+            <param name="model_peaks" type="integer" optional="true" min="1" label="Number of most highly enriched regions used to estimate peak size (Default: 200)"/>
+            <param name="enrichment_cutoff" type="integer" optional="true" min="1" label="Minimum cutoff for fold enrichment (Default: 2)"/>
+            <param name="score_cutoff" type="integer" optional="true" min="1" label="Minimum cutoff for peak score (Default: 1)"/>
+            <param name="fragment_size" type="integer" optional="true" min="1" label="Manually set fragment size in bp" help="If not set, it will be estimated from data"/>
+            <param name="gaussian" type="boolean" checked="false" label="Use empirical model estimate instead of gaussian"/>
+            <param name="bedpe" type="boolean" checked="false" label="Input is paired end and in BEDPE format"/>
+        </section>
+        <section name="outputs" title="Output Options" expanded="false">
+            <param name="negative" type="boolean" checked="false" label="Output negative peaks in control sample"/>
+            <param name="log" type="boolean" checked="false" label="Output log file"/>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="results" format="tabular" label="${tool.name} on ${on_string}"/>
+        <data name="log" format="txt" from_work_dir="log.txt" label="Log file for ${tool.name} on ${on_string}">
+            <filter>log</filter>
+        </data>
+        <data name="negative_peaks" format="tabular" from_work_dir="negative_peaks.tsv" label="Negative peaks for ${tool.name} on ${on_string}">
+            <filter>negative</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="3">
+            <param name="chip_bed" value="chip.bed" />
+            <param name="input_bed" value="input.bed" />
+            <param name="model_peaks" value="200" />
+            <param name="fragment_size" value="50" />
+            <param name="enrichment_cutoff" value="2" />
+            <param name="score_cutoff" value="1" />
+            <param name="log" value="true" />
+            <param name="negative" value="true" />
+            <output name="results" file="results.tsv">
+                <assert_contents>
+                    <has_text text="Peak_1" />
+                </assert_contents>
+            </output>
+            <output name="log" file="log.txt" lines_diff="30">
+                <assert_contents>
+                    <has_text text="9569"/>
+                </assert_contents>
+            </output>
+            <output name="negative_peaks" file="negative_peaks.tsv">
+                <assert_contents>
+                     <has_text text="Chromosome" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+        **Peakzilla**
+
+        Peakzilla identifies sites of enrichment and transcription factor binding sites from transcription factor ChIP-seq and ChIP-exo experiments at high accuracy and resolution.
+        It is designed to perform equally well for data from any species. All necessary parameters are estimated from the data. Peakzilla is suitable for both single and
+        paired-end data from any sequencing platform.
+
+        Note that peakzilla is not suited for the identification of broad regions of enrichment (e.g. ChIP-seq for histone marks), we recommend using MACS instead: Zhang et al.
+        Model-based Analysis of ChIP-Seq (MACS). Genome Biol (2008) 9(9):R137
+
+        *INPUT FORMAT*
+        Peakzilla accepts BED formatted alignments as input.
+
+        For conversation to BED format and working with BED files and alignments in
+        general I highly recommend:
+
+        * bowtie (http://bowtie-bio.sourceforge.net/)
+        * SAMtools (http://samtools.sourceforge.net/)
+        * bedtools (http://code.google.com/p/bedtools/)
+
+
+        *WORKFLOW EXAMPLE*
+        # use bowtie to map uniquely mappable reads to the genome
+        bowtie -p4 -m1 --sam genome_index input.fastq input.sam
+        bowtie -p4 -m1 --sam genome_index chip.fastq chip.sam
+
+        # convert to BAM format
+        samtools view -bS input.sam > input.bam
+        samtools view -bS chip.sam > chip.bam
+
+        # convert to BED format
+        bamToBed -i input.bam > input.bed
+        bamToBed -i chip.bam > chip.bed
+
+        # run peakzilla
+        python peakzilla.py chip.bed input.bed > chip_peaks.tsv
+
+        # Comparison of 2 datasets
+        #    Determine significant peaks with a score threshold of 10
+        python peakzilla.py -s 10 chip1.bed input1.bed > chip1_s10_peaks.tsv
+        #    Determine enriched regions with a score threshold of 2
+        python peakzilla.py -s 2 chip2.bed input2.bed > chip2_s2_peaks.tsv
+        #    Overlap significant peaks from chip1 with enriched regions from chip2
+        intersectBed -a chip1_s10_peaks.tsv -b chip2_s2_peaks.tsv > intersect_peaks.tsv
+
+        For example datasets as well as an example of a computational pipeline for the comparative analysis of ChIP-seq datasets, please refer to our
+        publication: Bardet AF et al. A computational pipeline for comparative ChIP-seq analyses. Nature Protocols (2011) 7(1):45-61 (http://www.starklab.org/data/bardet_natprotoc_2011/)
+
+        *OPTIONS*
+        One of peakzilla's design goals is to learn all the necessary information
+        from the data. The usage of the options should therefore not be required.
+
+        *OUTPUT FORMAT*
+        * Results are printed as a table of tab-delimited values to stdout
+        * Logs are appended to logs.txt in the current directory or a custom directory/filename specified by the -l option
+        * Enriched regions in the control sample are written to negative_peaks.tsv or a custom directory/filename specified by the -n option
+        * Columns represent Chromosome / Start / End / Name / Summit / Score / ChIP / Control / FoldEnrichment / DistributionScore / FDR (%)
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/nprot.2011.420</citation>
+    </citations>
+</tool>