Add tool: SeqSero2 (galaxyproject#5109)

* add seqsero2 * Subset test file - keep tests output intact * Update seqsero2 wrapper * Fix test & variables * Use boolean with logfile * Update test for paired input * Fix paired input string * Update SeqSero description * Fix logfile boolean + filter * Fix R1 and R2 file naming * Update tool version * Add workflow information to tool description * Fix param error. Set assembly workflow to k-mer * Update seqsero2 wrapper Add multiple samples as test-data Remove unecessary test files Update nanopore case Add tests Revert Add log file test Update seqsero wrapper * Add expected outputs to test cases * Add macros to seqsero2 wrapper * Remove whitespaces from read names * Add single quote and input condition * Update seqsero2.xml --------- Co-authored-by: thanhleviet <[email protected]> Co-authored-by: Björn Grüning <[email protected]>
pavanvidem · Nov 7, 2023 · 2268c10 · 2268c10
1 parent ef56448
commit 2268c10
Show file tree

Hide file tree

Showing 6 changed files with 14,238 additions and 0 deletions.
diff --git a/tools/seqsero2/.shed.yml b/tools/seqsero2/.shed.yml
@@ -0,0 +1,9 @@
+name: seqsero2
+owner: iuc
+description: Salmonella serotype prediction from genome sequencing data
+long_description: |
+  A pipeline for Salmonella serotype prediction from raw sequencing reads or genome assemblies.
+categories: [Sequence Analysis]
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/seqsero2
+homepage_url: https://github.com/denglab/SeqSero2
+type: unrestricted
diff --git a/tools/seqsero2/macros.xml b/tools/seqsero2/macros.xml
@@ -0,0 +1,21 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.2.1</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">seqsero2</requirement>
+        </requirements>
+    </xml>
+    <macro name="select_workflow_type">
+        <param argument="-m" type="select" name="workflow" label="Workflow">
+            <option value="a" selected="true">allele</option>
+            <option value="k">k-mer</option>
+        </param>
+    </macro>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">
+                https://doi.org/10.1128/AEM.01746-19
+            </citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/seqsero2/seqsero2.xml b/tools/seqsero2/seqsero2.xml
@@ -0,0 +1,208 @@
+<tool id="seqsero2" name="SeqSero2" version="@TOOL_VERSION@+galaxy0" profile="21.05">
+    <description>Salmonella serotype prediction</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <version_command>SeqSero2_package.py -v</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+    #import re
+    #set $seqsero_t_value = ''
+    #set $seqsero_workflow_type = ''
+
+    #if str($input_type_cond.input_type) in ['single', 'paired', 'assembly', 'nanopore']:
+        #set read1 = $input_type_cond.read1
+
+        #if str($input_type_cond.input_type) in ['assembly', 'nanopore']:
+            #set ext = '.fasta'
+        #else:
+            #set ext = '.fastq'
+        #end if
+        #if $read1.ext.endswith('.gz'):
+            #set ext = $ext+'.gz'
+        #end if
+
+        #if str($input_type_cond.input_type) == 'paired':
+            #set read1_file = re.sub('[^\w\-\.]', '_', str($read1.element_identifier)) + '_forward' + $ext
+            #set read2 = $input_type_cond.read2
+            #set read2_file = re.sub('[^\w\-\.]', '_', str($read2.element_identifier)) + '_reverse' + $ext
+            ln -s '$read2' '$read2_file' &&
+        #else:
+            #set read1_file = re.sub('[^\w\-\.]', '_', str($read1.element_identifier)) + $ext
+        #end if
+
+        ln -s '$read1' '$read1_file' &&
+    #else:
+        #set read1 = $input_type_cond.input_collection.forward
+
+        #set ext = '.fastq'
+        #if $read1.ext.endswith('.gz'):
+            #set ext = $ext+'.gz'
+        #end if
+
+        #set read1_file = re.sub('[^\w\-\.]', '_', str($read1.element_identifier)) + $ext
+        ln -s '$read1' '$read1_file' &&
+
+        #set read2 = $input_type_cond.input_collection.reverse
+        #set read2_file = re.sub('[^\w\-\.]', '_', str($read2.element_identifier)) + $ext
+        ln -s '$read2' '$read2_file' &&
+
+    #end if
+
+    #if str($input_type_cond.input_type) in ['paired', 'collection']:
+        #set $seqsero_t_value = '2'
+    #elif str($input_type_cond.input_type) == 'assembly':
+        #set $seqsero_t_value = '4'
+    #elif str($input_type_cond.input_type) == 'single':
+        #set $seqsero_t_value = '3'
+    #elif str($input_type_cond.input_type) == 'nanopore':
+        #set $seqsero_t_value = '5'
+    #end if
+
+    #if str($input_type_cond.input_type) in ['assembly', 'nanopore']:
+        #set $seqsero_workflow_type = 'k'
+    #else:
+        #set $seqsero_workflow_type = $input_type_cond.workflow
+    #end if
+
+    SeqSero2_package.py 
+    -m '$seqsero_workflow_type'
+    -t '$seqsero_t_value'
+    -i '$read1_file'
+    #if str($input_type_cond.input_type) in ['paired', 'collection']:
+        '$read2_file'
+    #end if
+    -p \${GALAXY_SLOTS:-4}
+    -d output
+    ]]>    </command>
+    <inputs>
+        <conditional name="input_type_cond">
+            <param name="input_type" type="select" label="Input type" help="Select 'paired end' reads or 'sequence' for genomes/contigs">
+                <option value="paired">Paired End</option>
+                <option value="collection">Collection</option>
+                <option value="assembly">Contigs</option>
+                <option value="single">Interleaved</option>
+                <option value="nanopore">Nanopore reads</option>
+            </param>
+            <when value="paired">
+                <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Forward reads (R1)" help="The file of forward reads in FASTQ format"/>
+                <param name="read2" type="data" format="fastqsanger,fastqsanger.gz" label="Reverse reads (R2)" help="The file of reverse reads in FASTQ format"/>
+                <expand macro="select_workflow_type" />
+            </when>
+            <when value="collection">
+                <param name="input_collection" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Paired collection"/>
+                <expand macro="select_workflow_type" />
+            </when>
+            <when value="single">
+                <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" multiple="false" label="Interleaved" />
+                <expand macro="select_workflow_type" />
+            </when>
+            <when value="nanopore">
+                <param name="read1" type="data" format="fasta" multiple="false" label="Nanopore reads" />
+            </when>
+            <when value="assembly">
+                <param name="read1" type="data" format="fasta" multiple="false" label="Contigs/genomes" />
+            </when>
+        </conditional>
+        <section name="output_options" title="Output options">
+            <param argument="logfile" type="boolean" checked="false" truevalue="true" falsevalue="" label="Include log as output file."/>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="results" format="tabular" label="${tool.name} on ${on_string} Results" from_work_dir="output/SeqSero_result.tsv"/>
+        <data name="log" format="txt" label="${tool.name} on ${on_string} Log" from_work_dir="output/SeqSero_log.txt">
+            <filter>output_options['logfile']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2">
+            <param name="input_type" value="assembly" />
+            <param name="read1" value="CP009102.1.fasta" ftype="fasta" />
+            <param name="logfile" value="true" />
+            <output name="results">
+                <assert_contents>
+                    <has_text text="Salmonella enterica subspecies enterica (subspecies I)" />
+                    <has_text text="Typhimurium" />
+                </assert_contents>
+            </output>
+            <output name="log">
+                <assert_contents>
+                    <has_n_lines n="104" />
+                    <has_text text="O_scores" />
+                    <has_text text="H_scores" />
+                    <has_text text="Special_scores" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_type" value="paired" />
+            <param name="workflow" value="a" />
+            <param name="read1" value="SRR10859038_R1.fastq.gz" ftype="fastqsanger.gz" />
+            <param name="read2" value="SRR10859038_R2.fastq.gz" ftype="fastqsanger.gz" />
+            <output name="results">
+                <assert_contents>
+                    <has_text text="The input genome cannot be identified as Salmonella." />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_type" value="collection" />
+            <param name="workflow" value="k" />
+            <param name="input_collection">
+                <collection type="paired">
+                    <element name="forward" value="SRR10859038_R1.fastq.gz" ftype="fastqsanger.gz" />
+                    <element name="reverse" value="SRR10859038_R2.fastq.gz" ftype="fastqsanger.gz" />
+                </collection>
+            </param>
+            <output name="results">
+                <assert_contents>
+                    <has_text text="The input genome cannot be identified as Salmonella." />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_type" value="single" />
+            <param name="workflow" value="a" />
+            <param name="read1" value="SRR10859038_R1.fastq.gz" ftype="fastqsanger.gz" />
+            <output name="results">
+                <assert_contents>
+                    <has_text text="The input genome cannot be identified as Salmonella" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="1">
+            <param name="input_type" value="nanopore" />
+            <param name="read1" value="CP009102.1.fasta" ftype="fasta" />
+            <output name="results">
+                <assert_contents>
+                    <has_text text="Salmonella enterica subspecies enterica (subspecies I)" />
+                    <has_text text="Typhimurium" />
+                </assert_contents>
+            </output>
+        </test>
+     </tests>
+    <help><![CDATA[
+    **What is SeqSero2**
+
+SeqSero2 is a pipeline for Salmonella serotype prediction from raw sequencing reads or genome assemblies.
+SeqSero is based on curated databases of Salmonella serotype determinants (rfb gene cluster, fliC and fljB alleles) and is predicted to determine serotype rapidly and accurately for nearly the full spectrum of Salmonella serotypes (more than 2,300 serotypes), from both raw sequencing reads and genome assemblies.
+
+**Workflows**
+
+* Allele micro-assembly (workflow: allele). This workflow takes raw reads as input and performs targeted assembly of serotype determinant alleles. Assembled alleles are used to predict serotype and flag potential inter-serotype contamination in sequencing data (i.e., presence of reads from multiple serotypes due to, for example, cross or carryover contamination during sequencing).
+* Raw reads k-mer (workflow: k-mer). This workflow takes raw reads as input and performs rapid serotype prediction based on unique k-mers of serotype determinants.
+* Genome assembly k-mer (input type: contigs). This workflow takes genome assemblies as input and the rest of the workflow largely overlaps with the raw reads k-mer workflow.
+
+**Extra parameter information.**
+
+The '-t' parameter is automatically determined using the input files, and therefore not customizable by the user. The logic used is based on the tool's parameter description:
+
+* -t {1,2,3,4,5}  <int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore reads (fasta/fastq)
+
+-----
+
+_`Document`:  https://github.com/denglab/SeqSero2
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>