Skip to content

Commit

Permalink
few changes
Browse files Browse the repository at this point in the history
  • Loading branch information
hugolefeuvre committed Nov 21, 2024
1 parent 3637da0 commit 6d07b87
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 63 deletions.
38 changes: 16 additions & 22 deletions tools/mmseqs2/mmseqs2_easy_linclust_clustering.xml
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,14 @@ mmseqs easy-linclust
<option value="nucleotide">Nucleotide</option>
</param>
<when value="amino_acid">
<param name="alph_size_amino_acid" type="integer" min="2" max="5" value="5" label="Alphabet size" help=""/>
<param argument="--alph-size" name="alph_size_amino_acid" type="integer" min="2" max="21" value="21" label="Alphabet size" help=""/>
<param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
<param name="kmer_per_seq_scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
<param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
</when>
<when value="nucleotide">
<param name="alph_size_nucleotide" type="integer" min="2" max="21" value="21" label="Alphabet size" help=""/>
<param name="zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
<param name="kmer_per_seq_scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
<param argument="--alph-size" name="alph_size_nucleotide" type="integer" min="2" max="5" value="5" label="Alphabet size" help=""/>
<param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
<param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
<param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
</when>
</conditional>
Expand All @@ -129,10 +129,10 @@ mmseqs easy-linclust
<option value="4">Query seq. length has to be at least x% of target length</option>
<option value="5">Short seq. needs to be at least x% of the other seq. length</option>
</param>
<param name="cov" type="float" min="0" value="0.800" label="List matches above this fraction of aligned (covered) residues" help="(-c)"/>
<param argument="-c" name="cov" type="float" min="0" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/>
<section name="prefilter" title="Pre-filter">
<param argument="--add-self-matches" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Artificially add entries of queries with themselves (for clustering)" help=""/>
<param name="kmer_length" type="integer" min="0" value="0" label="k-mer length" help="(0: automatically set to optimum)"/>
<param argument="-k" name="kmer_length" type="integer" min="0" value="0" label="k-mer length" help="(0: automatically set to optimum)"/>
<param argument="--mask" type="select" label="Mask sequences in k-mer stage" help="">
<option value="0">Without low complexity masking</option>
<option value="1" selected="true">With low complexity masking</option>
Expand All @@ -148,7 +148,7 @@ mmseqs easy-linclust
</param>
</section>
<section name="align" title="Align">
<param name="convertalis" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Add backtrace string" help="Convert to alignments with mmseqs convertalis module (-a)"/>
<param argument="-a" name="convertalis" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Add backtrace string" help="Convert to alignments with mmseqs convertalis module"/>
<param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" >
<option value="0" selected="true">Automatic</option>
<option value="1">Only score and end_pos</option>
Expand All @@ -165,16 +165,16 @@ mmseqs easy-linclust
<option value="5">score only (output) cluster format</option>
</param>
<param argument="--wrapped-scoring" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Double the (nucleotide) query sequence during the scoring process" help="Allow wrapped diagonal scoring around end and start"/>
<param name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value (-e)"/>
<param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
<param argument="--min-aln-len" type="integer" min="0" value="0" label="Minimum alignment length" help=""/>
<param argument="--seq-id-mode" type="select" label="Sequence identity mode" help="" >
<option value="0" selected="true">Alignment length</option>
<option value="1">Shorter</option>
<option value="2">Longer sequence</option>
</param>
<param argument="--alt-ali" type="integer" min="0" value="0" label="Show up to this many alternative alignments" help=""/>
<param argument="--max-rejected" type="integer" min="0" value="2147483647" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/>
<param argument="--max-accept" type="integer" min="0" value="2147483647" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/>
<param argument="--max-rejected" type="integer" min="0" value="2147483647" optional="true" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/>
<param argument="--max-accept" type="integer" min="0" value="2147483647" optional="true" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/>
<param argument="--score-bias" type="float" value="0" label="Score bias when computing Smith-Waterman alignment" help=""/>
<param argument="--realign" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Compute more conservative, shorter alignments" help="Scores and E-values not changed"/>
<param argument="--realign-score-bias" type="float" value="-0.200" label="Additional bias when computing realignment" help=""/>
Expand Down Expand Up @@ -221,7 +221,7 @@ mmseqs easy-linclust
<param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
</section>
<section name="common" title="Common">
<param argument="--max-seq-len" type="integer" min="0" value="65535" label="Maximum sequence length" help=""/>
<param argument="--max-seq-len" type="integer" min="0" value="65535" optional="true" label="Maximum sequence length" help=""/>
</section>
<section name="expert" title="Expert">
<param argument="--filter-hits" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Filter hits by seq.id. and coverage" help=""/>
Expand All @@ -231,7 +231,7 @@ mmseqs easy-linclust
</param>
</section>
<section name="output_files" title="Selection of the output files">
<param name="output_selection" type="select" display="checkboxes" multiple="true" label="Output files selection">
<param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection">
<option value="file_rep_seq" selected="true">Representatives sequences in fasta</option>
<option value="file_all_seq" selected="true">FASTA-like per cluster</option>
<option value="file_cluster_tsv" selected="true">Adjecency list in TSV</option>
Expand Down Expand Up @@ -261,7 +261,8 @@ mmseqs easy-linclust
<output name="output_all_seq" ftype="fasta">
<assert_contents>
<has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
<has_size value="627000" delta="50000"/>
<has_size value="627000" delta="20000"/>
<has_n_lines n="14806" delta="500"/>
</assert_contents>
</output>
<output name="output_cluster" ftype="tabular">
Expand All @@ -286,15 +287,8 @@ It can perform profile searches with the same sensitivity as PSI-BLAST at over 4
MMseqs easy-linclust is useful for clustering entries from a FASTA/FASTQ file using the cascaded clustering algorithm.
It offers an efficient clustering workflow that scales linearly with input size. It is similar to easy-cluster, but better suited to handling very large datasets.
https://github.com/soedinglab/MMseqs2
By Martin Steinegger <[email protected]> & Milot Mirdita <[email protected]> & Florian Breitwieser <[email protected]> & Eli Levy Karin <[email protected]>
-----
**References**
- Steinegger M, Soding J: MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, 35(11), 1026-1028 (2017)
- Mirdita M, Steinegger M, Breitwieser F, Soding J, Levy Karin E: Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, btab184 (2021)
]]></help>
<expand macro="citations"/>
</tool>
Loading

0 comments on commit 6d07b87

Please sign in to comment.