forked from galaxyproject/tools-iuc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add mashmap - long read optimised multispecies alignments - produces paf * add mashmap license - complex. Good luck * add some quotes.... * could be the last ones. Don't want them on the boolean dense - it can be blank. * remove useful but forbidden script - the secret sauce needed to recreate the tool at reviewer request. * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update .shed.yml add mandatory homepage_url (which one?) and long_description * Update .shed.yml * Update .shed.yml * remove license * adress review comments, add support for gz and a few more params --------- Co-authored-by: fubar2 <[email protected]> Co-authored-by: Ross Lazarus <[email protected]>
- Loading branch information
1 parent
defa185
commit 4d07f32
Showing
10 changed files
with
1,597 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
categories: | ||
- Sequence Analysis | ||
description: Fast local alignment boundaries | ||
name: mashmap | ||
owner: iuc | ||
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/mashmap | ||
synopsis: optimised for long reds | ||
type: unrestricted | ||
long_description: MashMap implements a fast and approximate algorithm for computing local alignment boundaries between long DNA sequences. It can be useful for mapping genome assembly or long reads (PacBio/ONT) to reference genome(s). | ||
homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/mashmap |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
<tool name="mashmap" id="mashmap" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05"> | ||
<description>Fast local alignment boundaries</description> | ||
<macros> | ||
<token name="@TOOL_VERSION@">3.1.3</token> | ||
<token name="@VERSION_SUFFIX@">0</token> | ||
</macros> | ||
<requirements> | ||
<requirement version="@TOOL_VERSION@" type="package">mashmap</requirement> | ||
</requirements> | ||
<version_command>mashmap --version</version_command> | ||
<command><![CDATA[ | ||
#if len($reflist) > 1: | ||
#for $r in $reflist: | ||
echo '$r' >> ./reflist && | ||
#end for | ||
cat ./reflist && | ||
#end if | ||
#if len($query) > 1: | ||
#for $q in $query: | ||
echo '$q' >> ./query && | ||
#end for | ||
cat ./query && | ||
#end if | ||
mashmap | ||
--threads \${GALAXY_SLOTS:-1} | ||
--perc_identity $perc_identity | ||
--segLength $seqLength | ||
--filter_mode $filter_mode | ||
$reportPercentage | ||
$dense | ||
$noMerge | ||
$noHgFilter | ||
#if $kmerThreshold: | ||
--kmerThreshold $kmerThreshold | ||
#end if | ||
#if $kmerComplexity | ||
--kmerComplexity $kmerComplexity | ||
#end if | ||
#if int($sketchSize) > 0: | ||
-J $sketchSize | ||
#end if | ||
#if len($reflist) == 1: | ||
-r '$reflist' | ||
#else | ||
--rl ./reflist | ||
#end if | ||
#if len($query) == 1: | ||
-q '$query' | ||
#else | ||
--ql ./query | ||
#end if | ||
]]> </command> | ||
<inputs> | ||
<param name="query" type="data" format="fasta,fasta.gz,fastq,fastq.gz" multiple="true" | ||
label="Query sequences to mash against the references supplied below"/> | ||
<param name="reflist" type="data" format="fasta,fasta.gz,fastq,fastq.gz" multiple="true" | ||
label="Reference or references to mash the query sequences on" | ||
help="Choose one or more reference sequences to mash the query sequences against."/> | ||
<param argument="--perc_identity" type="float" value="85.0" label="Identity threshold" | ||
help="By default, it is set to 85, implying mappings with 85 or more identity should be reported. For example, it can be set to 80to account for more noisy long-read datasets or 95 for mapping human genome assembly to human reference."/> | ||
<param argument="--seqLength" type="integer" value="5000" min="1" label="Minimum segment length" | ||
help="Default is 5,000 bp. Sequences below this length are ignored. Mashmap provides guarantees on reporting local alignments of length twice this value."/> | ||
<param argument="--sketchSize" type="integer" value="0" label="Sketch size - leave 0 for automatic setting based" | ||
help="This parameter sets the seed density of the winnowing scheme, gauranteeing that the minhash will be calculated from a sample of sketchSize k-mers for each segment. It is set automatically based on --pi but can be manually set as well."/> | ||
<param argument="dense" type="boolean" truevalue="--dense" falsevalue="" label="Dense sketching" | ||
help="This flag will increase the seed density substantially, resulting in a density of roughly 0.02 * (1 + (1 - pi) / .05) where pi is the perc_identity threshold. This leads to longer runtimes and higher RAM usage, but significantly more accurate estimates of ANI."/> | ||
<param argument="--kmerThreshold" type="float" min="0.0" max="0.0" optional="true" label="Ignore the top % most-frequent kmer window" /> | ||
<param argument="--kmerComplexity" type="float" min="0.0" max="1.0" optional="true" label="Threshold for kmer complexity" /> | ||
<param argument="filter_mode" type="select" label="Filter mode" help="Mashmap implements a plane-sweep based algorithm to perform the alignment filtering. Similar to delta-filter in nucmer, different filtering options are provided that are suitable for long read or assembly mapping. Option -f map is suitable for reporting the best mappings for long reads, whereas -f one-to-one is suitable for reporting orthologous mappings among all computed assembly to genome mappings."> | ||
<option value="map" selected="true">map - best mapping for long reads</option> | ||
<option value="one-to-one">one-to-one - best for mapping orthologous reads</option> | ||
<option value="none">None</option> | ||
</param> | ||
<param argument="--reportPercentage" type="boolean" truevalue="--reportPercentage" falsevalue="" checked="false" | ||
label="Report predicted ANI values in [0, 100]" | ||
help="instead of [0,1]" /> | ||
<param argument="--noMerge" type="boolean" truevalue="--noMerge" falsevalue="" checked="false" | ||
label="Don't merge consecutive segment-level mappings" /> | ||
<param argument="--noHgFilter" type="boolean" truevalue="--noHgFilter" falsevalue="" checked="false" label="Use MashMap2 first pass filtering" | ||
help="Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering." /> | ||
</inputs> | ||
<outputs> | ||
<data name="mashout" format="paf" from_work_dir="mashmap.out" /> | ||
</outputs> | ||
<tests> | ||
<test expect_num_outputs="1"> | ||
<param name="query" value="query_sample.fasta" ftype="fasta"/> | ||
<param name="reflist" value="reflist_sample.fasta" ftype="fasta"/> | ||
<param name="perc_identity" value="85.0"/> | ||
<param name="seqLength" value="5000"/> | ||
<param name="sketchSize" value="0"/> | ||
<param name="dense" value="true"/> | ||
<param name="filter_mode" value="map"/> | ||
<output name="mashout" value="mashout_sample.paf" ftype="paf"/> | ||
</test> | ||
<test expect_num_outputs="1"> | ||
<param name="query" value="query_sample.fasta.gz" ftype="fasta.gz"/> | ||
<param name="reflist" value="reflist_sample.fasta.gz" ftype="fasta.gz"/> | ||
<param name="perc_identity" value="85.0"/> | ||
<param name="seqLength" value="5000"/> | ||
<param name="sketchSize" value="0"/> | ||
<param name="dense" value="true"/> | ||
<param name="filter_mode" value="map"/> | ||
<output name="mashout" value="mashout_sample.paf" ftype="paf"/> | ||
</test> | ||
<test expect_num_outputs="1"> | ||
<param name="query" value="query_sample.fasta.gz,query_sample.fasta.gz" ftype="fasta.gz"/> | ||
<param name="reflist" value="reflist_1_sample.fasta.gz,reflist_2_sample.fasta.gz" ftype="fasta.gz"/> | ||
<param name="perc_identity" value="85.0"/> | ||
<param name="seqLength" value="5000"/> | ||
<param name="sketchSize" value="0"/> | ||
<param name="dense" value="true"/> | ||
<param name="filter_mode" value="map"/> | ||
<output name="mashout" value="mashout_multi_sample.paf" ftype="paf"/> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
*MashMap* implements a fast and approximate algorithm for computing local alignment boundaries between long DNA sequences. | ||
It can be useful for mapping genome assembly or long reads (PacBio/ONT) to reference genome(s). | ||
Given a minimum alignment length and an identity threshold for the desired local alignments, | ||
Mashmap computes alignment boundaries and identity estimates using k-mers. It does not compute the alignments explicitly, | ||
but rather estimates an unbiased k-mer based Jaccard similarity using a combination of minmers (a novel winnowing scheme) and MinHash. | ||
This is then converted to an estimate of sequence identity using the Mash distance. An appropriate k-mer sampling rate | ||
is automatically determined using the given minimum local alignment length and identity thresholds. | ||
As an example, Mashmap can map a human genome assembly to the human reference genome in about one minute total execution | ||
time and < 4 GB memory using just 8 CPU threads, achieving more than an order of magnitude improvement in both runtime and | ||
memory over alternative methods. We describe the algorithms associated with Mashmap, and report on speed, scalability, and | ||
accuracy of the software in the publications listed below. Unlike traditional mappers, MashMap does not compute exact sequence alignments. | ||
In future, we plan to add an optional alignment support to generate base-to-base alignments. | ||
The output is space-delimited with each line consisting of query name, length, 0-based start, end, strand, target name, | ||
length, start, end and mapping nucleotide identity. | ||
]]></help> | ||
<citations> | ||
<citation type="doi">10.1093/bioinformatics/btad512</citation> | ||
<citation type="doi">10.1093/bioinformatics/bts573</citation> | ||
</citations> | ||
</tool> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
sacCer3_gold_BK006935.2 6150 0 5000 + sacCer3_gold_BK006935.1 186550 4 5004 398 5000 255 id:f:1 kc:f:1.00938 | ||
sacCer3_gold_BK006935.2 6150 1150 6150 + sacCer3_gold_BK006935.1 186550 1147 6147 398 5000 255 id:f:1 kc:f:1.03323 | ||
sacCer3_gold_BK006935.3 4100 0 4100 + sacCer3_gold_BK006935.1 186550 6151 10251 336 4100 23 id:f:0.995366 kc:f:0.995857 | ||
sacCer3_gold_BK006935.4 18400 0 18400 + sacCer3_gold_BK006935.1 186550 10256 28651 398 18400 255 id:f:1 kc:f:0.895232 | ||
sacCer3_gold_BK006935.5 4050 0 4050 + sacCer3_gold_BK006935.1 186550 28516 32566 333 4050 23 id:f:0.995111 kc:f:0.974676 | ||
sacCer3_gold_BK006935.6 2800 0 2800 + sacCer3_gold_BK006935.1 186550 32642 35442 221 2800 18 id:f:0.98243 kc:f:0.954345 | ||
sacCer3_gold_BK006935.2 6150 0 5000 + sacCer3_gold_BK006935.1 186550 4 5004 398 5000 255 id:f:1 kc:f:1.00938 | ||
sacCer3_gold_BK006935.2 6150 1150 6150 + sacCer3_gold_BK006935.1 186550 1147 6147 398 5000 255 id:f:1 kc:f:1.03323 | ||
sacCer3_gold_BK006935.3 4100 0 4100 + sacCer3_gold_BK006935.1 186550 6151 10251 336 4100 23 id:f:0.995366 kc:f:0.995857 | ||
sacCer3_gold_BK006935.4 18400 0 18400 + sacCer3_gold_BK006935.1 186550 10256 28651 398 18400 255 id:f:1 kc:f:0.895232 | ||
sacCer3_gold_BK006935.5 4050 0 4050 + sacCer3_gold_BK006935.1 186550 28516 32566 333 4050 23 id:f:0.995111 kc:f:0.974676 | ||
sacCer3_gold_BK006935.6 2800 0 2800 + sacCer3_gold_BK006935.1 186550 32642 35442 221 2800 18 id:f:0.98243 kc:f:0.954345 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
sacCer3_gold_BK006935.2 6150 0 6150 + sacCer3_gold_BK006935.2 6150 4 6149 398 8065 255 id:f:1 kc:f:1.02131 | ||
sacCer3_gold_BK006935.4 18400 0 18400 + sacCer3_gold_BK006935.4 18400 7 18399 398 18408 255 id:f:1 kc:f:0.895232 |
Oops, something went wrong.