add mashmap (galaxyproject#5789)

* add mashmap - long read optimised multispecies alignments - produces paf * add mashmap license - complex. Good luck * add some quotes.... * could be the last ones. Don't want them on the boolean dense - it can be blank. * remove useful but forbidden script - the secret sauce needed to recreate the tool at reviewer request. * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update tools/mashmap/mashmap.xml Co-authored-by: Björn Grüning <[email protected]> * Update .shed.yml add mandatory homepage_url (which one?) and long_description * Update .shed.yml * Update .shed.yml * remove license * adress review comments, add support for gz and a few more params --------- Co-authored-by: fubar2 <[email protected]> Co-authored-by: Ross Lazarus <[email protected]>
pavanvidem · Feb 26, 2024 · 4d07f32 · 4d07f32
1 parent defa185
commit 4d07f32
Show file tree

Hide file tree

Showing 10 changed files with 1,597 additions and 0 deletions.
diff --git a/tools/mashmap/.shed.yml b/tools/mashmap/.shed.yml
@@ -0,0 +1,10 @@
+# Tool Shed metadata for the mashmap wrapper.
+categories:
+- Sequence Analysis
+description: Fast local alignment boundaries
+name: mashmap
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/mashmap
+synopsis: optimised for long reads
+type: unrestricted
+long_description: MashMap implements a fast and approximate algorithm for computing local alignment boundaries between long DNA sequences. It can be useful for mapping genome assembly or long reads (PacBio/ONT) to reference genome(s).
+# Upstream project page; remote_repository_url above already points at the wrapper sources.
+homepage_url: https://github.com/marbl/MashMap
diff --git a/tools/mashmap/mashmap.xml b/tools/mashmap/mashmap.xml
@@ -0,0 +1,143 @@
+<tool name="mashmap" id="mashmap" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
+    <description>Fast local alignment boundaries</description>
+    <macros>
+        <token name="@TOOL_VERSION@">3.1.3</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <requirements>
+        <requirement version="@TOOL_VERSION@" type="package">mashmap</requirement>
+    </requirements>
+    <version_command>mashmap --version</version_command>
+    <command><![CDATA[
+## When several datasets are selected for the references or the queries,
+## their paths are written one per line into a list file that is handed to
+## mashmap in place of a single file (see the end of the command line).
+#if len($reflist) > 1:
+    #for $r in $reflist:
+        echo '$r' >> ./reflist &&
+    #end for
+    cat ./reflist &&
+#end if
+#if len($query) > 1:
+    #for $q in $query:
+        echo '$q' >> ./query &&
+    #end for
+    cat ./query &&
+#end if
+mashmap
+    --threads \${GALAXY_SLOTS:-1}
+    --perc_identity $perc_identity
+    --segLength $seqLength
+    --filter_mode $filter_mode
+    ## The boolean parameters below expand to their flag or to an empty string.
+    $reportPercentage
+    $dense
+    $noMerge
+    $noHgFilter
+    ## Optional floats: test the string form rather than truthiness, so that an
+    ## explicitly entered 0 is still passed on and only an empty field is skipped.
+    #if str($kmerThreshold) != '':
+        --kmerThreshold $kmerThreshold
+    #end if
+    #if str($kmerComplexity) != '':
+        --kmerComplexity $kmerComplexity
+    #end if
+    ## A sketch size of 0 (the form default) means the option is not passed at all.
+    #if int($sketchSize) > 0:
+        -J $sketchSize
+    #end if
+    #if len($reflist) == 1:
+        -r '$reflist'
+    #else
+        --rl ./reflist
+    #end if
+    #if len($query) == 1:
+        -q '$query'
+    #else
+        --ql ./query
+    #end if
+
+]]> </command>
+    <inputs>
+        <!-- Both data inputs accept one or more datasets; the command section switches to list files when more than one is chosen. -->
+        <param name="query" type="data" format="fasta,fasta.gz,fastq,fastq.gz" multiple="true"
+            label="Query sequences to mash against the references supplied below"/>
+        <param name="reflist" type="data" format="fasta,fasta.gz,fastq,fastq.gz" multiple="true"
+            label="Reference or references to mash the query sequences on"
+            help="Choose one or more reference sequences to mash the query sequences against."/>
+        <param argument="--perc_identity" type="float" value="85.0" label="Identity threshold"
+            help="By default, it is set to 85, implying mappings with 85 or more identity should be reported. For example, it can be set to 80 to account for more noisy long-read datasets or 95 for mapping human genome assembly to human reference."/>
+        <!-- The parameter keeps its historical name (seqLength) so the command and tests still resolve; the argument shows the flag the command actually passes. -->
+        <param name="seqLength" argument="--segLength" type="integer" value="5000" min="1" label="Minimum segment length"
+            help="Default is 5,000 bp. Sequences below this length are ignored. Mashmap provides guarantees on reporting local alignments of length twice this value."/>
+        <param argument="--sketchSize" type="integer" value="0" min="0" label="Sketch size - leave 0 for automatic setting based on the identity threshold"
+            help="This parameter sets the seed density of the winnowing scheme, guaranteeing that the minhash will be calculated from a sample of sketchSize k-mers for each segment. It is set automatically based on --pi but can be manually set as well."/>
+        <param argument="--dense" type="boolean" truevalue="--dense" falsevalue="" checked="false" label="Dense sketching"
+            help="This flag will increase the seed density substantially, resulting in a density of roughly 0.02 * (1 + (1 - pi) / .05) where pi is the perc_identity threshold. This leads to longer runtimes and higher RAM usage, but significantly more accurate estimates of ANI."/>
+        <!-- The value is a percentage, so the upper bound is 100; leaving the field empty omits the option from the command line. -->
+        <param argument="--kmerThreshold" type="float" min="0.0" max="100.0" optional="true" label="Ignore the top % most-frequent kmer window" />
+        <param argument="--kmerComplexity" type="float" min="0.0" max="1.0" optional="true" label="Threshold for kmer complexity" />
+        <param argument="--filter_mode" type="select" label="Filter mode" help="Mashmap implements a plane-sweep based algorithm to perform the alignment filtering. Similar to delta-filter in nucmer, different filtering options are provided that are suitable for long read or assembly mapping. Option -f map is suitable for reporting the best mappings for long reads, whereas -f one-to-one is suitable for reporting orthologous mappings among all computed assembly to genome mappings.">
+            <option value="map" selected="true">map - best mapping for long reads</option>
+            <option value="one-to-one">one-to-one - best for mapping orthologous reads</option>
+            <option value="none">None</option>
+        </param>
+        <param argument="--reportPercentage" type="boolean" truevalue="--reportPercentage" falsevalue="" checked="false"
+            label="Report predicted ANI values in [0, 100]"
+            help="instead of [0,1]" />
+        <param argument="--noMerge" type="boolean" truevalue="--noMerge" falsevalue="" checked="false"
+            label="Don't merge consecutive segment-level mappings" />
+        <param argument="--noHgFilter" type="boolean" truevalue="--noHgFilter" falsevalue="" checked="false" label="Use MashMap2 first pass filtering"
+            help="Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering." />
+    </inputs>
+    <outputs>
+        <!-- Collected from the job working directory. The command passes no output option, so this presumably relies on mashmap writing to mashmap.out by default (to be confirmed against the mashmap documentation). -->
+        <data name="mashout" format="paf" from_work_dir="mashmap.out" />
+    </outputs>
+    <tests>
+        <!-- Test 1: a single uncompressed FASTA query against a single uncompressed FASTA reference, dense sketching on. -->
+        <test expect_num_outputs="1">
+            <param name="query" value="query_sample.fasta" ftype="fasta"/>
+            <param name="reflist" value="reflist_sample.fasta" ftype="fasta"/>
+            <param name="perc_identity" value="85.0"/>
+            <param name="seqLength" value="5000"/>
+            <param name="sketchSize" value="0"/>
+            <param name="dense" value="true"/>
+            <param name="filter_mode" value="map"/>
+            <output name="mashout" value="mashout_sample.paf" ftype="paf"/>
+        </test>
+        <!-- Test 2: same settings with gzip-compressed inputs; the expected output file is the same as in test 1. -->
+        <test expect_num_outputs="1">
+            <param name="query" value="query_sample.fasta.gz" ftype="fasta.gz"/>
+            <param name="reflist" value="reflist_sample.fasta.gz" ftype="fasta.gz"/>
+            <param name="perc_identity" value="85.0"/>
+            <param name="seqLength" value="5000"/>
+            <param name="sketchSize" value="0"/>
+            <param name="dense" value="true"/>
+            <param name="filter_mode" value="map"/>
+            <output name="mashout" value="mashout_sample.paf" ftype="paf"/>
+        </test>
+        <!-- Test 3: several datasets per input, which exercises the list-file branch of the command; the same query file is supplied twice. -->
+        <test expect_num_outputs="1">
+            <param name="query" value="query_sample.fasta.gz,query_sample.fasta.gz" ftype="fasta.gz"/>
+            <param name="reflist" value="reflist_1_sample.fasta.gz,reflist_2_sample.fasta.gz" ftype="fasta.gz"/>
+            <param name="perc_identity" value="85.0"/>
+            <param name="seqLength" value="5000"/>
+            <param name="sketchSize" value="0"/>
+            <param name="dense" value="true"/>
+            <param name="filter_mode" value="map"/>
+            <output name="mashout" value="mashout_multi_sample.paf" ftype="paf"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+ *MashMap* implements a fast and approximate algorithm for computing local alignment boundaries between long DNA sequences.
+ It can be useful for mapping genome assembly or long reads (PacBio/ONT) to reference genome(s).
+ Given a minimum alignment length and an identity threshold for the desired local alignments,
+ Mashmap computes alignment boundaries and identity estimates using k-mers. It does not compute the alignments explicitly,
+ but rather estimates an unbiased k-mer based Jaccard similarity using a combination of minmers (a novel winnowing scheme) and MinHash.
+ This is then converted to an estimate of sequence identity using the Mash distance. An appropriate k-mer sampling rate
+ is automatically determined using the given minimum local alignment length and identity thresholds.
+ 
+ As an example, Mashmap can map a human genome assembly to the human reference genome in about one minute total execution
+ time and < 4 GB memory using just 8 CPU threads, achieving more than an order of magnitude improvement in both runtime and
+ memory over alternative methods. We describe the algorithms associated with Mashmap, and report on speed, scalability, and
+ accuracy of the software in the publications listed below. Unlike traditional mappers, MashMap does not compute exact sequence alignments.
+ In future, we plan to add an optional alignment support to generate base-to-base alignments.
+ 
+ The output is a tab-delimited PAF file. Each line starts with query name, length, 0-based start, end, strand, target name,
+ length, start and end, followed by the remaining PAF columns and tags; the ``id:f:`` tag carries the estimated mapping nucleotide identity.
+ 
+  ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btad512</citation>
+        <citation type="doi">10.1093/bioinformatics/bts573</citation>
+    </citations>
+</tool>
+
diff --git a/tools/mashmap/test-data/mashout_multi_sample.paf b/tools/mashmap/test-data/mashout_multi_sample.paf
@@ -0,0 +1,12 @@
+sacCer3_gold_BK006935.2	6150	0	5000	+	sacCer3_gold_BK006935.1	186550	4	5004	398	5000	255	id:f:1	kc:f:1.00938
+sacCer3_gold_BK006935.2	6150	1150	6150	+	sacCer3_gold_BK006935.1	186550	1147	6147	398	5000	255	id:f:1	kc:f:1.03323
+sacCer3_gold_BK006935.3	4100	0	4100	+	sacCer3_gold_BK006935.1	186550	6151	10251	336	4100	23	id:f:0.995366	kc:f:0.995857
+sacCer3_gold_BK006935.4	18400	0	18400	+	sacCer3_gold_BK006935.1	186550	10256	28651	398	18400	255	id:f:1	kc:f:0.895232
+sacCer3_gold_BK006935.5	4050	0	4050	+	sacCer3_gold_BK006935.1	186550	28516	32566	333	4050	23	id:f:0.995111	kc:f:0.974676
+sacCer3_gold_BK006935.6	2800	0	2800	+	sacCer3_gold_BK006935.1	186550	32642	35442	221	2800	18	id:f:0.98243	kc:f:0.954345
+sacCer3_gold_BK006935.2	6150	0	5000	+	sacCer3_gold_BK006935.1	186550	4	5004	398	5000	255	id:f:1	kc:f:1.00938
+sacCer3_gold_BK006935.2	6150	1150	6150	+	sacCer3_gold_BK006935.1	186550	1147	6147	398	5000	255	id:f:1	kc:f:1.03323
+sacCer3_gold_BK006935.3	4100	0	4100	+	sacCer3_gold_BK006935.1	186550	6151	10251	336	4100	23	id:f:0.995366	kc:f:0.995857
+sacCer3_gold_BK006935.4	18400	0	18400	+	sacCer3_gold_BK006935.1	186550	10256	28651	398	18400	255	id:f:1	kc:f:0.895232
+sacCer3_gold_BK006935.5	4050	0	4050	+	sacCer3_gold_BK006935.1	186550	28516	32566	333	4050	23	id:f:0.995111	kc:f:0.974676
+sacCer3_gold_BK006935.6	2800	0	2800	+	sacCer3_gold_BK006935.1	186550	32642	35442	221	2800	18	id:f:0.98243	kc:f:0.954345
diff --git a/tools/mashmap/test-data/mashout_sample.paf b/tools/mashmap/test-data/mashout_sample.paf
@@ -0,0 +1,2 @@
+sacCer3_gold_BK006935.2	6150	0	6150	+	sacCer3_gold_BK006935.2	6150	4	6149	398	8065	255	id:f:1	kc:f:1.02131
+sacCer3_gold_BK006935.4	18400	0	18400	+	sacCer3_gold_BK006935.4	18400	7	18399	398	18408	255	id:f:1	kc:f:0.895232
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		sacCer3_gold_BK006935.2 6150 0 6150 + sacCer3_gold_BK006935.2 6150 4 6149 398 8065 255 id:f:1 kc:f:1.02131
		sacCer3_gold_BK006935.4 18400 0 18400 + sacCer3_gold_BK006935.4 18400 7 18399 398 18408 255 id:f:1 kc:f:0.895232