Skip to content

Commit

Permalink
fixed mmseq, updated to 6.3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Klemm committed Dec 13, 2024
1 parent 5eb04fe commit fda6f17
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 35 deletions.
79 changes: 60 additions & 19 deletions tools/proteinortho/proteinortho.xml
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@
2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2)
#if $more_options.selfblast:
&&
mv result.blast-graph_clean result.blast-graph;
mv result.blast-graph_clean result.blast-graph
#end if
#if $synteny.synteny_options == "specified":
&&
mv result.poff-graph result.proteinortho-graph &&
mv result.poff.tsv result.proteinortho.tsv &&
mv result.poff.html result.proteinortho.html ;
mv result.poff.html result.proteinortho.html
#end if
]]></command>
<inputs>
Expand All @@ -115,6 +115,8 @@
<option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option>
<option value="blastp">NCBI-BLASTP+ (protein sequences)</option>
<option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option>
<option value="mmseqsp">MMseqs2 (aminoacid sequences)</option>
<option value="mmseqsn">MMseqs2 (nucleotide sequences)</option>
<option value="lastp">Last (aminoacid sequences)</option>
<option value="lastn">Last (nucleotide sequences)</option>
<option value="blatp">BLAT (aminoacid sequences)</option>
Expand All @@ -126,7 +128,7 @@
<param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="Larger values result in more false positives (connections between proteins)."/>
<param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/>
<param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/>
<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/>
<param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs (not compatible with synteny) "/>
<param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/>
<param argument="--core" type="boolean" checked="false" truevalue="--core" falsevalue="" label="Stop clustering if a split would result in groups that do not span across all species of the initial connected component." help="Overrules the -conn threshold."/>
<param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated equivalent). For ncbi : simply add the additional files to the input (file names need to match). For Uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format.">
Expand All @@ -137,7 +139,7 @@
</param>
</section>
<conditional name="synteny">
<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015.">
<param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as an additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015. (Not compatible with selfblast)">
<option value="no" selected="true">no</option>
<option value="specified">yes</option>
</param>
Expand Down Expand Up @@ -177,7 +179,7 @@
</data>
</outputs>
<tests>
<test expect_num_outputs="3"> <!-- test normal -->
<test expect_num_outputs="3"> <!-- test normal / default params -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="diamond"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
Expand All @@ -187,6 +189,16 @@
<has_text text="--p=diamond"/>
</assert_command>
</test>
<test expect_num_outputs="3"> <!-- test normal mmseqs -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="mmseqsp"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="5"/>
<expand macro="test_output_blastgraph" nlines="156" nlines_delta="20"/>
<expand macro="test_output_proteinorthograph" nlines="139" nlines_delta="20"/>
<assert_command>
<has_text text="--p=mmseqsp"/>
</assert_command>
</test>
<test expect_num_outputs="3"> <!-- various parameter -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="diamond"/>
Expand Down Expand Up @@ -251,12 +263,12 @@
</test>
<test expect_num_outputs="3"> <!-- blat -->
<param name="input_files" value="L.fasta,C.fasta,E.fasta,M.fasta"/>
<param name="p" value="blastp"/>
<param name="p" value="blatp"/>
<expand macro="test_output_proteinortho" nlines="33" nlines_delta="20"/>
<expand macro="test_output_blastgraph" nlines="156" nlines_delta="50"/>
<expand macro="test_output_proteinorthograph" nlines="136" nlines_delta="50"/>
<expand macro="test_output_blastgraph" nlines="56" nlines_delta="50"/>
<expand macro="test_output_proteinorthograph" nlines="56" nlines_delta="50"/>
<assert_command>
<has_text text="--p=blastp"/>
<has_text text="--p=blatp"/>
</assert_command>
</test>
</tests>
Expand Down Expand Up @@ -285,8 +297,8 @@ Proteinortho is a tool to detect orthologous proteins/genes within different spe
* **(ii) Cluster the RBH**
| Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits.
| The resulting connected components are outputted in orthology-groups / -pairs
| A spectral clustering algorithm is used to remove weak connections, reducing false positives.
| The connected components from this process are output as orthology groups or pairs.
----
Expand Down Expand Up @@ -322,41 +334,70 @@ Proteinortho is a tool to detect orthologous proteins/genes within different spe
| The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups.
| Every line corresponds to an orthology group.
| The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general.
| The first 3 columns characterize the general properties of that group: number of proteins, species, and algebraic connectivity. The higher the algebraic connectivity, the more edges there are and the better the group is connected to itself.
| Then a column for each species follows containing the proteins of these species.
| If a species contributes with more than one protein to a group of orthologs, then they are ordered by descending connectivity.
| The '*' represents that this species does not contribute to the group.
.. csv-table::
Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,ebola.faa
Species,Genes,alg.-conn.,ecoli.faa,human.faa,snail.faa,wale.faa,mouse.faa
5,5,0.715,C_10,C_10;test,E_10,L_10,M_10
4,6,0.115,*,C_12,E_315,L_313,M_313
4,5,0.167,*,C_63,E_19,L_19,M_19
4,4,0.816,*,C_64,E_18,L_18,M_18
----
| The first group is comprised of 5 proteins of 5 species: 'C_10' of ecoli.faa, 'C_10;test' of human.faa, 'E_10' of snail.faa, 'L_10' of wale.faa, and 'M_10' of mouse.faa.
| The alg.-conn. (algebraic connectivity) of 0.715 indicates the connectivity of this group, the higher the more edges are connecting these 5 proteins (at most there can be 10 and at least there need to be 4).
| The second group contains 6 proteins distributed over 4 species. The star indicates the species where no protein was found (in this case ecoli.faa).
.. csv-table::
seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
# ecoli.faa,human.faa
# 1.91e-112,357.5,1.825e-113,360
L_10,C_10;test,4.32e-151,447,4.30e-151,446
L_11,C_11,1.17e-68,209,3.00e-69,210
L_14,C_14,3.64e-139,422,1.19e-142,431
L_15,C_15,3.51e-100,303,2.12e-102,308
L_16,C_16,3.75e-49,157,7.06e-50,159
L_17,C_17,2.96e-195,578,5.50e-196,579
----
* **orthology-pairs**
| The same as orthology-groups but every edge is printed one-by-one instead of the whole group. The output is formatted the same as the RBH graph:
| Similar to orthology groups, but each edge is printed individually.
| The output is formatted the same as the RBH graph.
| For example, extracting all hits of the second group of the example orthology-group output ('4,6,0.115,*,C_12,E_315,L_313,M_313') using grep (-E, regular expression="(C_12|E_315|L_313|M_313).*(C_12|E_315|L_313|M_313)", input file=proteinortho-graph) would reveal all edges of this group:
.. csv-table::
seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba
M_313,C_12,1.18e-115,407,6.12e-116,407
C_12,E_315,4.50e-127,445,4.09e-127,445
L_313,M_313,0.00e+00,1368,0.00e+00,1368
L_313,C_12,3.76e-114,402,1.94e-114,402
----
| Especially L_313 and M_313 are very similar, probably identical.
| The group contains 4 edges out of the 6 possible edges for a group of 4 proteins. The missing edges are M_313-E_315 as well as L_313-E_315. This means that E_315 is only connected to the other 3 proteins via C_12 and thus could be considered as a weak link in the group.
**Proteinortho-Tools for downstream analysis**
* `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file or a subset (e.g. filter by Species>10).
* `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other.
More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
**Citations:**
]]>
</help>
<expand macro="citations" /> <!--- TODO: citations are not working in usegalxy, therefore they are added manually at the above. -->
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>
6 changes: 5 additions & 1 deletion tools/proteinortho/proteinortho_grab_proteins.xml
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,9 @@ proteinortho_grab_proteins : find gene(s)/protein(s) in a given fasta file and r
More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
]]>
</help>
<expand macro="citations"/>
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>
22 changes: 8 additions & 14 deletions tools/proteinortho/proteinortho_macros.xml
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
<?xml version="1.0"?>
<macros>
<token name="@TOOL_VERSION@">6.3.3</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="citations">
<citations>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
</citations>
</xml>
<token name="@TOOL_VERSION@">6.3.4</token>
<token name="@WRAPPER_VERSION@">0</token>
<token name="@PROFILE@">22.05</token>
<xml name="biotools">
<xrefs>
<xref type="bio.tools">proteinortho</xref>
Expand All @@ -18,10 +11,11 @@
<xml name="requirements">
<requirements>
<requirement type="package" version="@TOOL_VERSION@">proteinortho</requirement>
<requirement type="package" version="2.1.10">diamond</requirement>
<requirement type="package" version="2.16.0">blast</requirement>
<requirement type="package" version="472">ucsc-blat</requirement>
<requirement type="package" version="1595">last</requirement>
<requirement type="package" version="2.1.8">diamond</requirement>
<requirement type="package" version="2.15.0">blast</requirement>
<requirement type="package" version="445">ucsc-blat</requirement>
<requirement type="package" version="1519">last</requirement>
<requirement type="package" version="16.747c6">mmseqs2</requirement>
</requirements>
</xml>
<xml name="version_command">
Expand Down
6 changes: 5 additions & 1 deletion tools/proteinortho/proteinortho_summary.xml
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,9 @@ Or given 2 orthology-pairs from the same set of fasta files with different param
More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho
]]>
</help>
<expand macro="citations"/>
<citations>
<citation type="doi">10.3389/fbinf.2023.1322477</citation>
<citation type="doi">10.1186/1471-2105-12-124</citation>
<citation type="doi">10.1371/journal.pone.0105015</citation>
</citations>
</tool>

0 comments on commit fda6f17

Please sign in to comment.