-
Notifications
You must be signed in to change notification settings - Fork 441
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new hierachical clustering tool based on scipy (#6214)
* Add new hierachical clustering tool * Describe input format in tool help * Add .shed.yml * Add edam information * Fix flake8 issues * Add note about test data origin * Try to fix tests * Update tools/clustering_from_distmat/.shed.yml * Write column names to cluster assignment output --------- Co-authored-by: Björn Grüning <[email protected]>
- Loading branch information
Showing
8 changed files
with
286 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
name: clustering_from_distmat | ||
owner: iuc | ||
description: Distance matrix-based hierarchical clustering using SciPy | ||
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ | ||
long_description: Distance matrix-based hierarchical clustering using SciPy | ||
homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ | ||
categories: | ||
- Statistics |
147 changes: 147 additions & 0 deletions
147
tools/clustering_from_distmat/clustering_from_distmat.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import argparse | ||
import sys | ||
|
||
import scipy | ||
|
||
|
||
def linkage_as_newick(linkage, tip_names): | ||
newick_parts = tip_names[::] | ||
within_cluster_distances = [0] * len(tip_names) | ||
for step in linkage: | ||
n1 = int(step[0]) | ||
n2 = int(step[1]) | ||
d = float(step[2]) | ||
d1 = d - within_cluster_distances[n1] | ||
d2 = d - within_cluster_distances[n2] | ||
id1 = newick_parts[n1] | ||
id2 = newick_parts[n2] | ||
part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})' | ||
within_cluster_distances.append(d) | ||
newick_parts.append(part) | ||
return newick_parts[-1].format(*newick_parts) + ';' | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument( | ||
'infile', | ||
help='Distance matrix input file' | ||
) | ||
parser.add_argument( | ||
'out_prefix', | ||
help="Output prefix" | ||
) | ||
parser.add_argument | ||
parser.add_argument( | ||
'-m', '--method', default="average", | ||
choices=[ | ||
"single", | ||
"complete", | ||
"average", | ||
"weighted", | ||
"centroid", | ||
"median", | ||
"ward" | ||
], | ||
help="Clustering method to use" | ||
) | ||
cut_mode = parser.add_mutually_exclusive_group() | ||
cut_mode.add_argument( | ||
"-n", "--n-clusters", nargs="*", type=int | ||
) | ||
cut_mode.add_argument( | ||
"--height", nargs="*", type=float | ||
) | ||
args = parser.parse_args() | ||
|
||
# TO DO: | ||
# - parse outputs to generate | ||
|
||
# read from input and check that | ||
# we have been passed a symmetric distance matrix | ||
with open(args.infile) as i: | ||
col_names = next(i).rstrip("\n\r").split("\t")[1:] | ||
col_count = len(col_names) | ||
if not col_count: | ||
sys.exit( | ||
'No data columns found. ' | ||
'This tool expects tabular input with column names on the first line ' | ||
'and a row name in the first column of each row followed by data columns.' | ||
) | ||
row_count = 0 | ||
matrix = [] | ||
for line in i: | ||
if not line.strip(): | ||
# skip empty lines | ||
continue | ||
row_count += 1 | ||
if row_count > col_count: | ||
sys.exit( | ||
'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | ||
'but got more rows than columns.' | ||
) | ||
row_name, *row_data = line.strip(" \n\r").split("\t") | ||
col_name = col_names[row_count - 1] | ||
if not row_name: | ||
# tolerate omitted row names, use col name instead | ||
row_name = col_name | ||
if row_name != col_name: | ||
sys.exit( | ||
'This tool expects a symmetric distance matrix with identical names for rows and columns, ' | ||
f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' | ||
) | ||
if len(row_data) != col_count: | ||
sys.exit( | ||
'This tool expects a symmetric distance matrix with the same number of columns on each row, ' | ||
f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' | ||
) | ||
try: | ||
matrix.append([float(x) for x in row_data]) | ||
except ValueError as e: | ||
sys.exit(str(e) + f' on row {row_count} ("{row_name}")') | ||
if row_count < col_count: | ||
sys.exit( | ||
'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | ||
'but got more columns than rows.' | ||
) | ||
|
||
# turn the distance matrix into "condensed" vector form | ||
# this gives us further checks and raises ValueErrors if: | ||
# - the values on the diagonal aren't zero | ||
# - the upper and lower triangle of the matrix aren't identical | ||
D = scipy.spatial.distance.squareform(matrix) | ||
|
||
# perform the requested clustering and retrieve the result as a linkage object | ||
linkage = scipy.cluster.hierarchy.linkage(D, args.method) | ||
|
||
with open(args.out_prefix + '.tree.newick', 'w') as o: | ||
o.write(linkage_as_newick(linkage, col_names)) | ||
|
||
# cut the tree as specified and report sample to cluster assignments | ||
if args.n_clusters or args.height: | ||
if args.n_clusters: | ||
cut_values = args.n_clusters | ||
colname_template = "cluster_id_n{}" | ||
else: | ||
cut_values = args.height | ||
colname_template = "cluster_id_h{}" | ||
header_cols = ["sample"] + [ | ||
colname_template.format(x) for x in cut_values | ||
] | ||
cluster_assignments = [] | ||
for name, cluster_ids in zip( | ||
col_names, | ||
scipy.cluster.hierarchy.cut_tree( | ||
linkage, | ||
args.n_clusters, | ||
args.height | ||
) | ||
): | ||
cluster_assignments.append( | ||
[name] | ||
+ [str(c + 1) for c in cluster_ids] | ||
) | ||
with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: | ||
print("\t".join(header_cols), file=o) | ||
for ass in cluster_assignments: | ||
print("\t".join(ass), file=o) |
111 changes: 111 additions & 0 deletions
111
tools/clustering_from_distmat/clustering_from_distmat.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0"> | ||
<description>using Scipy</description> | ||
<edam_topics> | ||
<edam_topic>topic_2269</edam_topic> | ||
<edam_topic>topic_0084</edam_topic> | ||
</edam_topics> | ||
<edam_operations> | ||
<edam_operation>operation_3432</edam_operation> | ||
</edam_operations> | ||
<requirements> | ||
<requirement type="package" version="3.12">python</requirement> | ||
<requirement type="package" version="1.14.0">scipy</requirement> | ||
</requirements> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
python '$__tool_directory__/clustering_from_distmat.py' | ||
'$distmat' | ||
result | ||
--method $method | ||
#if str($cluster_assignment.select) == 'n-cluster': | ||
--n-clusters $cluster_assignment.n_cluster | ||
#elif str($cluster_assignment.select) == 'height': | ||
--height $cluster_assignment.height | ||
#end if | ||
]]></command> | ||
<inputs> | ||
<param name="distmat" type="data" format="tabular" label="Distance matrix" /> | ||
<param name="method" type="select" label="Clustering method"> | ||
<option value="single">Nearest Point (scipy 'single' method)</option> | ||
<option value="complete">Farthest Point (scipy 'complete' method)</option> | ||
<option value="average" selected="true">UPGMA (scipy 'average' method)</option> | ||
<option value="weighted">WPGMA (scipy 'weighted' method)</option> | ||
<option value="centroid">UPGMC (scipy 'centroid' method)</option> | ||
<option value="median">WPGMC (scipy 'median' method)</option> | ||
<option value="ward">Ward/Incremental (scipy 'ward' method)</option> | ||
</param> | ||
<conditional name="cluster_assignment"> | ||
<param name="select" type="select" label="Generate cluster assignments?"> | ||
<option value="dendrogram-only">No, just generate the dendrogram of clustering results</option> | ||
<option value="n-cluster">Yes, and divide into specified number of clusters </option> | ||
<option value="height">Yes, and use distance threshold to divide into clusters</option> | ||
</param> | ||
<when value="dendrogram-only" /> | ||
<when value="n-cluster"> | ||
<param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" /> | ||
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> | ||
</when> | ||
<when value="height"> | ||
<param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" /> | ||
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> | ||
</when> | ||
</conditional> | ||
</inputs> | ||
<outputs> | ||
<data name="clustering_dendrogram" format="newick" from_work_dir="result.tree.newick" label="${tool.name} on ${on_string}: Dendrogram"> | ||
<filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter> | ||
</data> | ||
<data name="clustering_assignment" format="tabular" from_work_dir="result.cluster_assignments.tsv" label="${tool.name} on ${on_string}: Cluster assignment"> | ||
<filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter> | ||
</data> | ||
</outputs> | ||
<tests> | ||
<!-- Test data and expected results taken from https://en.wikipedia.org/wiki/UPGMA#Working_example --> | ||
<test expect_num_outputs="1"> | ||
<param name="distmat" value="test_matrix.tsv"/> | ||
<output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" /> | ||
</test> | ||
<test expect_num_outputs="1"> | ||
<param name="distmat" value="test_matrix.tsv" /> | ||
<param name="method" value="complete" /> | ||
<output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" /> | ||
</test> | ||
<test expect_num_outputs="1"> | ||
<param name="distmat" value="test_matrix.tsv"/> | ||
<conditional name="cluster_assignment"> | ||
<param name="select" value="height" /> | ||
<param name="height" value="18" /> | ||
</conditional> | ||
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" /> | ||
</test> | ||
<test expect_num_outputs="2"> | ||
<param name="distmat" value="test_matrix.tsv"/> | ||
<conditional name="cluster_assignment"> | ||
<param name="select" value="n-cluster" /> | ||
<param name="n_cluster" value="4" /> | ||
<param name="generate_dendrogram" value="true" /> | ||
</conditional> | ||
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" /> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
.. class:: infomark | ||
**What it does** | ||
This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it. | ||
As input it expects a symmetrical distance matrix with sample names on the first row and in the first column. | ||
The clustering result can be reported in the form of a dendrogram in newick format. | ||
Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function. | ||
Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it. | ||
.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html | ||
.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html | ||
]]></help> | ||
<citations> | ||
<citation type="doi">10.1038/s41592-019-0686-2</citation> | ||
</citations> | ||
</tool> |
6 changes: 6 additions & 0 deletions
6
tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
sample cluster_id_h18.0 | ||
a 1 | ||
b 1 | ||
c 2 | ||
d 3 | ||
e 4 |
6 changes: 6 additions & 0 deletions
6
tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
sample cluster_id_n4 | ||
a 1 | ||
b 1 | ||
c 2 | ||
d 3 | ||
e 4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
a b c d e | ||
a 0 17 21 31 23 | ||
b 17 0 30 34 21 | ||
c 21 30 0 28 39 | ||
d 31 34 28 0 43 | ||
e 23 21 39 43 0 |
1 change: 1 addition & 0 deletions
1
tools/clustering_from_distmat/test-data/test_tree_average.newick
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
((e:11.0,(a:8.5,b:8.5):2.5):5.5,(c:14.0,d:14.0):2.5); |
1 change: 1 addition & 0 deletions
1
tools/clustering_from_distmat/test-data/test_tree_complete.newick
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
((e:11.5,(a:8.5,b:8.5):3.0):10.0,(c:14.0,d:14.0):7.5); |