Skip to content

Commit

Permalink
Add new hierachical clustering tool based on scipy (galaxyproject#6214)
Browse files Browse the repository at this point in the history
* Add new hierachical clustering tool

* Describe input format in tool help

* Add .shed.yml

* Add edam information

* Fix flake8 issues

* Add note about test data origin

* Try to fix tests

* Update tools/clustering_from_distmat/.shed.yml

* Write column names to cluster assignment output

---------

Co-authored-by: Björn Grüning <[email protected]>
  • Loading branch information
2 people authored and nilchia committed Aug 24, 2024
1 parent b34f41d commit 259472f
Show file tree
Hide file tree
Showing 8 changed files with 286 additions and 0 deletions.
8 changes: 8 additions & 0 deletions tools/clustering_from_distmat/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: clustering_from_distmat
owner: iuc
description: Distance matrix-based hierarchical clustering using SciPy
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/
long_description: Distance matrix-based hierarchical clustering using SciPy
homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/
categories:
- Statistics
147 changes: 147 additions & 0 deletions tools/clustering_from_distmat/clustering_from_distmat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import argparse
import sys

import scipy


def linkage_as_newick(linkage, tip_names):
newick_parts = tip_names[::]
within_cluster_distances = [0] * len(tip_names)
for step in linkage:
n1 = int(step[0])
n2 = int(step[1])
d = float(step[2])
d1 = d - within_cluster_distances[n1]
d2 = d - within_cluster_distances[n2]
id1 = newick_parts[n1]
id2 = newick_parts[n2]
part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})'
within_cluster_distances.append(d)
newick_parts.append(part)
return newick_parts[-1].format(*newick_parts) + ';'


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'infile',
help='Distance matrix input file'
)
parser.add_argument(
'out_prefix',
help="Output prefix"
)
parser.add_argument
parser.add_argument(
'-m', '--method', default="average",
choices=[
"single",
"complete",
"average",
"weighted",
"centroid",
"median",
"ward"
],
help="Clustering method to use"
)
cut_mode = parser.add_mutually_exclusive_group()
cut_mode.add_argument(
"-n", "--n-clusters", nargs="*", type=int
)
cut_mode.add_argument(
"--height", nargs="*", type=float
)
args = parser.parse_args()

# TO DO:
# - parse outputs to generate

# read from input and check that
# we have been passed a symmetric distance matrix
with open(args.infile) as i:
col_names = next(i).rstrip("\n\r").split("\t")[1:]
col_count = len(col_names)
if not col_count:
sys.exit(
'No data columns found. '
'This tool expects tabular input with column names on the first line '
'and a row name in the first column of each row followed by data columns.'
)
row_count = 0
matrix = []
for line in i:
if not line.strip():
# skip empty lines
continue
row_count += 1
if row_count > col_count:
sys.exit(
'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
'but got more rows than columns.'
)
row_name, *row_data = line.strip(" \n\r").split("\t")
col_name = col_names[row_count - 1]
if not row_name:
# tolerate omitted row names, use col name instead
row_name = col_name
if row_name != col_name:
sys.exit(
'This tool expects a symmetric distance matrix with identical names for rows and columns, '
f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.'
)
if len(row_data) != col_count:
sys.exit(
'This tool expects a symmetric distance matrix with the same number of columns on each row, '
f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.'
)
try:
matrix.append([float(x) for x in row_data])
except ValueError as e:
sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
if row_count < col_count:
sys.exit(
'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
'but got more columns than rows.'
)

# turn the distance matrix into "condensed" vector form
# this gives us further checks and raises ValueErrors if:
# - the values on the diagonal aren't zero
# - the upper and lower triangle of the matrix aren't identical
D = scipy.spatial.distance.squareform(matrix)

# perform the requested clustering and retrieve the result as a linkage object
linkage = scipy.cluster.hierarchy.linkage(D, args.method)

with open(args.out_prefix + '.tree.newick', 'w') as o:
o.write(linkage_as_newick(linkage, col_names))

# cut the tree as specified and report sample to cluster assignments
if args.n_clusters or args.height:
if args.n_clusters:
cut_values = args.n_clusters
colname_template = "cluster_id_n{}"
else:
cut_values = args.height
colname_template = "cluster_id_h{}"
header_cols = ["sample"] + [
colname_template.format(x) for x in cut_values
]
cluster_assignments = []
for name, cluster_ids in zip(
col_names,
scipy.cluster.hierarchy.cut_tree(
linkage,
args.n_clusters,
args.height
)
):
cluster_assignments.append(
[name]
+ [str(c + 1) for c in cluster_ids]
)
with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
print("\t".join(header_cols), file=o)
for ass in cluster_assignments:
print("\t".join(ass), file=o)
111 changes: 111 additions & 0 deletions tools/clustering_from_distmat/clustering_from_distmat.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0">
<description>using Scipy</description>
<edam_topics>
<edam_topic>topic_2269</edam_topic>
<edam_topic>topic_0084</edam_topic>
</edam_topics>
<edam_operations>
<edam_operation>operation_3432</edam_operation>
</edam_operations>
<requirements>
<requirement type="package" version="3.12">python</requirement>
<requirement type="package" version="1.14.0">scipy</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/clustering_from_distmat.py'
'$distmat'
result
--method $method
#if str($cluster_assignment.select) == 'n-cluster':
--n-clusters $cluster_assignment.n_cluster
#elif str($cluster_assignment.select) == 'height':
--height $cluster_assignment.height
#end if
]]></command>
<inputs>
<param name="distmat" type="data" format="tabular" label="Distance matrix" />
<param name="method" type="select" label="Clustering method">
<option value="single">Nearest Point (scipy 'single' method)</option>
<option value="complete">Farthest Point (scipy 'complete' method)</option>
<option value="average" selected="true">UPGMA (scipy 'average' method)</option>
<option value="weighted">WPGMA (scipy 'weighted' method)</option>
<option value="centroid">UPGMC (scipy 'centroid' method)</option>
<option value="median">WPGMC (scipy 'median' method)</option>
<option value="ward">Ward/Incremental (scipy 'ward' method)</option>
</param>
<conditional name="cluster_assignment">
<param name="select" type="select" label="Generate cluster assignments?">
<option value="dendrogram-only">No, just generate the dendrogram of clustering results</option>
<option value="n-cluster">Yes, and divide into specified number of clusters </option>
<option value="height">Yes, and use distance threshold to divide into clusters</option>
</param>
<when value="dendrogram-only" />
<when value="n-cluster">
<param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" />
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
</when>
<when value="height">
<param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" />
<param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
</when>
</conditional>
</inputs>
<outputs>
<data name="clustering_dendrogram" format="newick" from_work_dir="result.tree.newick" label="${tool.name} on ${on_string}: Dendrogram">
<filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter>
</data>
<data name="clustering_assignment" format="tabular" from_work_dir="result.cluster_assignments.tsv" label="${tool.name} on ${on_string}: Cluster assignment">
<filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter>
</data>
</outputs>
<tests>
<!-- Test data and expected results taken from https://en.wikipedia.org/wiki/UPGMA#Working_example -->
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv"/>
<output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
</test>
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv" />
<param name="method" value="complete" />
<output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" />
</test>
<test expect_num_outputs="1">
<param name="distmat" value="test_matrix.tsv"/>
<conditional name="cluster_assignment">
<param name="select" value="height" />
<param name="height" value="18" />
</conditional>
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" />
</test>
<test expect_num_outputs="2">
<param name="distmat" value="test_matrix.tsv"/>
<conditional name="cluster_assignment">
<param name="select" value="n-cluster" />
<param name="n_cluster" value="4" />
<param name="generate_dendrogram" value="true" />
</conditional>
<output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
</test>
</tests>
<help><![CDATA[
.. class:: infomark
**What it does**
This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it.
As input it expects a symmetrical distance matrix with sample names on the first row and in the first column.
The clustering result can be reported in the form of a dendrogram in newick format.
Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function.
Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it.
.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html
]]></help>
<citations>
<citation type="doi">10.1038/s41592-019-0686-2</citation>
</citations>
</tool>
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample cluster_id_h18.0
a 1
b 1
c 2
d 3
e 4
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample cluster_id_n4
a 1
b 1
c 2
d 3
e 4
6 changes: 6 additions & 0 deletions tools/clustering_from_distmat/test-data/test_matrix.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
a b c d e
a 0 17 21 31 23
b 17 0 30 34 21
c 21 30 0 28 39
d 31 34 28 0 43
e 23 21 39 43 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((e:11.0,(a:8.5,b:8.5):2.5):5.5,(c:14.0,d:14.0):2.5);
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((e:11.5,(a:8.5,b:8.5):3.0):10.0,(c:14.0,d:14.0):7.5);

0 comments on commit 259472f

Please sign in to comment.