From 259472f4f10023a44468061294684fa71258d29f Mon Sep 17 00:00:00 2001
From: Wolfgang Maier <maierw@informatik.uni-freiburg.de>
Date: Thu, 8 Aug 2024 21:18:57 +0200
Subject: [PATCH] Add new hierachical clustering tool based on scipy (#6214)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add new hierachical clustering tool

* Describe input format in tool help

* Add .shed.yml

* Add edam information

* Fix flake8 issues

* Add note about test data origin

* Try to fix tests

* Update tools/clustering_from_distmat/.shed.yml

* Write column names to cluster assignment output

---------

Co-authored-by: Björn Grüning <bjoern@gruenings.eu>
---
 tools/clustering_from_distmat/.shed.yml       |   8 +
 .../clustering_from_distmat.py                | 147 ++++++++++++++++++
 .../clustering_from_distmat.xml               | 111 +++++++++++++
 .../test-data/test_assignment_average_h18.tsv |   6 +
 .../test-data/test_assignment_average_n4.tsv  |   6 +
 .../test-data/test_matrix.tsv                 |   6 +
 .../test-data/test_tree_average.newick        |   1 +
 .../test-data/test_tree_complete.newick       |   1 +
 8 files changed, 286 insertions(+)
 create mode 100644 tools/clustering_from_distmat/.shed.yml
 create mode 100644 tools/clustering_from_distmat/clustering_from_distmat.py
 create mode 100644 tools/clustering_from_distmat/clustering_from_distmat.xml
 create mode 100644 tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv
 create mode 100644 tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv
 create mode 100644 tools/clustering_from_distmat/test-data/test_matrix.tsv
 create mode 100644 tools/clustering_from_distmat/test-data/test_tree_average.newick
 create mode 100644 tools/clustering_from_distmat/test-data/test_tree_complete.newick

diff --git a/tools/clustering_from_distmat/.shed.yml b/tools/clustering_from_distmat/.shed.yml
new file mode 100644
index 00000000000..548f28d85f7
--- /dev/null
+++ b/tools/clustering_from_distmat/.shed.yml
@@ -0,0 +1,8 @@
+name: clustering_from_distmat
+owner: iuc
+description: Distance matrix-based hierarchical clustering using SciPy
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/
+long_description: Distance matrix-based hierarchical clustering using SciPy
+homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/
+categories:
+  - Statistics
diff --git a/tools/clustering_from_distmat/clustering_from_distmat.py b/tools/clustering_from_distmat/clustering_from_distmat.py
new file mode 100644
index 00000000000..e253be69845
--- /dev/null
+++ b/tools/clustering_from_distmat/clustering_from_distmat.py
@@ -0,0 +1,147 @@
+import argparse
+import sys
+
+import scipy
+
+
+def linkage_as_newick(linkage, tip_names):
+    newick_parts = tip_names[::]
+    within_cluster_distances = [0] * len(tip_names)
+    for step in linkage:
+        n1 = int(step[0])
+        n2 = int(step[1])
+        d = float(step[2])
+        d1 = d - within_cluster_distances[n1]
+        d2 = d - within_cluster_distances[n2]
+        id1 = newick_parts[n1]
+        id2 = newick_parts[n2]
+        part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})'
+        within_cluster_distances.append(d)
+        newick_parts.append(part)
+    return newick_parts[-1].format(*newick_parts) + ';'
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'infile',
+        help='Distance matrix input file'
+    )
+    parser.add_argument(
+        'out_prefix',
+        help="Output prefix"
+    )
+    parser.add_argument
+    parser.add_argument(
+        '-m', '--method', default="average",
+        choices=[
+            "single",
+            "complete",
+            "average",
+            "weighted",
+            "centroid",
+            "median",
+            "ward"
+        ],
+        help="Clustering method to use"
+    )
+    cut_mode = parser.add_mutually_exclusive_group()
+    cut_mode.add_argument(
+        "-n", "--n-clusters", nargs="*", type=int
+    )
+    cut_mode.add_argument(
+        "--height", nargs="*", type=float
+    )
+    args = parser.parse_args()
+
+    # TO DO:
+    # - parse outputs to generate
+
+    # read from input and check that
+    # we have been passed a symmetric distance matrix
+    with open(args.infile) as i:
+        col_names = next(i).rstrip("\n\r").split("\t")[1:]
+        col_count = len(col_names)
+        if not col_count:
+            sys.exit(
+                'No data columns found. '
+                'This tool expects tabular input with column names on the first line '
+                'and a row name in the first column of each row followed by data columns.'
+            )
+        row_count = 0
+        matrix = []
+        for line in i:
+            if not line.strip():
+                # skip empty lines
+                continue
+            row_count += 1
+            if row_count > col_count:
+                sys.exit(
+                    'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
+                    'but got more rows than columns.'
+                )
+            row_name, *row_data = line.strip(" \n\r").split("\t")
+            col_name = col_names[row_count - 1]
+            if not row_name:
+                # tolerate omitted row names, use col name instead
+                row_name = col_name
+            if row_name != col_name:
+                sys.exit(
+                    'This tool expects a symmetric distance matrix with identical names for rows and columns, '
+                    f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.'
+                )
+            if len(row_data) != col_count:
+                sys.exit(
+                    'This tool expects a symmetric distance matrix with the same number of columns on each row, '
+                    f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.'
+                )
+            try:
+                matrix.append([float(x) for x in row_data])
+            except ValueError as e:
+                sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
+    if row_count < col_count:
+        sys.exit(
+            'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
+            'but got more columns than rows.'
+        )
+
+    # turn the distance matrix into "condensed" vector form
+    # this gives us further checks and raises ValueErrors if:
+    # - the values on the diagonal aren't zero
+    # - the upper and lower triangle of the matrix aren't identical
+    D = scipy.spatial.distance.squareform(matrix)
+
+    # perform the requested clustering and retrieve the result as a linkage object
+    linkage = scipy.cluster.hierarchy.linkage(D, args.method)
+
+    with open(args.out_prefix + '.tree.newick', 'w') as o:
+        o.write(linkage_as_newick(linkage, col_names))
+
+    # cut the tree as specified and report sample to cluster assignments
+    if args.n_clusters or args.height:
+        if args.n_clusters:
+            cut_values = args.n_clusters
+            colname_template = "cluster_id_n{}"
+        else:
+            cut_values = args.height
+            colname_template = "cluster_id_h{}"
+        header_cols = ["sample"] + [
+            colname_template.format(x) for x in cut_values
+        ]
+        cluster_assignments = []
+        for name, cluster_ids in zip(
+            col_names,
+            scipy.cluster.hierarchy.cut_tree(
+                linkage,
+                args.n_clusters,
+                args.height
+            )
+        ):
+            cluster_assignments.append(
+                [name]
+                + [str(c + 1) for c in cluster_ids]
+            )
+        with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
+            print("\t".join(header_cols), file=o)
+            for ass in cluster_assignments:
+                print("\t".join(ass), file=o)
diff --git a/tools/clustering_from_distmat/clustering_from_distmat.xml b/tools/clustering_from_distmat/clustering_from_distmat.xml
new file mode 100644
index 00000000000..89963542dbe
--- /dev/null
+++ b/tools/clustering_from_distmat/clustering_from_distmat.xml
@@ -0,0 +1,111 @@
+<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0">
+    <description>using Scipy</description>
+    <edam_topics>
+        <edam_topic>topic_2269</edam_topic>
+        <edam_topic>topic_0084</edam_topic>
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_3432</edam_operation>
+    </edam_operations>
+    <requirements>
+        <requirement type="package" version="3.12">python</requirement>
+        <requirement type="package" version="1.14.0">scipy</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/clustering_from_distmat.py'
+  '$distmat'
+  result
+  --method $method
+  #if str($cluster_assignment.select) == 'n-cluster':
+    --n-clusters $cluster_assignment.n_cluster
+  #elif str($cluster_assignment.select) == 'height':
+    --height $cluster_assignment.height
+  #end if
+    ]]></command>
+    <inputs>
+        <param name="distmat" type="data" format="tabular" label="Distance matrix" />
+        <param name="method" type="select" label="Clustering method">
+            <option value="single">Nearest Point (scipy 'single' method)</option>
+            <option value="complete">Farthest Point (scipy 'complete' method)</option>
+            <option value="average" selected="true">UPGMA (scipy 'average' method)</option>
+            <option value="weighted">WPGMA (scipy 'weighted' method)</option>
+            <option value="centroid">UPGMC (scipy 'centroid' method)</option>
+            <option value="median">WPGMC (scipy 'median' method)</option>
+            <option value="ward">Ward/Incremental (scipy 'ward' method)</option>
+        </param>
+        <conditional name="cluster_assignment">
+            <param name="select" type="select" label="Generate cluster assignments?">
+                <option value="dendrogram-only">No, just generate the dendrogram of clustering results</option>
+                <option value="n-cluster">Yes, and divide into specified number of clusters </option>
+                <option value="height">Yes, and use distance threshold to divide into clusters</option>
+            </param>
+            <when value="dendrogram-only" />
+            <when value="n-cluster">
+                <param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" />
+                <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
+            </when>
+            <when value="height">
+                <param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" />
+                <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="clustering_dendrogram" format="newick" from_work_dir="result.tree.newick" label="${tool.name} on ${on_string}: Dendrogram">
+            <filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter>
+        </data>
+        <data name="clustering_assignment" format="tabular" from_work_dir="result.cluster_assignments.tsv" label="${tool.name} on ${on_string}: Cluster assignment">
+            <filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test data and expected results taken from https://en.wikipedia.org/wiki/UPGMA#Working_example -->
+        <test expect_num_outputs="1">
+            <param name="distmat" value="test_matrix.tsv"/>
+            <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="distmat" value="test_matrix.tsv" />
+            <param name="method" value="complete" />
+            <output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" />
+        </test>
+        <test expect_num_outputs="1">
+            <param name="distmat" value="test_matrix.tsv"/>
+            <conditional name="cluster_assignment">
+                <param name="select" value="height" />
+                <param name="height" value="18" />
+            </conditional>
+            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="distmat" value="test_matrix.tsv"/>
+            <conditional name="cluster_assignment">
+                <param name="select" value="n-cluster" />
+                <param name="n_cluster" value="4" />
+                <param name="generate_dendrogram" value="true" />
+            </conditional>
+            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it.
+
+As input it expects a symmetrical distance matrix with sample names on the first row and in the first column.
+
+The clustering result can be reported in the form of a dendrogram in newick format.
+
+Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function.
+Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it.
+
+.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/s41592-019-0686-2</citation>
+    </citations>
+</tool>
diff --git a/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv b/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv
new file mode 100644
index 00000000000..43b9e6dbba5
--- /dev/null
+++ b/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv
@@ -0,0 +1,6 @@
+sample	cluster_id_h18.0
+a	1
+b	1
+c	2
+d	3
+e	4
diff --git a/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv b/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv
new file mode 100644
index 00000000000..839fba70ae7
--- /dev/null
+++ b/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv
@@ -0,0 +1,6 @@
+sample	cluster_id_n4
+a	1
+b	1
+c	2
+d	3
+e	4
diff --git a/tools/clustering_from_distmat/test-data/test_matrix.tsv b/tools/clustering_from_distmat/test-data/test_matrix.tsv
new file mode 100644
index 00000000000..a72c1acc644
--- /dev/null
+++ b/tools/clustering_from_distmat/test-data/test_matrix.tsv
@@ -0,0 +1,6 @@
+ 	a	b	c	d	e
+a	0	17	21	31	23
+b	17	0	30	34	21
+c	21	30	0	28	39
+d	31	34	28	0	43
+e	23	21	39	43	0
diff --git a/tools/clustering_from_distmat/test-data/test_tree_average.newick b/tools/clustering_from_distmat/test-data/test_tree_average.newick
new file mode 100644
index 00000000000..84530f51416
--- /dev/null
+++ b/tools/clustering_from_distmat/test-data/test_tree_average.newick
@@ -0,0 +1 @@
+((e:11.0,(a:8.5,b:8.5):2.5):5.5,(c:14.0,d:14.0):2.5);
\ No newline at end of file
diff --git a/tools/clustering_from_distmat/test-data/test_tree_complete.newick b/tools/clustering_from_distmat/test-data/test_tree_complete.newick
new file mode 100644
index 00000000000..bb436ddf845
--- /dev/null
+++ b/tools/clustering_from_distmat/test-data/test_tree_complete.newick
@@ -0,0 +1 @@
+((e:11.5,(a:8.5,b:8.5):3.0):10.0,(c:14.0,d:14.0):7.5);
\ No newline at end of file