From 259472f4f10023a44468061294684fa71258d29f Mon Sep 17 00:00:00 2001 From: Wolfgang Maier Date: Thu, 8 Aug 2024 21:18:57 +0200 Subject: [PATCH] Add new hierachical clustering tool based on scipy (#6214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add new hierachical clustering tool * Describe input format in tool help * Add .shed.yml * Add edam information * Fix flake8 issues * Add note about test data origin * Try to fix tests * Update tools/clustering_from_distmat/.shed.yml * Write column names to cluster assignment output --------- Co-authored-by: Björn Grüning --- tools/clustering_from_distmat/.shed.yml | 8 + .../clustering_from_distmat.py | 147 ++++++++++++++++++ .../clustering_from_distmat.xml | 111 +++++++++++++ .../test-data/test_assignment_average_h18.tsv | 6 + .../test-data/test_assignment_average_n4.tsv | 6 + .../test-data/test_matrix.tsv | 6 + .../test-data/test_tree_average.newick | 1 + .../test-data/test_tree_complete.newick | 1 + 8 files changed, 286 insertions(+) create mode 100644 tools/clustering_from_distmat/.shed.yml create mode 100644 tools/clustering_from_distmat/clustering_from_distmat.py create mode 100644 tools/clustering_from_distmat/clustering_from_distmat.xml create mode 100644 tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv create mode 100644 tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv create mode 100644 tools/clustering_from_distmat/test-data/test_matrix.tsv create mode 100644 tools/clustering_from_distmat/test-data/test_tree_average.newick create mode 100644 tools/clustering_from_distmat/test-data/test_tree_complete.newick diff --git a/tools/clustering_from_distmat/.shed.yml b/tools/clustering_from_distmat/.shed.yml new file mode 100644 index 00000000000..548f28d85f7 --- /dev/null +++ b/tools/clustering_from_distmat/.shed.yml @@ -0,0 +1,8 @@ +name: clustering_from_distmat +owner: iuc +description: Distance matrix-based hierarchical clustering using SciPy +remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ +long_description: Distance matrix-based hierarchical clustering using SciPy +homepage_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ +categories: + - Statistics diff --git a/tools/clustering_from_distmat/clustering_from_distmat.py b/tools/clustering_from_distmat/clustering_from_distmat.py new file mode 100644 index 00000000000..e253be69845 --- /dev/null +++ b/tools/clustering_from_distmat/clustering_from_distmat.py @@ -0,0 +1,147 @@ +import argparse +import sys + +import scipy + + +def linkage_as_newick(linkage, tip_names): + newick_parts = tip_names[::] + within_cluster_distances = [0] * len(tip_names) + for step in linkage: + n1 = int(step[0]) + n2 = int(step[1]) + d = float(step[2]) + d1 = d - within_cluster_distances[n1] + d2 = d - within_cluster_distances[n2] + id1 = newick_parts[n1] + id2 = newick_parts[n2] + part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})' + within_cluster_distances.append(d) + newick_parts.append(part) + return newick_parts[-1].format(*newick_parts) + ';' + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + 'infile', + help='Distance matrix input file' + ) + parser.add_argument( + 'out_prefix', + help="Output prefix" + ) + parser.add_argument + parser.add_argument( + '-m', '--method', default="average", + choices=[ + "single", + "complete", + "average", + "weighted", + "centroid", + "median", + "ward" + ], + help="Clustering method to use" + ) + cut_mode = parser.add_mutually_exclusive_group() + cut_mode.add_argument( + "-n", "--n-clusters", nargs="*", type=int + ) + cut_mode.add_argument( + "--height", nargs="*", type=float + ) + args = parser.parse_args() + + # TO DO: + # - parse outputs to generate + + # read from input and check that + # we have been passed a symmetric distance matrix + with open(args.infile) as i: + col_names = next(i).rstrip("\n\r").split("\t")[1:] + col_count = len(col_names) + if not col_count: + sys.exit( + 'No data columns found. ' + 'This tool expects tabular input with column names on the first line ' + 'and a row name in the first column of each row followed by data columns.' + ) + row_count = 0 + matrix = [] + for line in i: + if not line.strip(): + # skip empty lines + continue + row_count += 1 + if row_count > col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' + 'but got more rows than columns.' + ) + row_name, *row_data = line.strip(" \n\r").split("\t") + col_name = col_names[row_count - 1] + if not row_name: + # tolerate omitted row names, use col name instead + row_name = col_name + if row_name != col_name: + sys.exit( + 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' + f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' + ) + if len(row_data) != col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with the same number of columns on each row, ' + f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' + ) + try: + matrix.append([float(x) for x in row_data]) + except ValueError as e: + sys.exit(str(e) + f' on row {row_count} ("{row_name}")') + if row_count < col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' + 'but got more columns than rows.' + ) + + # turn the distance matrix into "condensed" vector form + # this gives us further checks and raises ValueErrors if: + # - the values on the diagonal aren't zero + # - the upper and lower triangle of the matrix aren't identical + D = scipy.spatial.distance.squareform(matrix) + + # perform the requested clustering and retrieve the result as a linkage object + linkage = scipy.cluster.hierarchy.linkage(D, args.method) + + with open(args.out_prefix + '.tree.newick', 'w') as o: + o.write(linkage_as_newick(linkage, col_names)) + + # cut the tree as specified and report sample to cluster assignments + if args.n_clusters or args.height: + if args.n_clusters: + cut_values = args.n_clusters + colname_template = "cluster_id_n{}" + else: + cut_values = args.height + colname_template = "cluster_id_h{}" + header_cols = ["sample"] + [ + colname_template.format(x) for x in cut_values + ] + cluster_assignments = [] + for name, cluster_ids in zip( + col_names, + scipy.cluster.hierarchy.cut_tree( + linkage, + args.n_clusters, + args.height + ) + ): + cluster_assignments.append( + [name] + + [str(c + 1) for c in cluster_ids] + ) + with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: + print("\t".join(header_cols), file=o) + for ass in cluster_assignments: + print("\t".join(ass), file=o) diff --git a/tools/clustering_from_distmat/clustering_from_distmat.xml b/tools/clustering_from_distmat/clustering_from_distmat.xml new file mode 100644 index 00000000000..89963542dbe --- /dev/null +++ b/tools/clustering_from_distmat/clustering_from_distmat.xml @@ -0,0 +1,111 @@ + + using Scipy + + topic_2269 + topic_0084 + + + operation_3432 + + + python + scipy + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"] + + + cluster_assignment["select"] in ["n-cluster", "height"] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1038/s41592-019-0686-2 + + diff --git a/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv b/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv new file mode 100644 index 00000000000..43b9e6dbba5 --- /dev/null +++ b/tools/clustering_from_distmat/test-data/test_assignment_average_h18.tsv @@ -0,0 +1,6 @@ +sample cluster_id_h18.0 +a 1 +b 1 +c 2 +d 3 +e 4 diff --git a/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv b/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv new file mode 100644 index 00000000000..839fba70ae7 --- /dev/null +++ b/tools/clustering_from_distmat/test-data/test_assignment_average_n4.tsv @@ -0,0 +1,6 @@ +sample cluster_id_n4 +a 1 +b 1 +c 2 +d 3 +e 4 diff --git a/tools/clustering_from_distmat/test-data/test_matrix.tsv b/tools/clustering_from_distmat/test-data/test_matrix.tsv new file mode 100644 index 00000000000..a72c1acc644 --- /dev/null +++ b/tools/clustering_from_distmat/test-data/test_matrix.tsv @@ -0,0 +1,6 @@ + a b c d e +a 0 17 21 31 23 +b 17 0 30 34 21 +c 21 30 0 28 39 +d 31 34 28 0 43 +e 23 21 39 43 0 diff --git a/tools/clustering_from_distmat/test-data/test_tree_average.newick b/tools/clustering_from_distmat/test-data/test_tree_average.newick new file mode 100644 index 00000000000..84530f51416 --- /dev/null +++ b/tools/clustering_from_distmat/test-data/test_tree_average.newick @@ -0,0 +1 @@ +((e:11.0,(a:8.5,b:8.5):2.5):5.5,(c:14.0,d:14.0):2.5); \ No newline at end of file diff --git a/tools/clustering_from_distmat/test-data/test_tree_complete.newick b/tools/clustering_from_distmat/test-data/test_tree_complete.newick new file mode 100644 index 00000000000..bb436ddf845 --- /dev/null +++ b/tools/clustering_from_distmat/test-data/test_tree_complete.newick @@ -0,0 +1 @@ +((e:11.5,(a:8.5,b:8.5):3.0):10.0,(c:14.0,d:14.0):7.5); \ No newline at end of file