-
Notifications
You must be signed in to change notification settings - Fork 6
/
single_linkage_clustering.py
executable file
·61 lines (59 loc) · 1.97 KB
/
single_linkage_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
'''
Perform single-linkage clustering on pairwise genetic distances
'''
# parse args
from gzip import open as gopen
from sys import stdin,stdout
import argparse
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', required=False, type=str, default='stdin', help="Input Genetic Distances (TSV)")
parser.add_argument('-t', '--threshold', required=False, type=float, default=float('inf'), help="Distance Threshold")
parser.add_argument('-o', '--output', required=False, type=str, default='stdout', help="Output Clusters (TSV)")
args,unknown = parser.parse_known_args()
assert args.threshold >= 0, "ERROR: Length threshold must be at least 0"
if args.input == 'stdin':
args.input = stdin
elif args.input.lower().endswith('.gz'):
args.input = gopen(args.input)
else:
args.input = open(args.input)
if args.output == 'stdout':
args.output = stdout
else:
args.output = open(args.output,'w')
# build graph
g = {} # g[node] = set of neighbors of node
for line in args.input:
if isinstance(line,bytes):
l = line.decode().strip()
else:
l = line.strip()
if l == 'ID1\tID2\tDistance':
continue
u,v,d = l.split('\t')
if u not in g:
g[u] = set()
if v not in g:
g[v] = set()
if float(d) > args.threshold:
continue
g[u].add(v); g[v].add(u)
# output clusters
from queue import Queue
explore = set(g.keys()); CLUST = 0
args.output.write("SequenceName\tClusterNumber\n")
while len(explore) != 0:
curr = explore.pop(); cluster = set()
q = Queue(); q.put(curr)
while not q.empty():
curr = q.get(); cluster.add(curr)
for n in g[curr]:
if n in explore:
q.put(n); explore.remove(n)
if len(cluster) == 1:
args.output.write('%s\t-1\n' % cluster.pop())
else:
CLUST += 1
for n in cluster:
args.output.write('%s\t%d\n' % (n,CLUST))