-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering_poc_sim.py
58 lines (47 loc) · 1.58 KB
/
clustering_poc_sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
from sklearn.cluster import AffinityPropagation
from scala.bqp.algos.cluster_cold_single import solve_ccs_bqp
def _load_similarity_matrix(path):
    """Read a tab-separated matrix file into row names and a float array.

    Every non-blank line contributes its first column as the row name and
    the remaining columns, parsed as floats, as that row's values.

    Args:
        path: Location of the TSV file.

    Returns:
        A ``(names, matrix)`` tuple: the first-column entries in file order
        and a NumPy array built from the remaining columns.

    Raises:
        ValueError: If a value after the first column cannot be parsed as a
            float.
    """
    names, rows = [], []
    with open(path, "r") as handle:
        # Iterate the handle directly instead of materialising readlines().
        for line in handle:
            parts = line.strip().split("\t")
            if parts == [""]:
                # A blank (e.g. trailing) line would otherwise add an empty
                # row and make the matrix ragged.
                continue
            names.append(parts[0])
            rows.append([float(x) for x in parts[1:]])
    return names, np.array(rows)


def _inter_cluster_similarity(labels, sims, n_clusters):
    """Average the pairwise values between every pair of distinct clusters.

    Only the strict upper triangle of ``sims`` (entries ``[i, j]`` with
    ``i < j``) is read, and each such entry is credited to both
    ``[label_i, label_j]`` and ``[label_j, label_i]``, so the result is
    symmetric. Pairs inside the same cluster are ignored, which leaves the
    diagonal at zero.

    Args:
        labels: Non-negative integer cluster id per item.
        sims: Square item-by-item matrix, ordered like ``labels``.
        n_clusters: Number of clusters; ids must lie in ``[0, n_clusters)``.

    Returns:
        A ``(means, pair_counts)`` tuple of ``n_clusters x n_clusters``
        arrays: the mean value per cluster pair and the number of item
        pairs that went into it.
    """
    labels = np.asarray(labels)
    n_items = len(labels)

    # One-hot membership: membership[i, c] == 1 iff item i is in cluster c.
    membership = np.zeros((n_items, n_clusters))
    membership[np.arange(n_items), labels] = 1.0

    # directed[a, b] sums sims[i, j] over i < j with item i in cluster a and
    # item j in cluster b; adding the transpose credits both orientations.
    directed = membership.T @ np.triu(sims, k=1) @ membership
    sums = directed + directed.T

    # Every cross-cluster item pair occurs exactly once with i < j, so the
    # number of pairs between two clusters is the product of their sizes.
    sizes = membership.sum(axis=0)
    pair_counts = np.outer(sizes, sizes)

    # Same-cluster pairs are excluded from both the sums and the counts.
    np.fill_diagonal(sums, 0.0)
    np.fill_diagonal(pair_counts, 0.0)

    # Divide only where pairs exist; the diagonal and any cluster id without
    # members stay 0 instead of producing 0/0 = NaN.
    means = np.divide(
        sums, pair_counts, out=np.zeros_like(sums), where=pair_counts > 0
    )
    return means, pair_counts


file = "tests/data/pipeline/prot_sim.tsv"

# NOTE: despite the name, `dists` is handed to AffinityPropagation as a
# precomputed affinity matrix and to the solver as `similarities`.
items, dists = _load_similarity_matrix(file)

ca = AffinityPropagation(affinity='precomputed', random_state=42)
labels = ca.fit_predict(dists)
print(labels)

# Negative labels cannot be used as cluster indices: they would wrap around
# in the matrices below, and an all-negative result yields zero clusters.
# NOTE(review): some scikit-learn versions label every sample -1 when
# AffinityPropagation does not converge - confirm for the pinned version.
if labels.size == 0 or labels.min() < 0:
    raise RuntimeError(
        "AffinityPropagation did not return a usable clustering "
        "(empty or negative labels)"
    )
n_clusters = int(labels.max()) + 1

# Rows of (cluster id, number of members).
counts = np.asarray(np.unique(labels, return_counts=True)).T
print(counts)

cluster_sims, cluster_count = _inter_cluster_similarity(labels, dists, n_clusters)
print(cluster_sims)

# The mean is taken over the whole matrix, zero diagonal included.
threshold = np.average(cluster_sims)
print(threshold)

split = solve_ccs_bqp(
    clusters=list(range(n_clusters)),
    weights=[c for _, c in counts],
    similarities=cluster_sims,
    distances=None,
    threshold=threshold,
    limit=0.1,
    splits=[0.7, 0.3],
    names=["train", "test"],
    max_sec=1000,
    max_sol=1000,
)
print(split)

# Count how many items end up in each split via their cluster's assignment.
splits = {}
for label in labels:
    assigned = split[label]
    splits[assigned] = splits.get(assigned, 0) + 1
print(splits)