-
Notifications
You must be signed in to change notification settings - Fork 68
/
Copy pathembedding_metrics.py
149 lines (121 loc) · 5.12 KB
/
embedding_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from skills_ml.ontologies.clustering import Clustering
from skills_ml.algorithms.preprocessing import IterablePipeline
from itertools import chain
from collections import defaultdict
from scipy.spatial import distance
from gensim.models import KeyedVectors
from typing import Callable, Dict
import numpy as np
import tempfile
import os
def metrics_for_embedding(vectorization_pipeline, metric_objects):
    """Run every metric against a single vectorization pipeline.

    Args:
        vectorization_pipeline: object handed unchanged to each metric's
            ``eval`` method.
        metric_objects: iterable of metric objects exposing a ``name``
            attribute and an ``eval(vectorization_pipeline)`` method.

    Returns:
        A ``defaultdict(dict)`` mapping each metric's name to whatever its
        ``eval`` call returned, in iteration order.
    """
    scores_by_metric = defaultdict(dict)
    for metric_obj in metric_objects:
        scores = metric_obj.eval(vectorization_pipeline)
        scores_by_metric[metric_obj.name] = scores
    return scores_by_metric
class BaseEmbeddingMetric(object):
    """Common base class for embedding evaluation metrics.

    Subclasses are expected to override ``eval``; the base implementation
    only signals that no evaluation logic has been provided.
    """

    def __init__(self):
        # Slot where a subclass can keep the outcome of its ``eval`` call;
        # ``None`` until then.
        self.eval_result = None

    def eval(self, vectorization=None):
        """Evaluate the metric; must be overridden by subclasses.

        Args:
            vectorization: accepted (and ignored here) so the base signature
                matches the ``eval(vectorization)`` signature of subclasses.

        Raises:
            NotImplementedError: always, because the base class defines no
                metric of its own.
        """
        # Raise rather than return the exception class, so a missing override
        # fails loudly instead of being mistaken for a result.
        raise NotImplementedError
class CategorizationMetric(BaseEmbeddingMetric):
    """Cosine distance between each concept and the centroid of its cluster.

    For every concept in the clustering, the entities of that concept are
    vectorized and averaged into a centroid. The score is the cosine
    distance (as computed by ``scipy.spatial.distance.cosine``) between the
    vectorized concept and that centroid.
    """

    def __init__(self, clustering: Clustering):
        self.clustering = clustering
        super().__init__()

    @property
    def name(self):
        """Metric name, prefixed with the clustering's name."""
        return f"{self.clustering.name}_categorization_metric"

    def eval(self, vectorization: Callable) -> Dict:
        """Compute the concept-to-centroid cosine distance for every concept.

        Args:
            vectorization: callable mapping a concept, or the second element
                of an entity, to a vector.

        Returns:
            Dict mapping each concept to its cosine distance as a float.
            The same dict is stored on ``self.eval_result``.
        """
        result = {}
        for concept, entities in self.clustering.items():
            # Only element [1] of each entity is vectorized.
            # NOTE(review): presumably entities are (identifier, value)
            # pairs -- confirm against Clustering.items().
            centroid = np.average(
                [vectorization(entity[1]) for entity in entities], axis=0
            )
            # float() instead of .astype(float): the value returned by
            # distance.cosine is not guaranteed to be a NumPy scalar, and a
            # plain Python number has no .astype method.
            result[concept] = float(
                distance.cosine(vectorization(concept), centroid)
            )
        self.eval_result = result
        return result
class IntraClusterCohesion(BaseEmbeddingMetric):
    """Sum of squared errors between the centroid of a concept cluster and
    every entity within that cluster.
    """

    def __init__(self, clustering: Clustering):
        self.clustering = clustering
        super().__init__()

    @property
    def name(self):
        """Metric name, prefixed with the clustering's name."""
        return f"{self.clustering.name}_intra_cluster_cohesion"

    def eval(self, vectorization: Callable) -> Dict:
        """Compute the within-cluster sum of squared errors per concept.

        Args:
            vectorization: callable mapping the second element of an entity
                to a vector.

        Returns:
            Dict mapping each concept to the summed squared difference
            between its entity vectors and their average. The same dict is
            stored on ``self.eval_result``.
        """
        cohesion = {}
        for concept, members in self.clustering.items():
            member_vectors = [vectorization(member[1]) for member in members]
            center = np.average(member_vectors, axis=0)
            squared_offsets = (member_vectors - center) ** 2
            cohesion[concept] = np.sum(squared_offsets).astype(float)
        self.eval_result = cohesion
        return cohesion
class RecallTopN(BaseEmbeddingMetric):
    """Top-n recall of each concept cluster.

    For a given concept cluster and a given number n, find the top n most
    similar entities from the whole entity pool based on cosine similarity,
    then calculate the top-n recall: the number of true positives among the
    n closest entities divided by the total number of entities in the
    concept cluster.
    """

    def __init__(self, clustering: Clustering, topn=20):
        self.clustering = clustering
        self.topn = topn
        super().__init__()

    @property
    def entity_generator(self):
        """Flat iterator over the entities of every cluster."""
        return chain.from_iterable(self.clustering.values())

    @property
    def name(self):
        """Metric name, prefixed with the clustering's name."""
        return f"{self.clustering.name}_recall_top{self.topn}"

    def eval(self, vectorization: Callable) -> Dict:
        """Compute the top-n recall for every concept.

        Args:
            vectorization: callable mapping a concept, or the second element
                of an entity, to a vector.

        Returns:
            Dict mapping each concept to ``len(true positives) /
            len(entities)``. The same dict is stored on ``self.eval_result``.

        Raises:
            ZeroDivisionError: if a concept has no entities.
        """
        # Pair every entity's first element (used as its key) with the
        # vector of its second element, then index the whole pool.
        convert2tuples = IterablePipeline(
            lambda x: (x[0], vectorization(x[1]))
        )
        wv = convert_to_keyedvector(list(convert2tuples(self.entity_generator)))
        result = {}
        for concept, entities in self.clustering.raw_items():
            closest = wv.similar_by_vector(vectorization(concept), topn=self.topn)
            # Element [0] of each hit is compared against entity identifiers.
            tp = {c[0] for c in closest} & {e.identifier for e in entities}
            result[concept] = len(tp) / len(entities)
        # Keep the latest result on the instance, as the other metrics do.
        self.eval_result = result
        return result
class PrecisionTopN(BaseEmbeddingMetric):
    """Top-n precision of each concept cluster.

    For a given concept cluster and a given number n, find the top n most
    similar entities from the whole entity pool based on cosine similarity,
    then calculate the top-n precision: the number of true positives among
    the n closest entities divided by n.
    """

    def __init__(self, clustering: Clustering, topn=10):
        self.clustering = clustering
        self.topn = topn
        super().__init__()

    @property
    def entity_generator(self):
        """Flat iterator over the entities of every cluster."""
        return chain.from_iterable(self.clustering.values())

    @property
    def name(self):
        """Metric name, prefixed with the clustering's name."""
        return f"{self.clustering.name}_precision_top{self.topn}"

    def eval(self, vectorization: Callable) -> Dict:
        """Compute the top-n precision for every concept.

        Args:
            vectorization: callable mapping a concept, or the second element
                of an entity, to a vector.

        Returns:
            Dict mapping each concept to ``len(true positives) / topn``.
            The divisor is always ``self.topn``, even if fewer hits come
            back. The same dict is stored on ``self.eval_result``.

        Raises:
            ZeroDivisionError: if ``topn`` is 0.
        """
        # Pair every entity's first element (used as its key) with the
        # vector of its second element, then index the whole pool.
        convert2tuples = IterablePipeline(
            lambda x: (x[0], vectorization(x[1]))
        )
        wv = convert_to_keyedvector(list(convert2tuples(self.entity_generator)))
        result = {}
        for concept, entities in self.clustering.raw_items():
            closest = wv.similar_by_vector(vectorization(concept), topn=self.topn)
            # Element [0] of each hit is compared against entity identifiers.
            tp = {c[0] for c in closest} & {e.identifier for e in entities}
            result[concept] = len(tp) / self.topn
        # Keep the latest result on the instance, as the other metrics do.
        self.eval_result = result
        return result
def convert_to_keyedvector(id_vector_tuples):
    """Build a gensim ``KeyedVectors`` from ``(identifier, vector)`` pairs.

    The pairs are serialized to a temporary text file in word2vec text
    format -- a ``"<count> <vector_size>"`` header followed by one
    ``"<identifier> <v1> <v2> ..."`` line per pair -- and loaded back with
    ``KeyedVectors.load_word2vec_format``.

    Args:
        id_vector_tuples: non-empty sequence of ``(identifier, vector)``
            pairs. ``identifier`` must be a ``str``; the vector size is
            taken from the first pair.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.

    Raises:
        IndexError: if ``id_vector_tuples`` is empty.
        TypeError: if an identifier is not a string.
    """
    vector_size = len(id_vector_tuples[0][1])
    header = f"{len(id_vector_tuples)} {vector_size}\n"
    # Write into a private temporary directory instead of re-opening a
    # still-open NamedTemporaryFile by name, which is not possible on every
    # platform. The file is closed before gensim reads it.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "vectors.txt")
        # Explicit UTF-8 so non-ASCII identifiers round-trip regardless of
        # the locale's default encoding (gensim reads the file as UTF-8).
        with open(path, mode="w", encoding="utf-8") as f:
            f.write(header)
            f.writelines(
                identifier + " " + " ".join(map(str, vector)) + "\n"
                for identifier, vector in id_vector_tuples
            )
        return KeyedVectors.load_word2vec_format(path)