-
Notifications
You must be signed in to change notification settings - Fork 1
/
bneighbors.py
105 lines (72 loc) · 2.44 KB
/
bneighbors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import bvec
import bcolz
import similarity as sim
class Neighborhood:
    '''
    "Don't you want to be my neighbor?"

    bneighbors finds nearest neighbors between two arbitrary vector spaces,
    contained in bcolz databases, using bvec.

    Attributes set by the constructor:
        source_path: the path the Neighborhood was opened from
        vectors: bvec carray opened from "<source_path>/vector"
        norms: bvec carray opened from "<source_path>/norm"
        source_table: bcolz ctable opened from source_path
        similarity: sim.Similarity built from vectors and norms
        id_index_map: dict taking str(id) -> row index
        index_id_map: dict taking row index -> str(id)
    '''

    def __init__(self, source_path):
        '''
        Create the Neighborhood, for finding nearest neighbors.

        Args:
            source_path (string): path to a bcolz database with three carray
                columns: 'id', 'vector' and 'norm'
        '''
        self.source_path = source_path

        # open bcolz datastores
        self.vectors = bvec.carray(rootdir=source_path + "/vector")
        self.norms = bvec.carray(rootdir=source_path + "/norm")
        self.source_table = bcolz.ctable(rootdir=source_path)

        # create similarity object
        self.similarity = sim.Similarity(self.vectors, self.norms)

        # create id <-> index maps, both built from the 'id' column of the
        # source table
        self.id_index_map = self._create_id_index_map(self.source_table)
        self.index_id_map = self._create_index_id_map(self.source_table)

    @staticmethod
    def _iter_ids(ctable):
        '''
        Yield every entry of the 'id' column of ctable as a string, in row
        order, reading the column block by block.
        '''
        for block in bcolz.iterblocks(ctable['id']):
            for item in block:
                yield str(item)

    @staticmethod
    def _create_id_index_map(ctable):
        '''
        Create a dictionary taking ids (as strings) to row indices.

        If the same id occurs more than once, the last row wins.
        '''
        return {
            item_id: index
            for index, item_id in enumerate(Neighborhood._iter_ids(ctable))
        }

    @staticmethod
    def _create_index_id_map(ctable):
        '''
        Create a dictionary taking a row index to its id (as a string).
        '''
        return dict(enumerate(Neighborhood._iter_ids(ctable)))

    def neighbors(self, source_id, n=100, sim_type=sim.SimilarityType.Cosine, p=None):
        '''
        Find the nearest neighbors of the given source_id.

        Args:
            source_id: external identifier to look up. Ids are stored as
                strings (str() of each entry of the 'id' column), so the
                lookup only succeeds for the string form of an id.
            n, sim_type, p: forwarded unchanged to
                self.similarity.similarities(); see that method for their
                meaning.

        Returns:
            An empty list if source_id is unknown. Otherwise a generator of
            (id, score) pairs, in the order produced by
            self.similarity.similarities() (presumably best match first --
            confirm against the similarity module). Being a generator, the
            result can only be consumed once.
        '''
        source_index = self.id_index_map.get(source_id)
        # compare against None explicitly: row index 0 is a valid hit
        if source_index is None:
            return []

        scored_indices = self.similarity.similarities(
            source_index, n=n, sim_type=sim_type, p=p)

        # convert row indices back to external ids, lazily
        return (
            (self.index_id_map[index], score)
            for (index, score) in scored_indices
        )

    def location(self, source_id):
        '''
        Return the vector for the given source_id.

        Args:
            source_id: external identifier for the vector (string form, as
                stored in the id -> index map).

        Returns:
            An empty list if source_id is unknown, otherwise the entry of
            self.vectors at the id's row index (presumably a numpy.ndarray
            -- depends on bvec.carray indexing).
        '''
        source_index = self.id_index_map.get(source_id)
        # compare against None explicitly: row index 0 is a valid hit
        if source_index is None:
            return []
        return self.vectors[source_index]