-
Notifications
You must be signed in to change notification settings - Fork 3
/
build.py
65 lines (50 loc) · 1.66 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from collections import defaultdict
import dill
import pybktree
import editdistance
import config
print("Loading words...")
# NOTE: dill.load, like pickle, can run arbitrary code while deserializing,
# so words.dill must come from a trusted source.
# A context manager guarantees the file handle is closed after loading.
with open(f"{config.MODEL}/words.dill", 'rb') as words_file:
    words = dill.load(words_file)

# Prune rare words: drop every entry whose successor counts sum to 2 or less.
# NOTE(review): the earlier comment said "only appear one time", but the
# threshold actually applied is <= 2 -- confirm which one is intended.
i = 0
keys = list(words.keys())  # snapshot, so entries can be deleted while looping
for word in keys:
    if sum(words[word].values()) <= 2:
        i += 1
        del words[word]
# Report the fraction of entries that were pruned. The guard avoids a
# ZeroDivisionError when the loaded vocabulary is empty.
print(i / len(keys) if keys else 0.0)
# Inverse lookup: successor -> {predecessor: weight}. Missing successors
# yield an empty defaultdict(float) because of the outer default factory.
words_inverse = defaultdict(lambda: defaultdict(float))
print("Building inverse words...")
# Flip the predecessor -> successor table, carrying the counts across.
for prev_word, followers in words.items():
    for next_word, count in followers.items():
        words_inverse[next_word][prev_word] += count

# For every successor: scale its predecessor weights so they sum to 1, then
# store them as a plain dict ordered from the highest weight to the lowest.
# Only existing keys are reassigned, so iterating the mapping here is safe.
for next_word in words_inverse:
    weights = words_inverse[next_word]
    scale = 1 / sum(weights.values())
    ranked = sorted(
        ((prev_word, weight * scale) for prev_word, weight in weights.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    words_inverse[next_word] = dict(ranked)
print("Normalizing word frequencies...")
# Rescale each word's successor values in place so that they sum to 1.
for successor_table in words.values():
    scale = 1 / sum(successor_table.values())
    # Only existing keys are overwritten, so the table keeps its size and
    # can be mutated while it is being iterated.
    for following in successor_table:
        successor_table[following] *= scale
print("Building BKTree...")
# BK-tree over the remaining vocabulary, using editdistance.eval as the
# distance function.
tree = pybktree.BKTree(editdistance.eval)
# A plain loop replaces the list comprehension that was used only for its
# side effect, which built a throwaway list of tree.add() return values.
for vocab_word in words:
    tree.add(vocab_word)
print("Dumping to file...")
# Bundle everything built above into a single object for serialization.
model = {
    'words': words,
    'words_inverse': words_inverse,
    'tree': tree,
}
# The context manager flushes and closes the file deterministically, rather
# than leaving that to garbage collection of an anonymous file object.
with open(f"{config.MODEL}/model.dill", 'wb') as model_file:
    dill.dump(model, model_file)