-
Notifications
You must be signed in to change notification settings - Fork 0
/
Vocabulary.py
47 lines (35 loc) · 1.08 KB
/
Vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import io
import time
class Vocabulary(object):
    """In-memory token table that spills to numbered files when it grows.

    Every token maps to a three-item list ``[wf, df, postings]``:

    * ``wf``       -- sum of the counts passed to :meth:`add` for the token,
    * ``df``       -- number of :meth:`add` calls that contained the token,
    * ``postings`` -- list of ``'<doc_id>:<count>'`` strings.

    When the number of distinct tokens exceeds ``DEFAULT`` the table is
    written to a file named ``.t<N>`` in the current working directory and
    then cleared.  The names of all files written so far are returned by
    :meth:`files`.
    """

    def __init__(self):
        # token -> [wf, df, postings]
        self._voc = {}
        # Number of distinct tokens currently held in ``_voc``.
        self._voc_size = 0
        # Sequence number used to name the next spill file.
        self._tmp_id = 0
        # Distinct-token threshold above which ``add`` spills to disk.
        self.DEFAULT = 100000
        # Names of the spill files written so far, in creation order.
        self._tmp_filename = []

    def reset(self):
        """Drop all in-memory tokens (spill-file bookkeeping is kept)."""
        self._voc = {}
        self._voc_size = 0

    def add(self, doc_id, token_stream):
        """Merge one document's token counts into the table.

        :param doc_id: identifier recorded in each posting; formatted
            with ``%s``.
        :param token_stream: mapping of token -> count for the document.

        If the table holds more than ``DEFAULT`` distinct tokens after the
        merge, it is saved and reset.
        """
        for token, count in token_stream.items():
            entry = self._voc.get(token)
            if entry is None:
                # (wf, df, doc_id_list)
                entry = self._voc[token] = [0, 0, []]
                self._voc_size += 1
            entry[0] += count
            entry[1] += 1
            entry[2].append('%s:%s' % (doc_id, count))
        # NOTE(review): the size check is done once per document, after all
        # of its tokens are merged, so a document's postings are never split
        # across two spill files -- confirm against the original layout.
        if self._voc_size > self.DEFAULT:
            self.save()
            self.reset()

    def save(self):
        """Write the current table to the next ``.t<N>`` file.

        Does nothing when the table is empty.  Each line has the form
        ``<token> <wf> <df> <doc:count>,<doc:count>,...`` and lines are
        ordered by token.  The file is UTF-8 encoded.  The table itself is
        not cleared; call :meth:`reset` for that.
        """
        if not self._voc:
            return
        filename = '.t' + str(self._tmp_id)
        # ``io.open`` with an explicit encoding produces identical bytes on
        # Python 2 and Python 3; the context manager closes the handle even
        # when a write fails.
        with io.open(filename, 'w', encoding='utf8') as out:
            self._tmp_filename.append(filename)
            self._tmp_id += 1
            for token in sorted(self._voc):
                wf, df, doc_info_list = self._voc[token]
                out.write(u'%s %s %s %s\n'
                          % (token, wf, df, u','.join(doc_info_list)))

    def files(self):
        """Return the list of spill-file names written so far."""
        return self._tmp_filename