-
Notifications
You must be signed in to change notification settings - Fork 34
/
prediction.py
85 lines (67 loc) · 2.96 KB
/
prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
__author__ = "[email protected]"
import gensim
import smart_open
from gensim.models import Doc2Vec
from sklearn.externals import joblib
from collections import defaultdict
from operator import add
# Prediction tallies keyed by predicted class label: res0 for the
# negative-comment test set, res1 for the positive-comment test set.
res0 = defaultdict(int)
res1 = defaultdict(int)
# Document counts per training class; 'FIVE'-tagged docs start at offset
# num_1 inside train_corpus.  NOTE(review): num_5 is never used in the
# code visible here — confirm against the rest of the file before removing.
num_1 = 2910
num_5 = 5882
# Pre-trained Doc2Vec model, trained and saved elsewhere.
model = Doc2Vec.load('./my_db.d2v')
# Sanity check of the learned word embeddings: print the nearest
# neighbours of '交通' ("traffic") and '风景' ("scenery").
# "Similar" here is cosine similarity in the embedding space.
print(model.most_similar(positive=['交通']))
print(model.most_similar(positive=['风景']))
# Pre-trained sentiment classifier.  NOTE(review): sklearn.externals.joblib
# is deprecated/removed in modern scikit-learn — plain `import joblib`
# is the supported replacement.
rf = joblib.load('classifier.model')
# Training files and the tag prefix assigned to each of their documents.
sources_train = {'1_train.txt': 'ONE', '5_train.txt': 'FIVE'}
def read_corpus(source_set):
    """Yield one TaggedDocument per line of each training file.

    source_set maps a file path to a tag prefix; line i of a file becomes
    a document tagged '<PREFIX>_<i>' whose words are the space-separated
    tokens of that line.
    """
    for path, tag_prefix in source_set.items():
        with smart_open.smart_open(path, encoding="utf-8") as handle:
            for line_no, text in enumerate(handle):
                tokens = gensim.utils.to_unicode(text).split()
                tag = '%s_%s' % (tag_prefix, line_no)
                yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])


# Materialise the whole corpus so documents can be looked up by index later.
train_corpus = list(read_corpus(sources_train))
# since 1_test is empty..., use 2_test
with open('./2_test.txt', 'r', encoding='utf-8') as f, open('./2_sim.txt', 'w', encoding='utf-8') as ff:
    # Iterate the file directly instead of a manual readline() loop.
    for line in f:
        line_ls = line.replace('\n', '').split(' ')
        # infer_vector is randomly seeded, so average 10 independent
        # inferences for a more stable document vector (more time, higher
        # accuracy).  Fine-tune alpha & steps as needed.
        # NOTE(review): alpha=0.95 is far above typical Doc2Vec learning
        # rates and inconsistent with the 0.05 used for the positive set
        # below — confirm it is intentional.
        line_vec = model.infer_vector(line_ls, alpha=0.95, steps=50)
        for _ in range(9):
            line_vec += model.infer_vector(line_ls, alpha=0.95, steps=50)
        line_vec /= 10
        # Most similar training document, written next to the test line so
        # the match can be inspected by eye.
        sims = model.docvecs.most_similar([line_vec], topn=1)
        ff.write(line)
        tag, doc_no = sims[0][0].split('_')
        # 'ONE'-tagged docs occupy train_corpus[0:num_1]; 'FIVE' docs follow.
        index = int(doc_no) + (num_1 if tag == 'FIVE' else 0)
        ff.write(' '.join(train_corpus[index].words) + '\n\n')
        # Tally the classifier's prediction for this document vector.
        res0[int(rf.predict([line_vec]))] += 1
# Guard against an empty test file so the report never divides by zero.
total = res0[0] + res0[1]
print('Accuracy for negative comments: %s' % (res0[0] / total if total else 0.0, ))
with open('./5_test.txt', 'r', encoding='utf-8') as f, open('./5_sim.txt', 'w', encoding='utf-8') as ff:
    # Iterate the file directly instead of a manual readline() loop.
    for line in f:
        # BUGFIX: strip the trailing newline before tokenising — the
        # original split the raw line, so the last token of every line
        # carried a '\n' suffix, unlike the negative-set loop above.
        line_ls = line.replace('\n', '').split(' ')
        # infer_vector is randomly seeded, so average 10 independent
        # inferences for a more stable document vector (more time, higher
        # accuracy).  Fine-tune alpha & steps as needed.
        line_vec = model.infer_vector(line_ls, alpha=0.05, steps=50)
        for _ in range(9):
            line_vec += model.infer_vector(line_ls, alpha=0.05, steps=50)
        line_vec /= 10
        # Most similar training document, written next to the test line so
        # the match can be inspected by eye.
        sims = model.docvecs.most_similar([line_vec], topn=1)
        ff.write(line)
        tag, doc_no = sims[0][0].split('_')
        # 'ONE'-tagged docs occupy train_corpus[0:num_1]; 'FIVE' docs follow.
        index = int(doc_no) + (num_1 if tag == 'FIVE' else 0)
        ff.write(' '.join(train_corpus[index].words) + '\n\n')
        # Tally the classifier's prediction for this document vector.
        res1[int(rf.predict([line_vec]))] += 1
# Guard against an empty test file so the report never divides by zero.
total = res1[0] + res1[1]
print('Accuracy for positive comments: %s' % (res1[1] / total if total else 0.0, ))