# hybrid.py
"""
Hybrid Update for Vanilla LDA
@author: Ke Zhai ([email protected])
"""
import time
import numpy;
import scipy
import nltk;
import sys;
from inferencer import compute_dirichlet_expectation
from inferencer import Inferencer
from variational_bayes import VariationalBayes;
"""
This is a python implementation of vanilla lda, based on a lda approach of variational inference and gibbs sampling, with hyper parameter updating.
It supports asymmetric Dirichlet prior over the topic simplex.
References:
[1] D. Mimno, M. Hoffman, D. Blei. Sparse Stochastic Inference for Latent Dirichlet Allocation. Internal Conference on Machine Learning, Jun 2012.
"""
class Hybrid(VariationalBayes, Inferencer):
    """
    Hybrid inferencer for vanilla LDA: a stochastic (Gibbs-sampled) E-step inside a variational Bayes framework.
    """
    def __init__(self,
                 hyper_parameter_optimize_interval=1,
                 #hyper_parameter_iteration=100,
                 #hyper_parameter_decay_factor=0.9,
                 #hyper_parameter_maximum_decay=10,
                 #hyper_parameter_converge_threshold=1e-6,
                 ):
        Inferencer.__init__(self, hyper_parameter_optimize_interval);
        #VariationalBayes.__init__(self, hyper_parameter_optimize_interval, hyper_parameter_iteration, hyper_parameter_decay_factor, hyper_parameter_maximum_decay, hyper_parameter_converge_threshold);

    def _initialize(self, corpus, vocab, number_of_topics, alpha_alpha, alpha_beta):
        """
        Initialize the model state from a raw corpus and a vocabulary.
        """
        Inferencer._initialize(self, vocab, number_of_topics, alpha_alpha, alpha_beta);

        # self._corpus = corpus;
        self._parsed_corpus = self.parse_data(corpus);

        # define the total number of documents
        self._number_of_documents = len(self._parsed_corpus);

        # initialize a D-by-K matrix gamma, every row valued at alpha + V/K
        #self._gamma = numpy.zeros((self._number_of_documents, self._number_of_topics)) + self._alpha_alpha + 1.0 * self._number_of_types / self._number_of_topics;
        self._gamma = numpy.tile(self._alpha_alpha + 1.0 * self._number_of_types / self._number_of_topics, (self._number_of_documents, 1));

        # initialize a K-by-V matrix eta from a Gamma(100, 0.01) distribution; its Dirichlet expectation is computed in the E-step
        self._eta = numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types));
        #self._E_log_eta = compute_dirichlet_expectation(numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types)));
        #self._exp_E_log_beta = numpy.exp(compute_dirichlet_expectation(numpy.random.gamma(100., 1. / 100., (self._number_of_topics, self._number_of_types))));
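
    # Shape sketch (illustrative only, not from the original driver scripts): for a toy
    # configuration with D=3 documents, K=2 topics, V=5 word types and a hypothetical
    # symmetric alpha_alpha of 0.5, the initialization above yields
    #
    #     >>> numpy.tile(numpy.ones(2) * 0.5 + 1.0 * 5 / 2, (3, 1)).shape
    #     (3, 2)
    #     >>> numpy.random.gamma(100., 1. / 100., (2, 5)).shape
    #     (2, 5)
    #
    # i.e., gamma is D-by-K and eta is K-by-V.
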
    def parse_data(self, corpus):
        """
        Convert each document line into a list of word type ids, dropping out-of-vocabulary tokens.
        """
        doc_count = 0

        word_idss = [];

        for document_line in corpus:
            word_ids = [];
            for token in document_line.split():
                if token not in self._type_to_index:
                    continue;

                type_id = self._type_to_index[token];
                word_ids.append(type_id);

            if len(word_ids) == 0:
                sys.stderr.write("warning: document collapsed during parsing...\n");
                continue;

            word_idss.append(word_ids);

            doc_count += 1
            if doc_count % 10000 == 0:
                print "successfully parsed %d documents..." % doc_count;

        print "successfully parsed %d documents..." % (doc_count);

        return word_idss;
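
    # Parsing sketch (hypothetical vocabulary, for illustration only): assuming
    # self._type_to_index maps "apple"->0, "banana"->1 and "cherry"->2, the corpus
    #
    #     ["apple banana apple unknown", "banana cherry"]
    #
    # is parsed by parse_data into [[0, 1, 0], [1, 2]]: the out-of-vocabulary token
    # "unknown" is dropped, and a document that loses every token is skipped with a warning.
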
    def e_step(self, parsed_corpus=None, number_of_samples=10, burn_in_samples=5):
        """
        Run the hybrid E-step: for every document, resample per-word topic indicators and accumulate the sufficient statistics for the M-step.
        """
        if parsed_corpus == None:
            word_idss = self._parsed_corpus;
        else:
            word_idss = parsed_corpus;
        number_of_documents = len(word_idss);

        E_log_eta = compute_dirichlet_expectation(self._eta)
        exp_E_log_eta = numpy.exp(E_log_eta);

        document_log_likelihood = 0;
        words_log_likelihood = 0;

        # initialize a K-by-V matrix of phi sufficient statistics
        phi_sufficient_statistics = numpy.zeros((self._number_of_topics, self._number_of_types));

        # initialize a D-by-K matrix of gamma values
        gamma_values = numpy.zeros((number_of_documents, self._number_of_topics)) + self._alpha_alpha[numpy.newaxis, :] + 1.0 * self._number_of_types / self._number_of_topics;

        # iterate over all documents
        for doc_id in xrange(number_of_documents):
            phi = numpy.random.random((self._number_of_topics, len(word_idss[doc_id])));
            phi = phi / numpy.sum(phi, axis=0)[numpy.newaxis, :];
            phi_sum = numpy.sum(phi, axis=1)[:, numpy.newaxis];
            assert(phi_sum.shape == (self._number_of_topics, 1));

            document_phi = numpy.zeros((len(word_idss[doc_id]), self._number_of_topics));

            # collect phi samples from the empirical distribution
            for it in xrange(number_of_samples):
                for word_pos in xrange(len(word_idss[doc_id])):
                    word_index = word_idss[doc_id][word_pos];

                    phi_sum -= phi[:, word_pos][:, numpy.newaxis];
                    # this is to get rid of underflow from the subtraction above; ideally, phi becomes all one-hot counts after a few sweeps
                    phi_sum *= (phi_sum > 0);
                    #assert(numpy.all(phi_sum >= 0));

                    temp_phi = (phi_sum.T + self._alpha_alpha) * exp_E_log_eta[:, [word_index]].T;
                    assert(temp_phi.shape == (1, self._number_of_topics));
                    temp_phi /= numpy.sum(temp_phi);

                    # sample a topic for this word
                    temp_phi = numpy.random.multinomial(1, temp_phi[0])[:, numpy.newaxis];
                    assert(temp_phi.shape == (self._number_of_topics, 1));

                    phi[:, word_pos][:, numpy.newaxis] = temp_phi;
                    phi_sum += temp_phi;

                    # discard the first few burn-in sweeps
                    if it < burn_in_samples:
                        continue;

                    phi_sufficient_statistics[:, word_index] += temp_phi[:, 0];
                    document_phi[word_pos, :] += temp_phi[:, 0];

            gamma_values[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];
            #batch_document_topic_distribution[doc_id, :] = self._alpha_alpha + phi_sum.T[0, :];

            document_phi /= (number_of_samples - burn_in_samples);
            # this is to prevent 0 during log()
            document_phi += 1e-100;

            # Note: all terms involving E_q[log p(\theta | \alpha)], i.e., terms involving \Psi(\gamma), are cancelled due to the \gamma updates
            # Note: all terms involving E_q[log p(\eta | \beta)], i.e., terms involving \Psi(\eta), are cancelled due to the \eta updates in the M-step

            # compute the alpha terms
            document_log_likelihood += scipy.special.gammaln(numpy.sum(self._alpha_alpha)) - numpy.sum(scipy.special.gammaln(self._alpha_alpha))
            # compute the gamma terms
            document_log_likelihood += numpy.sum(scipy.special.gammaln(gamma_values[doc_id, :])) - scipy.special.gammaln(numpy.sum(gamma_values[doc_id, :]));
            # compute the phi terms
            document_log_likelihood -= numpy.sum(numpy.log(document_phi) * document_phi);

            # compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during the M-step
            words_log_likelihood += numpy.sum(document_phi * (E_log_eta[:, word_idss[doc_id]].T));

            if (doc_id + 1) % 1000 == 0:
                print "successfully processed %d documents in hybrid mode..." % (doc_id + 1);

        phi_sufficient_statistics /= (number_of_samples - burn_in_samples);

        if parsed_corpus == None:
            self._gamma = gamma_values;
            return document_log_likelihood, phi_sufficient_statistics
        else:
            return words_log_likelihood, gamma_values
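
# For reference, the inner loop of e_step above implements the per-word resampling rule
# of the hybrid scheme in [1]: with the current word removed from the document's topic
# counts, a topic indicator is drawn from a multinomial with weights proportional to
# (alpha_k + N_{d,k}^{-n}) * exp(E_q[log eta_{k, w_dn}]).  The standalone helper below is
# an illustrative restatement of that single step; its name and arguments are hypothetical
# and it is not called by the class above.
def _illustrative_resample_topic(alpha_alpha, phi_sum_minus_word, exp_E_log_eta_word):
    # unnormalized sampling weights for the K topics
    temp_phi = (phi_sum_minus_word + alpha_alpha) * exp_E_log_eta_word;
    # normalize to a proper distribution and draw a one-hot topic indicator of length K
    temp_phi /= numpy.sum(temp_phi);
    return numpy.random.multinomial(1, temp_phi);
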
if __name__ == "__main__":
    print "not implemented..."