em_for_clustering.py (forked from kristyj/twitter_clusters)
__author__ = 'Kristy'
# Methods for EM clustering on a ClusterContainer object.
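
# This module has no imports, so LanguageModel is presumably defined elsewhere
# in the repo. The stub below is a minimal, hypothetical stand-in (unigram
# counts with add-one smoothing, NOT true Witten-Bell) so the file can be run
# and sanity-checked on its own; replace it with the repo's real class.
class LanguageModel(object):
    def __init__(self, tweetlist, smoothing='witten-bell'):
        self.smoothing = smoothing  # recorded, but the stub only does add-one
        self.counts = {}
        self.total = 0
        for tweet in tweetlist:
            for word in tweet:
                self.counts[word] = self.counts.get(word, 0) + 1
                self.total += 1
        self.vocab_size = max(len(self.counts), 1)

    def give_sentence_probability(self, sentencelist):
        # Product of add-one-smoothed unigram probabilities.
        p = 1.0
        for word in sentencelist:
            p *= (self.counts.get(word, 0) + 1.0) / (self.total + self.vocab_size)
        return p

    def adjust(self):
        # The real class would re-estimate P(w|topic) here (see TODO below).
        pass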

class EMTopicClass(object):
    """One topic in the mixture: a prior, per-tweet posteriors, and a language model."""
    def __init__(self, totalclasses, tweetlist):
        self.prior = 1.0 / totalclasses  # set at start, updated at each iteration
        # posteriors is a list of the posterior for each tweet, in order
        self.posteriors = []  # rebuilt at each iteration
        self.lm = LanguageModel(tweetlist, smoothing='witten-bell')
        self.temp_sent_prob = 0.0  # scratch value for the sentence currently being scored

class ThetaParams(object):
    """The full parameter set theta: one EMTopicClass per topic."""
    def __init__(self, totalclasses, tweetlists):
        self.m = totalclasses
        # One topic per pre-built cluster: each topic's LM starts from that cluster's tweets.
        self.topics = [EMTopicClass(self.m, tweetlist) for tweetlist in tweetlists]
        self.normaliser = 1.0

    def calc_sentence_prob(self, sentencelist):
        # E-step for one sentence: score it under every topic's LM, then normalise.
        self.normaliser = 0.0
        for topic in self.topics:
            topic.temp_sent_prob = topic.lm.give_sentence_probability(sentencelist) * topic.prior
            self.normaliser += topic.temp_sent_prob
        for topic in self.topics:
            zij = topic.temp_sent_prob / self.normaliser
            topic.temp_sent_prob = 0.0
            topic.posteriors.append(zij)

    def reset_posteriors(self):
        for topic in self.topics:
            topic.posteriors = []
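
# The EM quantities implemented above and below, for sentence s_i and topic t_j:
#   E-step:   z_ij = P(s_i | t_j) * P(t_j) / sum_k [ P(s_i | t_k) * P(t_k) ]
#   M-step:   P(t_j) <- (1/N) * sum_i z_ij   over the N sentences, and the LM
#             re-estimates P(w | t_j) from counts weighted by z_ij
#             (the lm.adjust() TODO below).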

def em_pre_initialised(clusterobject, iters=3):
    # The clusterobject has been pre-initialised (randomly, agglomeratively, or by k-means).
    m = clusterobject.m
    # Stopping is a fixed number of iterations; a precision threshold could be used instead.
    # Initialise the parameters as theta, one topic per pre-built cluster.
    theta = ThetaParams(m, clusterobject.clusters)  # TODO: priors start equal, not reflecting the clustering
    alltweets = []
    for c in clusterobject.clusters:
        alltweets += c
    # After each E-step, the soft assignment of tweet i to topic j is theta.topics[j].posteriors[i].
    for i in range(iters):
        # Expectation step -------------------------------------------------------
        theta.reset_posteriors()  # clear the previous iteration's soft assignments
        for tweet in alltweets:
            theta.calc_sentence_prob(tweet)  # appends one posterior per topic
        # Maximisation step ------------------------------------------------------
        for topic in theta.topics:
            # Re-estimate the topic prior P(t) from the topic's total posterior mass.
            topic.prior = sum(topic.posteriors) / len(alltweets)
            # Adjust the word-to-topic association P(w|t) based on the posteriors.
            topic.lm.adjust()  # TODO: reweight the LM counts by the posteriors
    return theta
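
# Illustrative usage on hypothetical toy data. ClusterContainer here is a
# minimal stand-in for the real container class from the rest of this repo,
# assumed only to expose .m (number of clusters) and .clusters (lists of
# tokenised tweets), the two attributes em_pre_initialised reads.
if __name__ == '__main__':
    class ClusterContainer(object):
        def __init__(self, clusters):
            self.clusters = clusters
            self.m = len(clusters)

    # Two toy "clusters" of tokenised tweets, as a pre-initialisation might produce.
    toy = ClusterContainer([
        [['great', 'game', 'tonight'], ['what', 'a', 'goal']],
        [['new', 'phone', 'released'], ['battery', 'life', 'rumours']],
    ])
    theta = em_pre_initialised(toy, iters=3)
    for j, topic in enumerate(theta.topics):
        print('topic', j, 'prior', topic.prior, 'posteriors', topic.posteriors)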