models.py
'''
### This code was developed as part of the Ancient Identities in Modern Britain (IARH) project.
### It performs topic modelling on a corpus of text.
### Project: Ancient Identities in Modern Britain (IARH); ancientidentities.org
### Author: Mark Altaweel
'''
import os
import re
import sys
import csv
import warnings
import gensim
from nltk.tokenize import RegexpTokenizer
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora import Dictionary
from gensim.utils import lemmatize  # requires the `pattern` package; removed in gensim 4.0+
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')  # suppress library deprecation warnings for now
stops = set(stopwords.words('english'))  # nltk stopwords list
'''
Retrieves the relevant result texts from the CSV files in the output folder.
Skips saved model output and non-CSV files.
@param pn the path to the project folder that contains the output directory
@return doc_set the tokenized, stopword-filtered documents
'''
def retrieveText(pn):
    doc_set = []
    en_stop = stops
    os.chdir(pn + '/output')
    for filename in os.listdir(os.getcwd()):
        # skip hidden files, previously saved lda/hdp output, and non-CSV files
        if filename == ".DS_Store" or "lda" in filename or "hdp" in filename or ".csv" not in filename:
            continue
        print(filename)
        with open(filename, newline='') as csvfile:
            reader = csv.reader(csvfile, quotechar='|')
            try:
                for i, row in enumerate(reader):
                    # skip the header row and empty rows
                    if i == 0 or len(row) < 1:
                        continue
                    text = ''.join(r.strip() for r in row)
                    text = re.sub('"', '', text)
                    text = re.sub(',', '', text)
                    text = text.strip()
                    if not text:
                        continue
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(text)
                    stopped_tokens = [t for t in tokens if t not in en_stop]
                    doc_set.append(stopped_tokens)
            except csv.Error as e:
                sys.exit('line %d: %s' % (reader.line_num, e))
    return doc_set
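# Illustrative only (hypothetical data): retrieveText returns one token list per
# CSV row, e.g. [['Roman', 'fort', 'excavation'], ['Iron', 'Age', 'hillfort'], ...]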
'''
Tokenizes and normalises texts, keeping only tokens of a minimum length.
(Defined as a helper; it is not called in the main flow below.)
@param files the texts to process
'''
def preProcsText(files):
    for f in files:
        yield gensim.utils.simple_preprocess(f, deacc=True, min_len=3)
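# Illustrative only: gensim's simple_preprocess lowercases, strips accents
# (deacc=True), and drops tokens shorter than min_len, so a hypothetical input
# such as ["Hadrian's Wall in AD 122"] would yield [['hadrian', 'wall']].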
'''
Method for processing tokenized input texts.
@param texts the tokenized input texts
@return texts the texts returned after processing
'''
def process_texts(texts):
    """
    Function to process texts. These are the steps we take:
    1. Stopword removal.
    2. Collocation detection.
    3. Lemmatization (not stemming, since stemming can reduce interpretability).
    Parameters:
    ----------
    texts: Tokenized texts.
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    # 1. stopword removal
    texts = [[word for word in line if word not in stops] for line in texts]
    # 2. collocation detection using the globally trained Phrases model
    texts = [bigram[line] for line in texts]
    # 3. keep nouns only; split off the POS tag that lemmatize appends (e.g. 'fort/NN')
    texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
    return texts
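# Illustrative only (hypothetical tokens): ['the', 'roman', 'forts'] -> stopword
# removal drops 'the'; the Phrases model may join frequent pairs into a single
# token such as 'roman_forts'; the noun-only lemmatization maps 'forts' -> 'fort'.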
'''
Method for using a coherence model to look at topic coherence for LDA models.
@param dictionary the Gensim dictionary used for assessment
@param corpus the bag-of-words corpus
@param texts the processed texts used for coherence scoring
@param limit the upper limit on the number of topics to assess
@return lm_list the trained LDA models
@return c_v the coherence score for each model
'''
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display the num_topics - LDA graph using c_v coherence.
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    limit : topic limit
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    c_v = []
    lm_list = []
    for num_topics in range(1, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
        del cm
    return lm_list, c_v
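# Minimal sketch (not part of the original flow): the best-scoring model can be
# picked out of the returned lists, e.g.
#   best = max(range(len(c_v)), key=c_v.__getitem__)
#   best_model = lm_list[best]  # trained with best + 1 topics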
'''
The code below builds the LDA topic models and runs coherence testing on them.
'''
# resolve the project folder from this file's location (assumes a 'src' directory)
pn = os.path.abspath(__file__)
pn = pn.split("src")[0]
results = retrieveText(pn)
# train the collocation model used globally by process_texts
bigram = gensim.models.Phrases(results)
train_texts = process_texts(results)
# build the dictionary and the bag-of-words corpus for the LDA models
dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]
# up to 50 topics are tested; each model and its visualisation are saved
for i in range(1, 50):
    # train and save an LDA model with i topics
    ldamodel = LdaModel(corpus=corpus, num_topics=i, id2word=dictionary)
    ldamodel.save('lda' + str(i) + '.model')
    ldatopics = ldamodel.show_topics(num_topics=i)
    visualisation = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(visualisation, 'LDA_Visualization_' + str(i) + '_.html')

# coherence model evaluation across the same range of topic counts
lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=train_texts, limit=50)
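# A minimal reporting sketch (an addition, not in the original script): print the
# c_v coherence for each topic count; evaluate_graph trains models for 1..49 topics.
for numTopics, score in enumerate(c_v, start=1):
    print('topics: %d c_v coherence: %.4f' % (numTopics, score))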