-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathapp.py
296 lines (239 loc) · 10.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import pandas as pd
import streamlit as st
import SessionState
import input_output as io
import preprocessor as pp
import model_evaluator as mv
import lda as LDA
import base64
from io import BytesIO
from pathlib import Path
import time
def display_app_header(main_txt, sub_txt, is_sidebar=False):
    """
    function to display major headers at user interface
    Parameters
    ----------
    main_txt: str -> the major text to be displayed
    sub_txt: str -> the minor text to be displayed
    is_sidebar: bool -> render in the side panel when True, otherwise in the main panel
    """
    # CSS property names are hyphenated: the previous "background.color" and
    # "text_align" are not valid properties, so browsers ignored the
    # background colour and the centring.
    html_temp = (
        '\n'
        '<div style = "background-color:#054029 ; padding:15px">\n'
        f'<h2 style = "color:white; text-align:center;"> {main_txt} </h2>\n'
        f'<p style = "color:white; text-align:center;"> {sub_txt} </p>\n'
        '</div>\n'
    )
    # st.sidebar exposes the same markdown() call as the top-level module.
    target = st.sidebar if is_sidebar else st
    target.markdown(html_temp, unsafe_allow_html=True)
def display_side_panel_header(txt):
    """
    Write a second-level markdown heading into the side panel.
    Parameters
    ----------
    txt: str -> the heading text to be displayed
    """
    heading = '## {}'.format(txt)
    st.sidebar.markdown(heading)
# Main panel title
display_app_header(
    main_txt='Topic Modelling Open Source Tool',
    sub_txt='Build and Evaluate a Topic Model without coding!!!',
)

# Side panel title
display_app_header(
    main_txt='Instance Configuration ',
    sub_txt='Use this panel to configure the modelling space and complete corresponding actions after each step in the main panel',
    is_sidebar=True,
)

# Step 1: how the documents are supplied.
display_side_panel_header(txt='Step 1:')
data_input_mthd = st.sidebar.radio(
    "Select Data Input Method",
    ('Copy-Paste text', 'Upload a CSV file'),
)

# Step 2: work on the raw text or on a cleaned copy.
display_side_panel_header(txt='Step 2:')
clean_data_opt = st.sidebar.radio(
    "Clean the Data or Use Raw Data",
    ('Use Raw Data', 'Clean the Data'),
)

# Step 3: text normalization and feature extraction choices.
display_side_panel_header(txt='Step 3:')
normalization_mthd = st.sidebar.radio(
    "Select a text normalization method",
    ('None', 'Lemmatization', 'Stemming'),
)
encoding_mthd = st.sidebar.selectbox(
    'Select a feature extraction method',
    (['None', 'BOW with Term Frequency', 'BOW with TF-IDF']),
)

# Step 4: model type and the hyperparameters handed to the trainer.
display_side_panel_header(txt='Step 4:')
model = st.sidebar.radio(
    "Select Model Type and set Hyperparameters. Default parameters actually work fine",
    ('Latent Dirichlet Allocation', 'Non-Negative Matrix Factorization'),
)
n_of_topics = st.sidebar.number_input(
    'Expected Number of Topics', min_value=1, value=5, step=1
)
update_every = st.sidebar.slider(
    'update_every (0: batch learning, 1: online iterative learning.)', 0, 1, 1
)
chunksize = st.sidebar.slider(
    'chunksize (Number of documents to be used in each training chunk))', 10, 20, 10
)
passes = st.sidebar.slider(
    'passes (Number of passes through the corpus during training))', 10, 20, 10
)
alpha = st.sidebar.selectbox('Alpha', (['symmetric', 'auto']))
iterations = st.sidebar.number_input(
    'Number of Iteration', min_value=50, max_value=500, value=100, step=1
)

# How to contribute
display_app_header(
    main_txt='How to Contribute',
    sub_txt='Send Pull Request to Project Repository on Github (Topic-Modelling-Open-Source-Tool)',
    is_sidebar=True,
)

# Project version
display_app_header(
    main_txt='Project Version',
    sub_txt=' 0.1.0 c2020',
    is_sidebar=True,
)

# About the author
display_app_header(
    main_txt='About Author',
    sub_txt='Opeyemi Bamigbade (Yhemmy) is a Data Scientist and ML-Engineer at Data Science Nigeria. Twitter:@opeyemibami Email:[email protected]',
    is_sidebar=True,
)
# Session state: values that have to survive between script reruns.
_session_defaults = dict(
    # data frames and model artefacts
    output_df=pd.DataFrame(),
    df_raw=pd.DataFrame(),
    _model=None,
    text_col='text',
    is_file_uploaded=False,
    id2word=None,
    corpus=None,
    is_valid_text_feat=False,
    # flags that switch on each stage of the flow further down
    to_clean_data=False,
    to_encode=False,
    to_train=False,
    to_evaluate=False,
    to_visualize=False,
    to_download_report=False,
    # working copy of the input and the text-area contents
    df=pd.DataFrame(),
    txt='Paste the text to analyze here',
    default_txt='Paste the text to analyze here',
    clean_text=None,
    ldamodel=None,
    topics_df=None,
)
ss = SessionState.get(**_session_defaults)
def display_header(header):
    """
    function to display minor headers at user interface main panel
    Parameters
    ----------
    header: str -> the text to be displayed inside the section banner
    """
    # Fixes relative to the earlier markup: the <h4> element is now closed
    # with </h4> (it was </h5>), and the CSS properties use their real,
    # hyphenated names (background-color, text-align) so they take effect.
    html_temp = (
        '\n'
        '<div style = "background-color:#054029; padding:10px">\n'
        f'<h4 style = "color:white;text-align:center;"> {header} </h4>\n'
        '</div>\n'
    )
    st.markdown(html_temp, unsafe_allow_html=True)
def space_header():
    """
    Inject a <style> block that hides the element with id MainMenu and the
    page footer.
    Parameters
    ----------
    """
    css_block = (
        "\n"
        "<style>\n"
        "#MainMenu {visibility: hidden;}\n"
        "footer {visibility: hidden;}\n"
        "</style>\n"
    )
    st.markdown(css_block, unsafe_allow_html=True)
# A bare `st.cache()` call used to sit here. It only built a decorator and
# threw it away, so it had no effect and has been dropped. The function is
# deliberately left uncached: it writes to the session state on every run.
def check_input_method(data_input_mthd):
    """
    function check user input method if uploading or pasting
    Parameters
    ----------
    data_input_mthd: str -> the input method chosen in the side panel;
        'Copy-Paste text' selects the text path, anything else the batch path

    Returns
    -------
    tuple -> (df, ss.txt) as returned by io.get_input; ss.txt is also
        updated in the session state
    """
    if data_input_mthd == 'Copy-Paste text':
        df, ss.txt = io.get_input(ss_text=ss.txt)
    else:
        df, ss.txt = io.get_input(is_batch=True, ss_text=ss.txt)
        # NOTE(review): the listing this was restored from had lost its
        # indentation; this check is assumed to belong to the batch branch
        # only - confirm against the original file.
        if df.shape[0] > 0:
            ss.is_file_uploaded = True
    return df, ss.txt
def get_table_download_link(df):
    """Generates a link allowing the data in a given panda dataframe to be downloaded
    in:  dataframe
    out: href string (an <a> tag whose data URI carries the CSV, base64 encoded)
    """
    csv_text = df.to_csv(index=False)
    # b64encode works on bytes; decode the result back to str for the f-string.
    b64 = base64.b64encode(csv_text.encode()).decode()
    # "text/csv" is the registered media type for CSV ("file/csv" is not one).
    return f'<a href="data:text/csv;base64,{b64}" download="Report.csv" >Download csv file</a>'
def get_chat_download_link():
    """Generates a link allowing the charts PDF to be downloaded
    in:  nothing; reads the file "chart.pdf" from the current working directory
    out: href string (an <a> tag whose data URI carries the PDF, base64 encoded)
    """
    with open("chart.pdf", "rb") as pdf_file:
        base64_pdf = base64.b64encode(pdf_file.read()).decode()
    # "application/pdf" is the registered media type for PDF ("file/pdf" is not one).
    return f'<a href="data:application/pdf;base64,{base64_pdf}" download="Charts.pdf" >Download charts file</a>'
############ APP Logical Flow ###############
space_header()

# Fetch the documents for this run using the method picked in the side panel.
ss.df, ss.txt = check_input_method(data_input_mthd)

# NOTE(review): this compares the selected column name with the placeholder
# text of the text area; with the initial values ('text' vs. the placeholder)
# it is always true. It may have been meant to compare ss.txt - confirm.
if ss.text_col != ss.default_txt:
    ss.to_clean_data = True

# Keep an untouched copy so the "Use Raw Data" option can display it.
ss.df_raw = ss.df.copy()

if ss.is_file_uploaded:
    ss.df, ss.text_col = io.select_text_feature(ss.df)
    column_is_object = ss.df[ss.text_col].dtype == 'O'
    if not column_is_object:
        st.warning('select a valid text column')
        ss.to_clean_data = False
    else:
        ss.to_clean_data = True
        ss.is_valid_text_feat = True
# clean data #######
if ss.to_clean_data:
    if clean_data_opt == 'Use Raw Data':
        display_header(header='Using Raw data')  # Raw data header
        space_header()
        st.write(ss.df_raw.head())
        if ss.text_col != ss.default_txt:
            ss.to_encode = True
    else:
        display_header(header='Using Clean Data')  # Clean data header
        space_header()
        ss.df = pp.clean_data(ss.df, feature=ss.text_col)
        st.success('Data cleaning successfuly done')
        ss.to_encode = True
        # NOTE(review): assumed to belong to the clean-data branch, mirroring
        # the preview in the raw-data branch - confirm against the original.
        st.write(ss.df.head())
# Encoding ##############
# Success message shown for each supported feature-extraction choice.
_encoding_success_msg = {
    'BOW with Term Frequency': 'Data Encoding Successfully done with Term Frequency',
    'BOW with TF-IDF': 'Data Encoding Successfully done with Term Frequency - Inverse Term Frequency',
}

if ss.to_encode and encoding_mthd != 'None':
    display_header(header='Data Encoding Section ')  # Encoding header
    space_header()
    if encoding_mthd in _encoding_success_msg:
        # NOTE(review): both choices pass mode='Term Frequency', exactly as
        # before; the TF-IDF choice therefore requests the same mode as the
        # plain term-frequency one. Check which mode values
        # pp.extract_features accepts before changing this.
        ss.id2word, ss.corpus, ss.clean_text = pp.extract_features(
            ss.df,
            feature=ss.text_col,
            normalization_mthd=normalization_mthd,
            mode='Term Frequency',
        )
        st.success(_encoding_success_msg[encoding_mthd])
        ss.to_train = True
elif ss.to_encode and ss.is_valid_text_feat and encoding_mthd == 'None':
    st.info('Select an ecoding method in the side panel')
################### Training ###########################
if ss.to_train:
    display_header(header='Model Training Section ')  # Model Training header
    space_header()
    button_train = st.button('Train Model')
    if button_train:
        if model == 'Latent Dirichlet Allocation':
            # Hyperparameters come straight from the side-panel widgets.
            ss._model = LDA.lda_train(
                ss.corpus,
                ss.id2word,
                update_every,
                chunksize,
                passes,
                alpha,
                iterations,
                number_of_topics=n_of_topics,
            )
            ss.output_df = ss.df.copy()
            st.success('Training completed!!!')
            ss.to_evaluate = True
        elif model == 'Non-Negative Matrix Factorization':
            st.error('Non-Negative Matrix Factorization is yet to be implemented, Select LDA')
################### Model Evaluation ###########################
if ss.to_evaluate:
    display_header(header='Model Evaluation Section')  # Model Evaluation header
    space_header()
    button_eva = st.button('Evaluate Model')
    if button_eva:
        ss.topics_df = mv.get_model_results(
            corpus=ss.corpus,
            texts=ss.clean_text,
            ldamodel=ss._model,
        )
        # Formatting: turn the index into a column, then name all five columns.
        ss.topics_df = ss.topics_df.reset_index()
        ss.topics_df.columns = [
            "Document_No",
            "Text",
            "Topic_Keywords",
            "Dominant_Topic_Number",
            "Percentage_Contribution",
        ]
        st.info('First few Rows of the Model Output')
        st.write(ss.topics_df.head(10))
        ss.to_visualize = True
################### Model Evaluation with Visualization ###########################
if ss.to_visualize:
    display_header(header='Topics Visualization')  # Topics Visualization header
    space_header()
    button_vis = st.button('Evaluate with Visuals')
    if button_vis:
        # Three visual reports, drawn in this order.
        mv.vis_distribution(n_of_topics, ss.topics_df)
        mv.vis_word_cloud(n_of_topics, ss._model)
        mv.vis_count_n_weight(n_of_topics, ss._model, ss.clean_text)
        st.success('Visualization completed!!!')
        ss.to_download_report = True
################### Downloading Section ###########################
if ss.to_download_report:
    display_header(header='Download Report Section')  # Report Section header
    space_header()
    button_download = st.button('Generate Report Sheet')
    if button_download:
        # The report is only offered when chart1.pdf exists in the working
        # directory (presumably written by the visualisation step - confirm).
        first_chart = Path.cwd() / 'chart1.pdf'
        if first_chart.is_file():
            mv.generate_chart()
            st.success('Report successfuly generated, click the links below to download.')
            st.markdown(get_table_download_link(ss.topics_df), unsafe_allow_html=True)
            st.markdown(get_chat_download_link(), unsafe_allow_html=True)
            mv.del_charts()
            st.balloons()
        else:
            st.info("Kindly click the 'Evaluate with Visuals' button above to first create reports ")