# streamlit_setup.py — Streamlit entry point for the Lander resume analyzer.
import lander.text_mining as tm
import lander.file_handling as fh
import streamlit as st
from streamlit_tags import st_tags
import base64
from PIL import Image
import time
import pandas as pd
import pandas as pd
import nltk
import PyPDF2
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# Silence pandas SettingWithCopyWarning globally — main() mutates a sliced
# frame (data_eval) in place; see the NOTE there.
pd.options.mode.chained_assignment = None
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="Lander",
    page_icon='image/lander.png',
)
@st.cache_data
def strip_cluster(data_eval, cluster, im_df):
    '''Slice ``im_df`` down to the rows belonging to one k-means cluster.

    Parameters
    ----------
    data_eval : pd.DataFrame
        Frame carrying a "ClusterName" column; its index is a subset of
        ``im_df``'s index.
    cluster : int-like
        Cluster id to select (coerced with ``int()``; a 1-element array from
        ``KMeans.predict`` also works).
    im_df : pd.DataFrame
        Full job dataframe to slice.

    Returns
    -------
    pd.DataFrame
        Rows of ``im_df`` whose label sits in the requested cluster.
    '''
    # Vectorized boolean mask replaces the original per-row Python loop;
    # same labels selected, same row order preserved.
    mask = data_eval["ClusterName"].astype(int) == int(cluster)
    return im_df.loc[data_eval.index[mask]]
@st.cache_data
def match_score_cs(match_df, content):
    '''Score every job against the resume text and keep the top 100.

    Uses ``tm.similarity_caculator`` (cosine similarity) between ``content``
    and each row's "jobdescription".

    Parameters
    ----------
    match_df : pd.DataFrame
        Candidate jobs; index is a slice of the master frame (NOT a
        RangeIndex).
    content : str
        Raw resume text.

    Returns
    -------
    pd.DataFrame
        Copy of ``match_df`` with a "SimilarityScore" column, sorted
        descending, truncated to the 100 best rows.
    '''
    scores = [
        tm.similarity_caculator(content, match_df['jobdescription'][ind])
        for ind in match_df.index
    ]
    # BUG FIX: the original assigned pd.Series(scores), whose default
    # RangeIndex aligned against match_df's non-Range index and silently
    # produced NaN for most rows. Assigning the plain list is positional.
    result = match_df.copy()
    result['SimilarityScore'] = scores
    return result.sort_values('SimilarityScore', ascending=False).iloc[:100]
@st.cache_data
def match_score_pm(data_eval, cluster, content, im_df, seniority):
    '''Rank jobs in ``cluster`` by skill-keyword overlap with the resume.

    Pipeline: slice the cluster, drop senior-level titles for junior users,
    pre-rank by cosine similarity (top 100), then score each job by the
    percentage of its extracted skills found in the resume.

    Parameters
    ----------
    data_eval : pd.DataFrame
        Frame with "ClusterName", indexed like ``im_df``.
    cluster : int-like
        Cluster id of the resume (from ``KMeans.predict``).
    content : str
        Raw resume text.
    im_df : pd.DataFrame
        Master job dataframe.
    seniority : str
        Free-text seniority level typed by the user.

    Returns
    -------
    pd.DataFrame
        Top 5 jobs with "MatchingPercentage" and "KeywordMatched" columns.
    '''
    pre_match_df = strip_cluster(data_eval, cluster, im_df)
    # BUG FIX: the original tested membership in ['junior' or 'jr' or 'entry'],
    # which `or` short-circuits to ['junior'] — "jr" and "entry" never matched.
    if seniority.lower() in ('junior', 'jr', 'entry'):
        # `== False` (not `~`) intentionally also drops rows whose title is NaN.
        pre_match_df = pre_match_df[pre_match_df["jobtitle"].str.contains("Senior|Sr|senior|Manager|Principal") == False]
    match_df = match_score_cs(pre_match_df, content)
    scores = []
    matches_kws = []
    for ind in match_df.index:
        # "Extracted Skills" is a comma-separated string; compare lowercase.
        skills = [k.lower() for k in str(match_df['Extracted Skills'][ind]).split(',')]
        hits = tm.keyword_matching(content.lower(), skills)
        matches_kws.append(','.join(hits))
        scores.append((len(hits) / len(skills)) * 100)
    # BUG FIX: assign plain lists positionally. The original wrapped them in
    # pd.Series(...), whose RangeIndex misaligned with match_df's index and
    # produced NaN scores, breaking the sort below.
    match_df['MatchingPercentage'] = scores
    match_df['KeywordMatched'] = matches_kws
    # Return the 5 best keyword matches.
    return match_df.sort_values('MatchingPercentage', ascending=False).iloc[:5]
def main():
    '''Render the Lander Streamlit app: sidebar navigation plus the
    Analyzer, Home, and Database pages.

    Side effects: reads ./data/skill_extracted_df.csv and image/lander.png,
    fits a TF-IDF + KMeans model on every run, and drives the Streamlit UI.
    '''
    img = Image.open('image/lander.png')
    st.image(img)
    st.sidebar.markdown("...Please Choose Something...")
    activities = ["Home", "Analyzer", "Database"]
    choice = st.sidebar.selectbox("Please select: Home to know more about the project"
                                  + "\n Analyzer to analyze your resume"
                                  + "\n Database to learn more about the dataset and download it if you need", activities)
    # Master job dataframe; assumed columns include jobtitle, company,
    # jobdescription, Extracted Skills — TODO confirm against the CSV.
    im_df = pd.read_csv('./data/skill_extracted_df.csv', index_col = 0)
    # prepare a data frame of only skills and job title to train
    col = ['jobtitle', 'Extracted Skills']
    data_eval = im_df[col]
    # Drop rows with missing data
    # NOTE(review): in-place dropna on a slice of im_df relies on the
    # chained-assignment warning being silenced at module level.
    data_eval.dropna(subset=['Extracted Skills'], inplace=True)
    data_forfit = data_eval['Extracted Skills']
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 0.001, use_idf=True, stop_words= 'english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_forfit)
    # generate k-cluster
    num_clusters = 26
    # NOTE(review): no random_state, so cluster ids differ between runs.
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.predict(tfidf_matrix)
    #add cluster name into the df
    data_eval["ClusterName"] = clusters
    if choice == 'Analyzer':
        # Load job description file
        # Collecting Miscellaneous Information
        act_name = st.text_input('Please enter your name')
        seniority = st.text_input('Please enter your seniority level in tech')
        # Upload Resume
        st.markdown('''<h5 style='text-align: left; color: #021659;'> Upload Your Resume, And Get Smart Recommendations</h5>''',unsafe_allow_html=True)
        ## file upload in pdf format
        pdf_file = st.file_uploader("Please upload your Resume", type=["pdf"])
        if pdf_file is not None:
            ### parsing and extracting whole resume
            content = fh.load_data_pdf(pdf_file)
            if content:
                ## Showing Analyzed data from (resume_data)
                st.header("Here is your Resume report")
                st.success("Hello "+ act_name)
                st.subheader("Below is your basic info")
                # NOTE(review): bare except silently hides render errors here.
                try:
                    st.text('Name: '+ act_name)
                    st.text('Seniority Level: '+ seniority)
                except:
                    pass
                st.subheader("Below is top 5 job matches your skills and info")
                # predict the resume's cluster (1-element array)
                cluster = km.predict(tfidf_vectorizer.transform([content]))
                # Put all rows having matched cluster into a new dataframe with matching score
                top_df = match_score_pm(data_eval, cluster, content, im_df, seniority)
                # Return missing keyphrase from a job
                # x/y are distinct widget keys for the two st_tags per job.
                x = 1
                y = 10
                for ind in top_df.index:
                    st.success("🎆 Your skills is matched to " + top_df['jobtitle'][ind] + " at " + top_df['company'][ind] + " 🎆")
                    key = top_df['Extracted Skills'][ind]
                    key_list = [k.lower() for k in key.split(',')]
                    matched_key = str(top_df['KeywordMatched'][ind])
                    matched_key_list = matched_key.split(',')
                    # skills the job asks for that the resume lacks
                    missing = []
                    for kw in key_list:
                        if kw not in matched_key_list:
                            missing.append(kw)
                    st_tags(label=' Your matched skills with this job are',
                            text='See our skills recommendation below',value=matched_key_list,key = x)
                    st_tags(label='### Recommended skills for you to boost chance with this job title',
                            text='Recommended skills generated from System',value= missing, key = y)
                    st.markdown('''<h5 style='text-align: left; color: #1ed760;'>Adding this skills to resume will boost🚀 the chances of getting a Job</h5>''',unsafe_allow_html=True)
                    with st.expander(label="Click to display Job Description"):
                        st.markdown(top_df['jobdescription'][ind])
                    x += 1
                    y += 1
    elif choice == 'Home':
        st.subheader("Lander: Together we shoot for the moon")
        st.markdown('''
        <p align='justify'>
        A text-mining based tool to help student with finding the most compatible job post based on their past experiences and interesrs as well as optimizing their resume by a keyword suggesting system.
        </p>
        <p align="justify">
        <b>Analyzer -</b> <br/>
        In the Side Bar select Analyzer option to start the process by filling out the required fields and uploading your resume in pdf format.<br/>
        Just sit back and relax our tool will do the magic on it's own.<br/><br/>
        <b>Database -</b> <br/>
        A place where user can explore and download Lander's skills database.<br/><br/>
        </p><br/><br/>
        ''',unsafe_allow_html=True)
    elif choice == "Database":
        option = ['Download Database', "Cluster Plotting"]
        choice1 = st.selectbox(
            'What action do you want to choose', option)
        if choice1 == 'Download Database':
            st.subheader("You can download Lander's job and skills dataset below")
            with open("./data/skill_extracted_df.csv", "rb") as file:
                st.download_button(
                    label="Download database",
                    data=file,
                    file_name="lander_skill_extracted_df.csv",
                    mime="text/csv"
                )
            st.success("Now you can do something with this database!!!")
        elif choice1 == "Cluster Plotting":
            st.subheader("Preview top keyword in each clusters with relevant job titles")
            # one plot per cluster; 6 = top features shown per cluster
            dfs = tm.get_top_features_cluster(tfidf_matrix.toarray(), clusters, 6, tfidf_vectorizer)
            for i in range(0,26):
                st.pyplot(tm.plotWords(dfs, 6, data_eval, i))
main()