-
Notifications
You must be signed in to change notification settings - Fork 127
/
hot_words_generator.py
106 lines (84 loc) · 3.94 KB
/
hot_words_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sys
import jieba
import jieba.analyse
import numpy as np
import pandas as pd
from wordcloud import WordCloud
from config.config import *
STOPWORDS_PATH = BASE_PATH + '/config/stopwords.txt'
USER_CORPUS = BASE_PATH + '/config/usercorpus.txt'
class HotWordsGenerator:
def concat_all_text(self, text_dir):
"""
read and concatenate all text content in a specific text_dir
:param text_dir:
:return:
"""
all_txt = list()
for each_txt in os.listdir(text_dir):
filepath = text_dir + os.path.sep + each_txt
with open(filepath, mode='rt', encoding='UTF-8') as f:
text = ''.join(f.readlines())
all_txt.append(text)
return ''.join(all_txt)
def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
"""
calculate and show hot words of Job Impression
:param interviewee_comments_dir:
:return:
"""
if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
print('Error! No valid content in {0}'.format(interviewee_comments_dir))
sys.exit(0)
else:
job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}
for k, v in job_and_dir.items():
text = self.concat_all_text(v)
jieba.analyse.set_stop_words(STOPWORDS_PATH)
jieba.load_userdict(USER_CORPUS)
hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
print(frequencies)
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
repeat=False,
mask=mask)
wordcloud.generate_from_frequencies(frequencies)
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
"""
calculate and show hot words of Job Description (JD)
:param jd_dir:
:return:
"""
if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
print('Error! No valid content in {0}'.format(jd_dir))
sys.exit(0)
else:
jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}
for k, v in jd_and_dir.items():
text = "".join(pd.read_excel(v)['详情描述'])
jieba.analyse.set_stop_words(STOPWORDS_PATH)
jieba.load_userdict(USER_CORPUS)
hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())
frequencies = {_[0]: _[1] for _ in hot_words_with_weights}
print(frequencies)
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)
wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
repeat=False,
mask=mask)
wordcloud.generate_from_frequencies(frequencies)
import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
if __name__ == '__main__':
hot_words_generator = HotWordsGenerator()
hot_words_generator.cal_and_show_jd_hot_words()