analysis/hot_words_generator.py

import sys

import jieba
import jieba.analyse
import numpy as np
import pandas as pd
from wordcloud import WordCloud

from config.config import *

STOPWORDS_PATH = BASE_PATH + '/config/stopwords.txt'
USER_CORPUS = BASE_PATH + '/config/usercorpus.txt'


class HotWordsGenerator:
    def concat_all_text(self, text_dir):
        """
        read and concatenate all text content in a specific text_dir
        :param text_dir:
        :return:
        """
        all_txt = list()
        for each_txt in os.listdir(text_dir):
            filepath = text_dir + os.path.sep + each_txt
            with open(filepath, mode='rt', encoding='UTF-8') as f:
                text = ''.join(f.readlines())
                all_txt.append(text)

        return ''.join(all_txt)

    def cal_and_show_job_impression_hot_words(self, interviewee_comments_dir='../spider/impression'):
        """
        calculate and show hot words of Job Impression
        :param interviewee_comments_dir:
        :return:
        """
        if not os.path.exists(interviewee_comments_dir) or len(os.listdir(interviewee_comments_dir)) == 0:
            print('Error! No valid content in {0}'.format(interviewee_comments_dir))
            sys.exit(0)
        else:
            job_and_dir = {_: os.path.join(interviewee_comments_dir, _) for _ in os.listdir(interviewee_comments_dir)}

            for k, v in job_and_dir.items():
                text = self.concat_all_text(v)
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show()

    def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)
        :param jd_dir:
        :return:
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(0)
        else:
            jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

            for k, v in jd_and_dir.items():
                text = "".join(pd.read_excel(v)['详情描述'])
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show()


if __name__ == '__main__':
    hot_words_generator = HotWordsGenerator()
    hot_words_generator.cal_and_show_jd_hot_words()