diff --git a/app/main/checks/report_checks/sw_keywords_check.py b/app/main/checks/report_checks/sw_keywords_check.py index 4562eaad..8ed5eb7d 100644 --- a/app/main/checks/report_checks/sw_keywords_check.py +++ b/app/main/checks/report_checks/sw_keywords_check.py @@ -20,6 +20,8 @@ def __init__(self, file_info, min_key_words = 3): self.chapters = [] self.text_par = [] self.lemme_list = [] + self.translator = str.maketrans('', '', string.punctuation) + self.stop_words = set(stopwords.words("russian")) def late_init(self): self.chapters = self.file.make_chapters(self.file_type['report_type']) @@ -33,7 +35,6 @@ def check(self): key_words_result = [word.strip() for word in final_str.split(',')] if len(key_words_result) < self.min_key_words: return answer(False, f'Не пройдена! Количество ключевых слов должно быть не менее {self.min_key_words}') - stop_words = set(stopwords.words("russian")) if self.file.page_counter() < 4: return answer(False, "В отчете недостаточно страниц. Нечего проверять.") self.late_init() @@ -45,18 +46,10 @@ def check(self): par = intro_par['text'].lower() self.text_par.append(par) for phrase in key_words_result: - words = word_tokenize(phrase) - words_lemma = [MORPH_ANALYZER.parse(w)[0].normal_form for w in words if w.lower() not in stop_words] - phrase_lemma = ' '.join(words_lemma) + phrase_lemma = self.text_to_lemma(phrase) self.lemme_list.append(phrase) for text in self.text_par: - cleaned_text = re.sub(r'<[^>]*>', '', text) - translator = str.maketrans('', '', string.punctuation) - text_without_punct = cleaned_text.translate(translator) - word_in_text = word_tokenize(text_without_punct) - lemma_text = [MORPH_ANALYZER.parse(w)[0].normal_form for w in word_in_text if w.lower() not in stop_words] - lemma_text_str = ' '.join(lemma_text) - if phrase_lemma in lemma_text_str: + if phrase_lemma in self.text_to_lemma(text): del self.lemme_list[-1] break @@ -64,3 +57,11 @@ def check(self): return answer(False, f"Не пройдена! 
def text_to_lemma(self, text):
    """Normalize *text* into a space-joined string of lemmas.

    Pipeline: strip HTML tags, lowercase, remove punctuation (via the
    precomputed ``self.translator`` table), tokenize, drop Russian
    stop words (``self.stop_words``), and replace each remaining token
    with its normal form from the morphological analyzer.

    :param text: raw text (may contain HTML markup and punctuation)
    :return: single string of lemmas separated by spaces
    """
    # Tags first, then punctuation, so '<b>слово</b>' survives as 'слово'.
    stripped = re.sub(r'<[^>]*>', '', text.lower())
    tokens = word_tokenize(stripped.translate(self.translator))
    lemmas = (
        MORPH_ANALYZER.parse(token)[0].normal_form
        for token in tokens
        if token not in self.stop_words
    )
    return ' '.join(lemmas)