Skip to content

Commit

Permalink
Update SWKeywordsCheck (convert both the text and the key phrases to lemmas)
Browse files Browse the repository at this point in the history
  • Loading branch information
HadronCollider committed Dec 11, 2024
1 parent 0c94e16 commit 6d94f3c
Showing 1 changed file with 12 additions and 11 deletions.
23 changes: 12 additions & 11 deletions app/main/checks/report_checks/sw_keywords_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def __init__(self, file_info, min_key_words = 3):
self.chapters = []
self.text_par = []
self.lemme_list = []
self.translator = str.maketrans('', '', string.punctuation)
self.stop_words = set(stopwords.words("russian"))

def late_init(self):
self.chapters = self.file.make_chapters(self.file_type['report_type'])
Expand All @@ -33,7 +35,6 @@ def check(self):
key_words_result = [word.strip() for word in final_str.split(',')]
if len(key_words_result) < self.min_key_words:
return answer(False, f'Не пройдена! Количество ключевых слов должно быть не менее {self.min_key_words}')
stop_words = set(stopwords.words("russian"))
if self.file.page_counter() < 4:
return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
self.late_init()
Expand All @@ -45,22 +46,22 @@ def check(self):
par = intro_par['text'].lower()
self.text_par.append(par)
for phrase in key_words_result:
words = word_tokenize(phrase)
words_lemma = [MORPH_ANALYZER.parse(w)[0].normal_form for w in words if w.lower() not in stop_words]
phrase_lemma = ' '.join(words_lemma)
phrase_lemma = self.text_to_lemma(phrase)
self.lemme_list.append(phrase)
for text in self.text_par:
cleaned_text = re.sub(r'<[^>]*>', '', text)
translator = str.maketrans('', '', string.punctuation)
text_without_punct = cleaned_text.translate(translator)
word_in_text = word_tokenize(text_without_punct)
lemma_text = [MORPH_ANALYZER.parse(w)[0].normal_form for w in word_in_text if w.lower() not in stop_words]
lemma_text_str = ' '.join(lemma_text)
if phrase_lemma in lemma_text_str:
if phrase_lemma in self.text_to_lemma(text):
del self.lemme_list[-1]
break

if self.lemme_list:
return answer(False, f"Не пройдена! В тексте не найдены следующие ключевые слова: «{'», «'.join(self.lemme_list)}»")
else:
return answer(True, 'Пройдена!')


def text_to_lemma(self, text):
    """Return ``text`` as a single space-separated string of lemmas.

    Processing steps, in order:
      1. lower-case the text;
      2. remove every span matching ``<[^>]*>``;
      3. delete the characters listed in ``self.translator``
         (built in ``__init__`` from ``string.punctuation``);
      4. tokenize with ``word_tokenize``;
      5. drop tokens found in ``self.stop_words``;
      6. replace each remaining token with the ``normal_form`` of its
         first ``MORPH_ANALYZER`` parse.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<[^>]*>', '', lowered)
    without_punct = without_tags.translate(self.translator)

    lemmas = []
    for token in word_tokenize(without_punct):
        # Stop words are filtered out before lemmatization.
        if token in self.stop_words:
            continue
        lemmas.append(MORPH_ANALYZER.parse(token)[0].normal_form)
    return ' '.join(lemmas)

0 comments on commit 6d94f3c

Please sign in to comment.