Merge branch 'master' of https://github.com/moevm/document_insight_system
moevm · May 20, 2024 · f54ac44 · f54ac44
2 parents 67e35fc + c3101e0
commit f54ac44
Show file tree

Hide file tree

Showing 6 changed files with 32 additions and 15 deletions.
diff --git a/app/main/checks/presentation_checks/sld_similarity.py b/app/main/checks/presentation_checks/sld_similarity.py
@@ -1,6 +1,6 @@
-from nlp.similarity_of_texts import check_similarity
 from utils import get_text_from_slides, tasks_conclusions_feedback
-
+from app.nlp.similarity_of_texts import check_similarity
+from app.nlp.stemming import Stemming
 from ..base_check import BasePresCriterion, answer
 
 
@@ -18,8 +18,6 @@ def __init__(self, file_info, goals='Цель и задачи', conclusion='За
     def check(self):
         goals = get_text_from_slides(self.file, self.goals)
         conclusions = get_text_from_slides(self.file, self.conclusion)
-        if goals == "" or conclusions == "":
-            return answer(False, 'Задач или заключения не существует')
 
         results = check_similarity(goals, conclusions)
 

diff --git a/app/main/checks/report_checks/find_theme_in_report.py b/app/main/checks/report_checks/find_theme_in_report.py
@@ -69,9 +69,13 @@ def find_theme(self):
             if key == 1:
                 lower_text = text_on_page.lower()
                 text_without_punct = lower_text.translate(str.maketrans('', '', string.punctuation))
-                list_full = text_without_punct.split()
-                start = list_full.index('тема') + 1
-                end = list_full.index('студент')
+                list_full = tuple(text_without_punct.split())
+                start, end = 0, len(list_full)
+                for index, value in enumerate(list_full):
+                    if value == "тема":
+                        start = index + 1
+                    elif value in {"студент", "студентка"}:
+                        end = index
                 list_theme = list_full[start:end]
                 lemma_theme = {MORPH_ANALYZER.parse(word)[0].normal_form for word in list_theme if
                                 word not in stop_words}

diff --git a/app/main/checks/report_checks/image_references.py b/app/main/checks/report_checks/image_references.py
@@ -26,12 +26,18 @@ def check(self):
             if not len(self.headers):
                 return answer(False, "Не найдено ни одного заголовка.<br><br>Проверьте корректность использования стилей.")
             number_of_images, all_numbers = self.count_images_vkr()
-            if not number_of_images:
+            count_file_image_object = self.file.pdf_file.get_image_num()
+            if count_file_image_object and not number_of_images:
+                return answer(False, f'В отчёте найдено {count_file_image_object} рисунков, но не найдено ни одной подписи рисунка.<br><br> Если в вашей работе присутствуют рисунки, убедитесь, что для их подписи был '
+                                     f'использован стиль {self.image_style}, и формат: '
+                                     f'"Рисунок <Номер рисунка> — <Название рисунка>".')
+            elif not number_of_images:
                 return answer(True, f'Не найдено ни одного рисунка.<br><br> Если в вашей работе присутствуют рисунки, убедитесь, что для их подписи был '
                                      f'использован стиль {self.image_style}, и формат: '
-                                     f'"Рисунок <Номер рисунка> -- <Название рисунка>".')
+                                     f'"Рисунок <Номер рисунка> — <Название рисунка>".')
         else:
             return answer(False, 'Во время обработки произошла критическая ошибка')
+
         references = self.search_references()
         if len(references.symmetric_difference(all_numbers)) == 0:
             return answer(True, f"Пройдена!")

diff --git a/app/main/reports/pdf_document/pdf_document_manager.py b/app/main/reports/pdf_document/pdf_document_manager.py
@@ -27,6 +27,9 @@ def get_text_on_page(self):
     # def get_text_on_page(self):
     #     return {page + 1: self.pages[page].extract_text() for page in range(self.page_count_all)}
 
+    def get_image_num(self):
+        return len(self.pdf_file.get_page_images(0))
+
     def page_images(self, page_without_pril):
         total_height = 0
         for page_num in range(page_without_pril):

diff --git a/app/nlp/find_tasks_on_slides.py b/app/nlp/find_tasks_on_slides.py
@@ -12,7 +12,7 @@ def compare_sentences(sentence_1, sentence_2):
     stemming = Stemming()
     set_1 = stemming.get_filtered_docs(sentence_1, False)
     set_2 = stemming.get_filtered_docs(sentence_2, False)
-    rvector = set_1.union(set_2)
+    rvector = set_1 #.union(set_2)
     vector_1 = [w in set_1 for w in rvector]
     vector_2 = [w in set_2 for w in rvector]
     cosine_similarity = 1 - distance.cosine(vector_1, vector_2)

diff --git a/app/nlp/similarity_of_texts.py b/app/nlp/similarity_of_texts.py
@@ -6,12 +6,18 @@
 
 def check_similarity(string1, string2):
     stemming = Stemming()
+
+    stemming.parse_text(string2, False)
     further_dev = stemming.further_dev()
-    base_conclusions = stemming.get_sentences(string2, False)
+    base_conclusions = stemming.sentences
     ignore = re.compile('[0-9]+[.]?|Заключение|‹#›')
-    clear_conclusions = [ch for ch in base_conclusions if not re.fullmatch(ignore, ch)]
-    recognized_conclusions = [s for s in clear_conclusions if s != further_dev.get('dev_sentence')]
+    conclusions = [ch for ch in base_conclusions if not re.fullmatch(ignore, ch)]
+    cleaned_conclusions = "\n".join(s for s in conclusions if s != further_dev.get('dev_sentence'))
+
+    tasks = stemming.get_sentences(string1, True)
+    ignore = re.compile('[0-9][.]?|Задачи:|‹#›')  # [:]?
+    cleaned_tasks = "\n".join(task for task in tasks if not re.fullmatch(ignore, task))
 
-    percentage_of_similarity = int(compare_sentences(string1, string2) * 100)
+    percentage_of_similarity = int(compare_sentences(cleaned_tasks, cleaned_conclusions) * 100)
 
-    return percentage_of_similarity, further_dev, recognized_conclusions
+    return percentage_of_similarity, further_dev, conclusions