resolve conflicts

moevm · Apr 24, 2024 · fc2e885 · fc2e885
2 parents 26cb9b4 + 63e6848
commit fc2e885
Show file tree

Hide file tree

Showing 23 changed files with 491 additions and 185 deletions.
diff --git a/app/db/db_methods.py b/app/db/db_methods.py
@@ -407,10 +407,12 @@ def mark_celery_task_as_finished(celery_task_id, finished_time=None):
         '$set': {'finished_at': finished_time,
                  'processing_time': (finished_time - celery_task['started_at']).total_seconds()}})
 
-
 def get_average_processing_time(min_time=5.0):
-    result = list(celery_check_collection.aggregate(
-        [{'$match': {"processing_time": {"$lt": 175} }}, {'$group': {'_id': None, 'avg_processing_time': {'$avg': "$processing_time"}}}]))
+    # use only successful checks (failed checks take much longer to process than normal ones)
+    result = list(celery_check_collection.aggregate([
+        {'$match': {'processing_time': {'$lt': 170}}},
+        {'$group': {'_id': None, 'avg_processing_time': {'$avg': "$processing_time"}}}
+    ]))
     if result and result[0]['avg_processing_time']:
         result = result[0]['avg_processing_time']
         if result > min_time:

diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
@@ -16,7 +16,7 @@
     ['future_dev'],
     ['pres_banned_words_check'],
     ['pres_empty_slide'],
-    ['pres_banned_words_check'],
+    ['theme_in_pres_check'],
     ['verify_git_link'],
 ]
 BASE_REPORT_CRITERION = [
@@ -40,7 +40,9 @@
     ["header_check"],
     ["report_section_component"],
     ["main_text_check"],
-    ["spelling_check"]
+    ["spelling_check"],
+    ["max_abstract_size_check"],
+    ["theme_in_report_check"],
 ]
 
 DEFAULT_TYPE = 'pres'

diff --git a/app/main/checks/presentation_checks/__init__.py b/app/main/checks/presentation_checks/__init__.py
@@ -5,10 +5,11 @@
 from .sld_enum import SldEnumCheck
 from .sld_num import SldNumCheck
 from .sld_similarity import SldSimilarity
-from .template_name import TemplateNameCheck
+from .template_name import PresTemplateNameCheck
 from .title_format import TitleFormatCheck
 from .pres_right_words import PresRightWordsCheck
 from .image_share import PresImageShareCheck
 from .banned_words import PresBannedWordsCheck
+from .find_theme_in_pres import FindThemeInPres
 from .verify_git_link import PresVerifyGitLinkCheck
 from .empty_slide_check import PresEmptySlideCheck
diff --git a/app/main/checks/presentation_checks/find_def_sld.py b/app/main/checks/presentation_checks/find_def_sld.py
@@ -3,7 +3,7 @@
 
 class FindDefSld(BasePresCriterion):
     label = "Поиск ключевого слова в заголовках"
-    description = 'Ключевые слова: "Апробация", "Цели и задачи", "Заключение"'
+    description = 'Поиск ключевого слова в заголовках'
     id = 'find_slides'
 
     def __init__(self, file_info, key_slide):
@@ -12,18 +12,18 @@ def __init__(self, file_info, key_slide):
         self.found_idxs = []
 
     def check(self):
-        found_slides = []
         for i, title in enumerate(self.file.get_titles(), 1):
             if str(title).lower().find(str(self.type_of_slide).lower()) != -1:
-                found_slides.append(self.file.get_text_from_slides()[i - 1])
+                #found_slides.append(self.file.get_text_from_slides()[i - 1])
                 self.found_idxs.append(i)
-        if len(found_slides) == 0:
-            self.file.found_index[str(self.type_of_slide)] = None
-            return answer(False, 'Слайд не найден')
+
+        # save for future use
+        self.file.found_index[str(self.type_of_slide)] = self.found_idxs.copy()
+
+        if self.found_idxs:
+            return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, self.format_page_link(self.found_idxs)))))
         else:
-            self.file.found_index[str(self.type_of_slide)] = ''.join(str(item) for item in self.found_idxs)
-            found_idxs_link = self.format_page_link(self.found_idxs)
-            return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, found_idxs_link))))
+            return answer(False, 'Слайд не найден')
 
     @property
     def name(self):

diff --git a/app/main/checks/presentation_checks/find_theme_in_pres.py b/app/main/checks/presentation_checks/find_theme_in_pres.py
@@ -0,0 +1,64 @@
+
+from ..base_check import BasePresCriterion, answer
+from .find_def_sld import FindDefSld
+from app.nlp.stemming import Stemming
+
+import string
+import nltk
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from pymorphy2 import MorphAnalyzer
+
+
+MORPH_ANALYZER = MorphAnalyzer()
+
+
+class FindThemeInPres(BasePresCriterion):
+    label = "Проверка упоминания темы в заголовках презентации"
+    description = """Проверка упоминания темы в заголовках презентации, не включая титульный слайд, слайды "Цели и задачи", "Заключение" """
+    id = 'theme_in_pres_check'
+
+    def __init__(self, file_info, skip_slides_nums=(1,), skip_slides_titles=("Заключение",), limit=60):
+        super().__init__(file_info)
+        self.skip_slides_title = skip_slides_titles
+        slides = []
+        for title in self.skip_slides_title:
+            find_sld = FindDefSld(file_info=file_info, key_slide=title)
+            find_sld.check()
+            slides.extend(find_sld.found_idxs)
+        self.skip_slides = [
+            *skip_slides_nums,
+            *slides
+        ]        
+        self.limit = limit
+
+    def check(self):
+        stop_words = set(stopwords.words("russian"))
+
+        text_from_title = [slide for page, slide in enumerate(self.file.get_titles(), 1) if page not in self.skip_slides]
+        theme = ''.join(word for word in text_from_title[0])
+
+        translator = str.maketrans('', '', string.punctuation)
+        theme_without_punct = theme.translate(translator)
+        words_in_theme = word_tokenize(theme_without_punct)
+        lemma_theme = {MORPH_ANALYZER.parse(word)[0].normal_form for word in words_in_theme if word.lower() not in stop_words}
+
+        text_from_slide = [slide for page, slide in enumerate(self.file.get_text_from_slides(), 1) if page > 1]
+        string_from_text = ''.join(text_from_slide)
+
+        text_without_punct = string_from_text.translate(translator)
+        words_in_text = word_tokenize(text_without_punct)
+
+        lemma_text = {MORPH_ANALYZER.parse(word)[0].normal_form for word in words_in_text if word.lower() not in stop_words}
+
+        value_intersection = round(len(lemma_theme.intersection(lemma_text))*100//len(lemma_theme))
+
+        if value_intersection == 0:
+            return answer(False, "Не пройдена! В презентации не упоминаются слова, завяленные в теме.")
+        elif value_intersection < self.limit:
+            return answer(
+                round(value_intersection / self.limit, 1),
+                f"Частично пройдена! Процент упоминания темы в вашей презентации ({value_intersection} %) ниже требуемого ({self.limit} %)."
+            )
+        else:
+            return answer(True, f'Пройдена! Процент упоминания темы в презентации: {value_intersection} %')
diff --git a/app/main/checks/presentation_checks/template_name.py b/app/main/checks/presentation_checks/template_name.py
@@ -3,7 +3,7 @@
 from ..base_check import BasePresCriterion, answer
 
 
-class TemplateNameCheck(BasePresCriterion):
+class PresTemplateNameCheck(BasePresCriterion):
     label = "Проверка соответствия названия файла шаблону"
     description = 'Шаблон названия: "Презентация_ВКР_Иванов", "ПРЕЗЕНТАЦИЯ_НИР_ИВАНОВ"'
     id = 'template_name'

diff --git a/app/main/checks/presentation_checks/verify_git_link.py b/app/main/checks/presentation_checks/verify_git_link.py
@@ -17,7 +17,7 @@ class PresVerifyGitLinkCheck(BasePresCriterion):
     description = ''
     id = 'verify_git_link'
 
-    def __init__(self, file_info, deep_check=True):
+    def __init__(self, file_info, deep_check=False):
         super().__init__(file_info)
         self.deep_check = deep_check
         self.wrong_repo_ref = []
@@ -59,9 +59,8 @@ def check(self):
                     link = requests.get(i[0])
                     if link.status_code != 200:
                         raise requests.exceptions.ConnectionError
-                    else:
-                        if self.deep_check:
-                            self.deep_check_repo(i, link)
+                    if self.deep_check:
+                        self.deep_check_repo(i, link)
                 except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
                     self.wrong_repo_ref.append(i[0])
         if self.wrong_repo_ref:
@@ -78,7 +77,7 @@ def check(self):
     def deep_check_repo(self, repo, link):
         if re.findall(r'github', repo[0]):
             tree = html.fromstring(link.content)
-            if not tree.xpath("//a[@class ='js-navigation-open Link--primary']"):
+            if not tree.xpath("//a[@class ='Link--primary']"):
                 self.empty_repo_ref.append(repo[0])
 
         # if re.findall(r'gitlab', i[0]):

diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
@@ -17,8 +17,10 @@
 from .short_sections_check import ReportShortSectionsCheck
 from .simple_check import ReportSimpleCheck
 from .style_check_settings import StyleCheckSettings
+from .find_theme_in_report import FindThemeInReport
 from .headers_at_page_top_check import ReportHeadersAtPageTopCheck
 from .sections_check import LRReportSectionCheck
 from .style_check import ReportStyleCheck
 from .spelling_check import SpellingCheck
-
+from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
+from .template_name import ReportTemplateNameCheck
diff --git a/app/main/checks/report_checks/find_theme_in_report.py b/app/main/checks/report_checks/find_theme_in_report.py
@@ -0,0 +1,78 @@
+import re
+import string
+
+from ..base_check import BaseReportCriterion, answer
+
+import  string
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from pymorphy2 import MorphAnalyzer
+
+
+MORPH_ANALYZER = MorphAnalyzer()
+
+
+class FindThemeInReport(BaseReportCriterion):
+    label = "Проверка упоминания темы в отчете"
+    description = "Проверка упоминания темы в отчете"
+    id = 'theme_in_report_check'
+
+    def __init__(self, file_info, limit = 40):
+        super().__init__(file_info)
+        self.intro = {}
+        self.chapters = []
+        self.text_par = []
+        self.full_text = set()
+        self.limit = limit
+
+    def late_init(self):
+        self.chapters = self.file.make_chapters(self.file_type['report_type'])
+
+    def check(self):
+        stop_words = set(stopwords.words("russian"))
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+
+        self.late_init()
+        for intro in self.chapters:
+            header = intro["text"].lower()
+            if header not in ['заключение', "введение", "список использованных источников", "условные обозначения"]:
+                self.intro = intro
+                for intro_par in self.intro['child']:
+                    par = intro_par['text'].lower()
+                    self.text_par.append(par)
+        lemma_theme = self.find_theme()
+
+        for text in self.text_par:
+            translator = str.maketrans('', '', string.punctuation)
+            theme_without_punct = text.translate(translator)
+            word_in_text = word_tokenize(theme_without_punct)
+            lemma_text = {MORPH_ANALYZER.parse(w)[0].normal_form for w in word_in_text if w.lower() not in stop_words}
+            self.full_text.update(lemma_text)
+
+        intersection = lemma_theme.intersection(self.full_text)
+        value_intersection = round(len(intersection)*100//len(lemma_theme))
+        if value_intersection == 0:
+            return answer(False, "Не пройдена! В отчете не упоминаются слова, заявленные в теме отчета.")
+        elif value_intersection < self.limit:
+            return answer(
+                          round(value_intersection/self.limit, 1),
+                          f"Частично пройдена! Процент упоминания темы в вашем отчете ({value_intersection} %) ниже требуемого ({self.limit} %)."
+            )
+        else:
+            return answer (True, f'Пройдена! Процент упоминания темы в отчете: {value_intersection} %.')
+
+    def find_theme(self):
+        stop_words = set(stopwords.words("russian"))
+        lemma_theme = []
+        for key, text_on_page in self.file.pdf_file.get_text_on_page().items():
+            if key == 1:
+                lower_text = text_on_page.lower()
+                text_without_punct = lower_text.translate(str.maketrans('', '', string.punctuation))
+                list_full = text_without_punct.split()
+                start = list_full.index('тема') + 1
+                end = list_full.index('студент')
+                list_theme = list_full[start:end]
+                lemma_theme = {MORPH_ANALYZER.parse(word)[0].normal_form for word in list_theme if
+                                word not in stop_words}
+            return lemma_theme
diff --git a/app/main/checks/report_checks/headers_at_page_top_check.py b/app/main/checks/report_checks/headers_at_page_top_check.py
@@ -26,7 +26,7 @@ def check(self):
         if self.file_type["report_type"] == 'LR':
             for header in self.headers:
                 found = False
-                for page_num in range(1, self.pdf.page_count):
+                for page_num in range(1, self.pdf.page_count_all):
                     lines = self.pdf.text_on_page[page_num + 1].split("\n")
                     last_header_line = 0
                     collected_text = ""

diff --git a/app/main/checks/report_checks/image_share_check.py b/app/main/checks/report_checks/image_share_check.py
@@ -1,6 +1,5 @@
 from ..base_check import BaseReportCriterion, answer
 
-
 class ReportImageShareCheck(BaseReportCriterion):
     label = "Проверка доли объема отчёта, приходящейся на изображения"
     description = 'Доля изображений (не включая "Приложение") не должна превышать 0,9'
@@ -13,27 +12,23 @@ def __init__(self, file_info, limit=0.3):
     def check(self):
         if self.file.page_counter() < 4:
             return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
-        images_height = 0
-        for image in self.file.inline_shapes:
-            images_height += image.height.cm
-        if len(self.file.file.sections):
-            available_space = self.file.file.sections[0].page_height.cm - self.file.file.sections[0].bottom_margin.cm - \
-                              self.file.file.sections[0].top_margin.cm
-            images_pages = images_height / available_space
-            share = images_pages / self.file.page_count
-            if share > self.limit:
-                result_str = f'Проверка не пройдена! Изображения в работе занимают около {round(share, 2)} объема ' \
-                             f'документа без учета приложения, ограничение - {round(self.limit, 2)}'
-                result_str += '''
-                            Если доля отчета, приходящаяся на изображения, больше нормы, попробуйте сделать следующее:
-                            <ul>
-                                <li>Попробуйте перенести малозначимые иллюстрации в Приложение;</li>
-                                <li>Если у вас уже есть раздел Приложение, убедитесь, что количество страниц в отчете посчитано программой без учета приложения;</li>
-                                <li>Если страницы посчитаны программой неверно, убедитесь, что заголовок приложения правильно оформлен;</li>
-                                <li>Убедитесь, что красная строка не сделана с помощью пробелов или табуляции.</li>
-                            </ul>
-                            '''
-                return answer(False, result_str)
-            else:
-                return answer(True, f'Пройдена!')
-        return answer(False, 'Во время обработки произошла критическая ошибка')
+        images_height = self.file.pdf_file.page_images(page_without_pril=self.file.page_count)
+        available_space = self.file.pdf_file.page_height(page_without_pril=self.file.page_count)
+
+        images_value = images_height/available_space
+
+        if images_value > self.limit:
+            result_str = f'Проверка не пройдена! Изображения в работе занимают около {round(images_value, 2)} объема ' \
+                         f'документа без учета приложения, ограничение - {round(self.limit, 2)}'
+            result_str += '''
+                        Если доля отчета, приходящаяся на изображения, больше нормы, попробуйте сделать следующее:
+                        <ul>
+                            <li>Попробуйте перенести малозначимые иллюстрации в Приложение;</li>
+                            <li>Если у вас уже есть раздел Приложение, убедитесь, что количество страниц в отчете посчитано программой без учета приложения;</li>
+                            <li>Если страницы посчитаны программой неверно, убедитесь, что заголовок приложения правильно оформлен;</li>
+                            <li>Убедитесь, что красная строка не сделана с помощью пробелов или табуляции.</li>
+                        </ul>
+                        '''
+            return answer(False, result_str)
+        else:
+            return answer(True, 'Пройдена!')
diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py
@@ -138,7 +138,7 @@ def count_sources(self):
 
     def search_literature_start_pdf(self):
         start_page = 0
-        end_page = self.file.pdf_file.page_count
+        end_page = self.file.pdf_file.page_count_all
         for i in self.file.pdf_file.text_on_page.keys():
             lowercase_str = self.file.pdf_file.text_on_page[i].lower()
             if re.search(self.name_pattern, lowercase_str):