moevm · HadronCollider · Apr 23, 2024 · Jun 29, 2023 · Jul 4, 2023 · Jul 13, 2023
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
@@ -16,7 +16,7 @@
     ['future_dev'],
     ['pres_banned_words_check'],
     ['pres_empty_slide'],
-    ['pres_banned_words_check'],
+    ['theme_in_pres_check'],
     ['verify_git_link'],
 ]
 BASE_REPORT_CRITERION = [
@@ -40,7 +40,8 @@
     ["header_check"],
     ["report_section_component"],
     ["main_text_check"],
-    ["spelling_check"]
+    ["spelling_check"],
+    ["theme_in_report_check"],
 ]
 
 DEFAULT_TYPE = 'pres'

diff --git a/app/main/checks/presentation_checks/__init__.py b/app/main/checks/presentation_checks/__init__.py
@@ -10,5 +10,6 @@
 from .pres_right_words import PresRightWordsCheck
 from .image_share import PresImageShareCheck
 from .banned_words import PresBannedWordsCheck
+from .find_theme_in_pres import FindThemeInPres
 from .verify_git_link import PresVerifyGitLinkCheck
 from .empty_slide_check import PresEmptySlideCheck
diff --git a/app/main/checks/presentation_checks/find_def_sld.py b/app/main/checks/presentation_checks/find_def_sld.py
@@ -3,7 +3,7 @@
 
 class FindDefSld(BasePresCriterion):
     label = "Поиск ключевого слова в заголовках"
-    description = 'Ключевые слова: "Апробация", "Цели и задачи", "Заключение"'
+    description = 'Поиск ключевого слова в заголовках'
     id = 'find_slides'
 
     def __init__(self, file_info, key_slide):
@@ -12,18 +12,18 @@ def __init__(self, file_info, key_slide):
         self.found_idxs = []
 
     def check(self):
-        found_slides = []
         for i, title in enumerate(self.file.get_titles(), 1):
             if str(title).lower().find(str(self.type_of_slide).lower()) != -1:
-                found_slides.append(self.file.get_text_from_slides()[i - 1])
+                #found_slides.append(self.file.get_text_from_slides()[i - 1])
                 self.found_idxs.append(i)
-        if len(found_slides) == 0:
-            self.file.found_index[str(self.type_of_slide)] = None
-            return answer(False, 'Слайд не найден')
+
+        # save for future use
+        self.file.found_index[str(self.type_of_slide)] = self.found_idxs.copy()
+
+        if self.found_idxs:
+            return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, self.format_page_link(self.found_idxs)))))
         else:
-            self.file.found_index[str(self.type_of_slide)] = ''.join(str(item) for item in self.found_idxs)
-            found_idxs_link = self.format_page_link(self.found_idxs)
-            return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, found_idxs_link))))
+            return answer(False, 'Слайд не найден')
 
     @property
     def name(self):

diff --git a/app/main/checks/presentation_checks/find_theme_in_pres.py b/app/main/checks/presentation_checks/find_theme_in_pres.py
@@ -0,0 +1,84 @@
+
+from ..base_check import BasePresCriterion, answer
+from .find_def_sld import FindDefSld
+from app.nlp.stemming import Stemming
+
+import string
+import nltk
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from pymorphy2 import MorphAnalyzer
+
+
+MORPH_ANALYZER = MorphAnalyzer()
+
+
+class FindThemeInPres(BasePresCriterion):
+    """Criterion measuring how much of the presentation theme is mentioned in the slide texts.
+
+    The theme is the title of the first slide that is not skipped; its lemmas are
+    compared with the lemmas of the text of every slide after the first one.
+    """
+
+    label = "Проверка упоминания темы в заголовках презентации"
+    description = """Проверка упоминания темы в заголовках презентации, не включая титульный слайд, слайды "Цели и задачи", "Заключение" """
+    id = 'theme_in_pres_check'
+
+    def __init__(self, file_info, skip_slides_nums=(1,), skip_slides_titles=("Заключение",), limit=60):
+        """Remember the pass limit and collect the slide numbers to skip.
+
+        :param file_info: forwarded to the base criterion and to FindDefSld.
+        :param skip_slides_nums: slide numbers (counted from 1) that are always skipped.
+        :param skip_slides_titles: key words; slides FindDefSld finds for them are skipped too.
+        :param limit: percentage of theme lemmas that must occur in the text for a full pass.
+        """
+        super().__init__(file_info)
+        self.skip_slides_title = skip_slides_titles
+        slides = []
+        for title in self.skip_slides_title:
+            # FindDefSld.check() stores the numbers of the matching slides in found_idxs.
+            find_sld = FindDefSld(file_info=file_info, key_slide=title)
+            find_sld.check()
+            slides.extend(find_sld.found_idxs)
+        self.skip_slides = [*skip_slides_nums, *slides]
+        self.limit = limit
+
+    @staticmethod
+    def _lemmas(text, stop_words, translator):
+        """Return the set of normal forms of the words in text, minus punctuation and stop words."""
+        words = word_tokenize(text.translate(translator))
+        return {MORPH_ANALYZER.parse(word)[0].normal_form for word in words if word.lower() not in stop_words}
+
+    def check(self):
+        """Grade the share of theme lemmas that also occur in the slide texts.
+
+        :return: a failed answer when the theme cannot be determined or none of its words is
+            mentioned, a partial score below the limit, a passed answer otherwise.
+        """
+        stop_words = set(stopwords.words("russian"))
+        translator = str.maketrans('', '', string.punctuation)
+
+        # NOTE(review): slide 1 is skipped by default, so the theme comes from the first
+        # remaining title rather than from the title slide - confirm this is intended.
+        titles = [title for page, title in enumerate(self.file.get_titles(), 1) if page not in self.skip_slides]
+        theme = ''.join(titles[0] or '') if titles else ''
+        lemma_theme = self._lemmas(theme, stop_words, translator)
+        if not lemma_theme:
+            # No usable theme words: report it instead of failing with IndexError / ZeroDivisionError.
+            return answer(False, "Не пройдена! Не удалось определить тему презентации.")
+
+        slide_texts = [text for page, text in enumerate(self.file.get_text_from_slides(), 1) if page > 1]
+        # Join with a space so the last word of one slide cannot merge with the first word of the next.
+        lemma_text = self._lemmas(' '.join(slide_texts), stop_words, translator)
+
+        value_intersection = len(lemma_theme & lemma_text) * 100 // len(lemma_theme)
+
+        if value_intersection == 0:
+            return answer(False, "Не пройдена! В презентации не упоминаются слова, заявленные в теме.")
+        elif value_intersection < self.limit:
+            return answer(
+                round(value_intersection / self.limit, 1),
+                f"Частично пройдена! Процент упоминания темы в вашей презентации ({value_intersection} %) ниже требуемого ({self.limit} %)."
+            )
+        else:
+            return answer(True, f'Пройдена! Процент упоминания темы в презентации: {value_intersection} %')
diff --git a/app/main/checks/presentation_checks/verify_git_link.py b/app/main/checks/presentation_checks/verify_git_link.py
@@ -17,7 +17,7 @@ class PresVerifyGitLinkCheck(BasePresCriterion):
     description = ''
     id = 'verify_git_link'
 
-    def __init__(self, file_info, deep_check=True):
+    def __init__(self, file_info, deep_check=False):
         super().__init__(file_info)
         self.deep_check = deep_check
         self.wrong_repo_ref = []
@@ -59,9 +59,8 @@ def check(self):
                     link = requests.get(i[0])
                     if link.status_code != 200:
                         raise requests.exceptions.ConnectionError
-                    else:
-                        if self.deep_check:
-                            self.deep_check_repo(i, link)
+                    if self.deep_check:
+                        self.deep_check_repo(i, link)
                 except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
                     self.wrong_repo_ref.append(i[0])
         if self.wrong_repo_ref:
@@ -78,7 +77,7 @@ def check(self):
     def deep_check_repo(self, repo, link):
         if re.findall(r'github', repo[0]):
             tree = html.fromstring(link.content)
-            if not tree.xpath("//a[@class ='js-navigation-open Link--primary']"):
+            if not tree.xpath("//a[@class ='Link--primary']"):
                 self.empty_repo_ref.append(repo[0])
 
         # if re.findall(r'gitlab', i[0]):

diff --git a/app/main/checks/report_checks/__init__.py b/app/main/checks/report_checks/__init__.py
@@ -17,6 +17,7 @@
 from .short_sections_check import ReportShortSectionsCheck
 from .simple_check import ReportSimpleCheck
 from .style_check_settings import StyleCheckSettings
+from .find_theme_in_report import FindThemeInReport
 from .headers_at_page_top_check import ReportHeadersAtPageTopCheck
 from .sections_check import LRReportSectionCheck
 from .style_check import ReportStyleCheck

diff --git a/app/main/checks/report_checks/find_theme_in_report.py b/app/main/checks/report_checks/find_theme_in_report.py
@@ -0,0 +1,107 @@
+import re
+import string
+
+from ..base_check import BaseReportCriterion, answer
+
+import  string
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.corpus import stopwords
+from pymorphy2 import MorphAnalyzer
+
+
+MORPH_ANALYZER = MorphAnalyzer()
+
+
+class FindThemeInReport(BaseReportCriterion):
+    """Criterion measuring how much of the report theme is mentioned in the chapter texts.
+
+    The theme is read from page 1 (the words between "тема" and "студент") and its
+    lemmas are compared with the lemmas of the paragraphs of the remaining chapters.
+    """
+
+    description = "Проверка упоминания темы в отчете"
+    id = 'theme_in_report_check'
+
+    # Lower-cased chapter headers whose paragraphs are not searched for the theme.
+    _SKIPPED_HEADERS = ('заключение', "введение", "список использованных источников", "условные обозначения")
+
+    def __init__(self, file_info, limit=40):
+        """Store the pass limit and prepare empty containers for the collected text.
+
+        :param file_info: forwarded to the base criterion.
+        :param limit: minimal percentage of theme lemmas that must occur in the text.
+        """
+        super().__init__(file_info)
+        self.intro = {}
+        self.chapters = []
+        self.text_par = []
+        self.full_text = set()
+        self.limit = limit
+
+    def late_init(self):
+        """Build the chapter list for the configured report type."""
+        self.chapters = self.file.make_chapters(self.file_type['report_type'])
+
+    def check(self):
+        """Grade the share of theme lemmas that also occur in the chapter paragraphs.
+
+        :return: a failed answer when the report has fewer than 4 pages, the theme cannot
+            be determined or the share is below the limit; a passed answer otherwise.
+        """
+        stop_words = set(stopwords.words("russian"))
+        if self.file.page_counter() < 4:
+            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
+
+        self.late_init()
+        # Start from a clean state so a repeated check() does not reuse earlier paragraphs.
+        self.text_par = []
+        self.full_text = set()
+        for chapter in self.chapters:
+            if chapter["text"].lower() in self._SKIPPED_HEADERS:
+                continue
+            self.intro = chapter
+            self.text_par.extend(par['text'].lower() for par in chapter['child'])
+
+        lemma_theme = self.find_theme()
+        if not lemma_theme:
+            # Nothing to compare with; this also keeps the division below away from zero.
+            return answer(False, "Не пройдена! Не удалось определить тему отчета.")
+
+        translator = str.maketrans('', '', string.punctuation)
+        for text in self.text_par:
+            words = word_tokenize(text.translate(translator))
+            self.full_text.update(
+                MORPH_ANALYZER.parse(w)[0].normal_form for w in words if w.lower() not in stop_words
+            )
+
+        value_intersection = len(lemma_theme.intersection(self.full_text)) * 100 // len(lemma_theme)
+        if value_intersection == 0:
+            return answer(False, "Не пройдена! В отчете не упоминаются слова, заявленные в теме отчета.")
+        elif value_intersection < self.limit:
+            # Every non-zero share below the limit fails, including exactly 1 %.
+            return answer(False, f"Не пройдена! Процент упоминания темы в вашем отчете ({value_intersection} %) ниже требуемого ({self.limit} %).")
+        else:
+            return answer(True, f'Пройдена! Процент упоминания темы в ответе: {value_intersection} %.')
+
+    def find_theme(self):
+        """Extract the lemmas of the report theme from page 1.
+
+        The theme is the run of words between "тема" and the next "студент" in the
+        lower-cased, punctuation-free text stored under key 1.
+
+        :return: set of normal forms without stop words; empty when page 1 or one of
+            the marker words is missing.
+        """
+        stop_words = set(stopwords.words("russian"))
+        for key, text_on_page in self.file.pdf_file.get_text_on_page().items():
+            if key != 1:
+                continue
+            words = text_on_page.lower().translate(str.maketrans('', '', string.punctuation)).split()
+            try:
+                start = words.index('тема') + 1
+                end = words.index('студент', start)
+            except ValueError:
+                # A marker word is absent, so the theme cannot be located on this page.
+                return set()
+            return {MORPH_ANALYZER.parse(word)[0].normal_form for word in words[start:end] if word not in stop_words}
+        return set()
diff --git a/app/main/presentations/odp/presentation_odp.py b/app/main/presentations/odp/presentation_odp.py
@@ -12,6 +12,7 @@ def __init__(self, presentation_name):
         self.prs = opendocument.load(presentation_name)
         self.parse_styles()
         self.add_slides()
+        self.found_index = {}
 
     def add_slides(self):
         for slide in self.prs.getElementsByType(draw.Page):