Skip to content

Commit

Permalink
resolve conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
MarinaProsche committed Apr 24, 2024
2 parents 26cb9b4 + 63e6848 commit fc2e885
Show file tree
Hide file tree
Showing 23 changed files with 491 additions and 185 deletions.
8 changes: 5 additions & 3 deletions app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,10 +407,12 @@ def mark_celery_task_as_finished(celery_task_id, finished_time=None):
'$set': {'finished_at': finished_time,
'processing_time': (finished_time - celery_task['started_at']).total_seconds()}})


def get_average_processing_time(min_time=5.0):
result = list(celery_check_collection.aggregate(
[{'$match': {"processing_time": {"$lt": 175} }}, {'$group': {'_id': None, 'avg_processing_time': {'$avg': "$processing_time"}}}]))
# use only successful checks (failed checks take much longer than normal to process)
result = list(celery_check_collection.aggregate([
{'$match': {'processing_time': {'$lt': 170}}},
{'$group': {'_id': None, 'avg_processing_time': {'$avg': "$processing_time"}}}
]))
if result and result[0]['avg_processing_time']:
result = result[0]['avg_processing_time']
if result > min_time:
Expand Down
6 changes: 4 additions & 2 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
['future_dev'],
['pres_banned_words_check'],
['pres_empty_slide'],
['pres_banned_words_check'],
['theme_in_pres_check'],
['verify_git_link'],
]
BASE_REPORT_CRITERION = [
Expand All @@ -40,7 +40,9 @@
["header_check"],
["report_section_component"],
["main_text_check"],
["spelling_check"]
["spelling_check"],
["max_abstract_size_check"],
["theme_in_report_check"],
]

DEFAULT_TYPE = 'pres'
Expand Down
3 changes: 2 additions & 1 deletion app/main/checks/presentation_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from .sld_enum import SldEnumCheck
from .sld_num import SldNumCheck
from .sld_similarity import SldSimilarity
from .template_name import TemplateNameCheck
from .template_name import PresTemplateNameCheck
from .title_format import TitleFormatCheck
from .pres_right_words import PresRightWordsCheck
from .image_share import PresImageShareCheck
from .banned_words import PresBannedWordsCheck
from .find_theme_in_pres import FindThemeInPres
from .verify_git_link import PresVerifyGitLinkCheck
from .empty_slide_check import PresEmptySlideCheck
18 changes: 9 additions & 9 deletions app/main/checks/presentation_checks/find_def_sld.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

class FindDefSld(BasePresCriterion):
label = "Поиск ключевого слова в заголовках"
description = 'Ключевые слова: "Апробация", "Цели и задачи", "Заключение"'
description = 'Поиск ключевого слова в заголовках'
id = 'find_slides'

def __init__(self, file_info, key_slide):
Expand All @@ -12,18 +12,18 @@ def __init__(self, file_info, key_slide):
self.found_idxs = []

def check(self):
found_slides = []
for i, title in enumerate(self.file.get_titles(), 1):
if str(title).lower().find(str(self.type_of_slide).lower()) != -1:
found_slides.append(self.file.get_text_from_slides()[i - 1])
#found_slides.append(self.file.get_text_from_slides()[i - 1])
self.found_idxs.append(i)
if len(found_slides) == 0:
self.file.found_index[str(self.type_of_slide)] = None
return answer(False, 'Слайд не найден')

# save for future use
self.file.found_index[str(self.type_of_slide)] = self.found_idxs.copy()

if self.found_idxs:
return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, self.format_page_link(self.found_idxs)))))
else:
self.file.found_index[str(self.type_of_slide)] = ''.join(str(item) for item in self.found_idxs)
found_idxs_link = self.format_page_link(self.found_idxs)
return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, found_idxs_link))))
return answer(False, 'Слайд не найден')

@property
def name(self):
Expand Down
64 changes: 64 additions & 0 deletions app/main/checks/presentation_checks/find_theme_in_pres.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@

from ..base_check import BasePresCriterion, answer
from .find_def_sld import FindDefSld
from app.nlp.stemming import Stemming

import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


MORPH_ANALYZER = MorphAnalyzer()


class FindThemeInPres(BasePresCriterion):
    """Measure how many theme words are mentioned in the presentation text.

    The theme is taken from the title of the first slide that is not in
    ``skip_slides``; its lemmas are compared with the lemmas of the text of
    every slide after the first one.
    """

    label = "Проверка упоминания темы в заголовках презентации"
    description = """Проверка упоминания темы в заголовках презентации, не включая титульный слайд, слайды "Цели и задачи", "Заключение" """
    id = 'theme_in_pres_check'

    def __init__(self, file_info, skip_slides_nums=(1,), skip_slides_titles=("Заключение",), limit=60):
        """Collect the slide numbers that must be ignored when picking the theme.

        Args:
            file_info: passed through to the base criterion and to ``FindDefSld``.
            skip_slides_nums: 1-based slide numbers to skip unconditionally.
            skip_slides_titles: key words; every slide whose title matches one
                of them (as located by ``FindDefSld``) is skipped as well.
            limit: percentage of theme lemmas that must occur in the slide
                text for the check to pass completely.
        """
        super().__init__(file_info)
        self.skip_slides_title = skip_slides_titles
        slides = []
        for title in self.skip_slides_title:
            # FindDefSld.check() fills ``found_idxs`` with the matching slide numbers.
            find_sld = FindDefSld(file_info=file_info, key_slide=title)
            find_sld.check()
            slides.extend(find_sld.found_idxs)
        self.skip_slides = [*skip_slides_nums, *slides]
        self.limit = limit

    @staticmethod
    def _lemmatize(text, stop_words):
        """Return the set of normal forms of the non-stop-words found in ``text``."""
        # Only ASCII punctuation (string.punctuation) is stripped before tokenizing.
        translator = str.maketrans('', '', string.punctuation)
        return {
            MORPH_ANALYZER.parse(word)[0].normal_form
            for word in word_tokenize(text.translate(translator))
            if word.lower() not in stop_words
        }

    def check(self):
        """Compare theme lemmas with slide-text lemmas and grade the overlap.

        Returns:
            ``answer(False, ...)`` when no theme can be determined or no theme
            word occurs in the slides, a fractional ``answer`` when the overlap
            is below ``self.limit``, and ``answer(True, ...)`` otherwise.
        """
        stop_words = set(stopwords.words("russian"))

        titles = [
            title
            for page, title in enumerate(self.file.get_titles(), 1)
            if page not in self.skip_slides
        ]
        # Without this guard ``titles[0]`` raises IndexError when every slide is skipped.
        if not titles:
            return answer(False, "Не пройдена! Не удалось определить тему презентации.")

        # NOTE(review): the theme is read from the first NON-skipped title, i.e.
        # not from slide 1 with the default arguments - confirm this is intended.
        theme = ''.join(titles[0])
        lemma_theme = self._lemmatize(theme, stop_words)
        # An empty theme (blank title or stop-words only) would divide by zero below.
        if not lemma_theme:
            return answer(False, "Не пройдена! Не удалось определить тему презентации.")

        # Join with a space so the last word of one slide is not glued to the
        # first word of the next one.
        slides_text = ' '.join(
            text
            for page, text in enumerate(self.file.get_text_from_slides(), 1)
            if page > 1
        )
        lemma_text = self._lemmatize(slides_text, stop_words)

        # Integer percentage of theme lemmas that also occur in the slide text.
        value_intersection = len(lemma_theme & lemma_text) * 100 // len(lemma_theme)

        if value_intersection == 0:
            return answer(False, "Не пройдена! В презентации не упоминаются слова, заявленные в теме.")
        if value_intersection < self.limit:
            return answer(
                round(value_intersection / self.limit, 1),
                f"Частично пройдена! Процент упоминания темы в вашей презентации ({value_intersection} %) ниже требуемого ({self.limit} %)."
            )
        return answer(True, f'Пройдена! Процент упоминания темы в презентации: {value_intersection} %')
2 changes: 1 addition & 1 deletion app/main/checks/presentation_checks/template_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ..base_check import BasePresCriterion, answer


class TemplateNameCheck(BasePresCriterion):
class PresTemplateNameCheck(BasePresCriterion):
label = "Проверка соответствия названия файла шаблону"
description = 'Шаблон названия: "Презентация_ВКР_Иванов", "ПРЕЗЕНТАЦИЯ_НИР_ИВАНОВ"'
id = 'template_name'
Expand Down
9 changes: 4 additions & 5 deletions app/main/checks/presentation_checks/verify_git_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class PresVerifyGitLinkCheck(BasePresCriterion):
description = ''
id = 'verify_git_link'

def __init__(self, file_info, deep_check=True):
def __init__(self, file_info, deep_check=False):
super().__init__(file_info)
self.deep_check = deep_check
self.wrong_repo_ref = []
Expand Down Expand Up @@ -59,9 +59,8 @@ def check(self):
link = requests.get(i[0])
if link.status_code != 200:
raise requests.exceptions.ConnectionError
else:
if self.deep_check:
self.deep_check_repo(i, link)
if self.deep_check:
self.deep_check_repo(i, link)
except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
self.wrong_repo_ref.append(i[0])
if self.wrong_repo_ref:
Expand All @@ -78,7 +77,7 @@ def check(self):
def deep_check_repo(self, repo, link):
if re.findall(r'github', repo[0]):
tree = html.fromstring(link.content)
if not tree.xpath("//a[@class ='js-navigation-open Link--primary']"):
if not tree.xpath("//a[@class ='Link--primary']"):
self.empty_repo_ref.append(repo[0])

# if re.findall(r'gitlab', i[0]):
Expand Down
4 changes: 3 additions & 1 deletion app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
from .short_sections_check import ReportShortSectionsCheck
from .simple_check import ReportSimpleCheck
from .style_check_settings import StyleCheckSettings
from .find_theme_in_report import FindThemeInReport
from .headers_at_page_top_check import ReportHeadersAtPageTopCheck
from .sections_check import LRReportSectionCheck
from .style_check import ReportStyleCheck
from .spelling_check import SpellingCheck

from .max_abstract_size_check import ReportMaxSizeOfAbstractCheck
from .template_name import ReportTemplateNameCheck
78 changes: 78 additions & 0 deletions app/main/checks/report_checks/find_theme_in_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import re
import string

from ..base_check import BaseReportCriterion, answer

import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


MORPH_ANALYZER = MorphAnalyzer()


class FindThemeInReport(BaseReportCriterion):
    """Measure how many theme words from the title page occur in the report body.

    The theme is the text between the words "тема" and "студент" on page 1 of
    the PDF; it is compared, lemma by lemma, with the paragraphs of every
    chapter whose header is not one of the service sections.
    """

    label = "Проверка упоминания темы в отчете"
    description = "Проверка упоминания темы в отчете"
    id = 'theme_in_report_check'

    # Lower-cased chapter headers whose paragraphs are excluded from the comparison.
    _SKIPPED_HEADERS = ('заключение', "введение", "список использованных источников", "условные обозначения")

    def __init__(self, file_info, limit=40):
        """Store the pass threshold and initialise the working containers.

        Args:
            file_info: passed through to the base criterion.
            limit: percentage of theme lemmas that must occur in the report
                text for the check to pass completely.
        """
        super().__init__(file_info)
        self.intro = {}
        self.chapters = []
        self.text_par = []
        self.full_text = set()
        self.limit = limit

    def late_init(self):
        """Build the chapter list for the current report type."""
        self.chapters = self.file.make_chapters(self.file_type['report_type'])

    def check(self):
        """Grade the overlap between theme lemmas and report-body lemmas.

        Returns:
            ``answer(False, ...)`` when the report is shorter than 4 pages, the
            theme cannot be determined, or no theme word occurs in the body; a
            fractional ``answer`` when the overlap is below ``self.limit``;
            ``answer(True, ...)`` otherwise.
        """
        if self.file.page_counter() < 4:
            return answer(False, "В отчете недостаточно страниц. Нечего проверять.")

        self.late_init()
        # Start from a clean state so a repeated check() does not reuse old text.
        self.text_par = []
        self.full_text = set()
        for chapter in self.chapters:
            if chapter["text"].lower() in self._SKIPPED_HEADERS:
                continue
            self.intro = chapter
            self.text_par.extend(par['text'].lower() for par in chapter['child'])

        lemma_theme = self.find_theme()
        # An empty theme would make the percentage below divide by zero.
        if not lemma_theme:
            return answer(False, "Не пройдена! Не удалось определить тему отчета на титульном листе.")

        stop_words = set(stopwords.words("russian"))
        # Only ASCII punctuation (string.punctuation) is stripped before tokenizing.
        translator = str.maketrans('', '', string.punctuation)
        for text in self.text_par:
            words = word_tokenize(text.translate(translator))
            self.full_text.update(
                MORPH_ANALYZER.parse(w)[0].normal_form
                for w in words
                if w.lower() not in stop_words
            )

        intersection = lemma_theme.intersection(self.full_text)
        # Integer percentage of theme lemmas that also occur in the report body.
        value_intersection = len(intersection) * 100 // len(lemma_theme)
        if value_intersection == 0:
            return answer(False, "Не пройдена! В отчете не упоминаются слова, заявленные в теме отчета.")
        if value_intersection < self.limit:
            return answer(
                round(value_intersection / self.limit, 1),
                f"Частично пройдена! Процент упоминания темы в вашем отчете ({value_intersection} %) ниже требуемого ({self.limit} %)."
            )
        return answer(True, f'Пройдена! Процент упоминания темы в отчете: {value_intersection} %.')

    def find_theme(self):
        """Extract the lemmatised theme words from page 1 of the PDF.

        Returns:
            A set of normal forms of the words between "тема" and the next
            "студент" on page 1, stop-words excluded. The set is empty when
            page 1 or either marker is missing.
        """
        stop_words = set(stopwords.words("russian"))
        pages = self.file.pdf_file.get_text_on_page()
        first_page = next((text for key, text in pages.items() if key == 1), None)
        if first_page is None:
            return set()

        translator = str.maketrans('', '', string.punctuation)
        words = first_page.lower().translate(translator).split()
        try:
            start = words.index('тема') + 1
            # Look for the closing marker only after the opening one.
            end = words.index('студент', start)
        except ValueError:
            # One of the markers is absent, so no theme can be extracted.
            return set()

        return {
            MORPH_ANALYZER.parse(word)[0].normal_form
            for word in words[start:end]
            if word not in stop_words
        }
2 changes: 1 addition & 1 deletion app/main/checks/report_checks/headers_at_page_top_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def check(self):
if self.file_type["report_type"] == 'LR':
for header in self.headers:
found = False
for page_num in range(1, self.pdf.page_count):
for page_num in range(1, self.pdf.page_count_all):
lines = self.pdf.text_on_page[page_num + 1].split("\n")
last_header_line = 0
collected_text = ""
Expand Down
45 changes: 20 additions & 25 deletions app/main/checks/report_checks/image_share_check.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from ..base_check import BaseReportCriterion, answer


class ReportImageShareCheck(BaseReportCriterion):
label = "Проверка доли объема отчёта, приходящейся на изображения"
description = 'Доля изображений (не включая "Приложение") не должна превышать 0,9'
Expand All @@ -13,27 +12,23 @@ def __init__(self, file_info, limit=0.3):
def check(self):
if self.file.page_counter() < 4:
return answer(False, "В отчете недостаточно страниц. Нечего проверять.")
images_height = 0
for image in self.file.inline_shapes:
images_height += image.height.cm
if len(self.file.file.sections):
available_space = self.file.file.sections[0].page_height.cm - self.file.file.sections[0].bottom_margin.cm - \
self.file.file.sections[0].top_margin.cm
images_pages = images_height / available_space
share = images_pages / self.file.page_count
if share > self.limit:
result_str = f'Проверка не пройдена! Изображения в работе занимают около {round(share, 2)} объема ' \
f'документа без учета приложения, ограничение - {round(self.limit, 2)}'
result_str += '''
Если доля отчета, приходящаяся на изображения, больше нормы, попробуйте сделать следующее:
<ul>
<li>Попробуйте перенести малозначимые иллюстрации в Приложение;</li>
<li>Если у вас уже есть раздел Приложение, убедитесь, что количество страниц в отчете посчитано программой без учета приложения;</li>
<li>Если страницы посчитаны программой неверно, убедитесь, что заголовок приложения правильно оформлен;</li>
<li>Убедитесь, что красная строка не сделана с помощью пробелов или табуляции.</li>
</ul>
'''
return answer(False, result_str)
else:
return answer(True, f'Пройдена!')
return answer(False, 'Во время обработки произошла критическая ошибка')
images_height = self.file.pdf_file.page_images(page_without_pril=self.file.page_count)
available_space = self.file.pdf_file.page_height(page_without_pril=self.file.page_count)

images_value = images_height/available_space

if images_value > self.limit:
result_str = f'Проверка не пройдена! Изображения в работе занимают около {round(images_value, 2)} объема ' \
f'документа без учета приложения, ограничение - {round(self.limit, 2)}'
result_str += '''
Если доля отчета, приходящаяся на изображения, больше нормы, попробуйте сделать следующее:
<ul>
<li>Попробуйте перенести малозначимые иллюстрации в Приложение;</li>
<li>Если у вас уже есть раздел Приложение, убедитесь, что количество страниц в отчете посчитано программой без учета приложения;</li>
<li>Если страницы посчитаны программой неверно, убедитесь, что заголовок приложения правильно оформлен;</li>
<li>Убедитесь, что красная строка не сделана с помощью пробелов или табуляции.</li>
</ul>
'''
return answer(False, result_str)
else:
return answer(True, 'Пройдена!')
2 changes: 1 addition & 1 deletion app/main/checks/report_checks/literature_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def count_sources(self):

def search_literature_start_pdf(self):
start_page = 0
end_page = self.file.pdf_file.page_count
end_page = self.file.pdf_file.page_count_all
for i in self.file.pdf_file.text_on_page.keys():
lowercase_str = self.file.pdf_file.text_on_page[i].lower()
if re.search(self.name_pattern, lowercase_str):
Expand Down
Loading

0 comments on commit fc2e885

Please sign in to comment.