Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

442 theme in text #456

Merged
merged 6 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
['future_dev'],
['pres_banned_words_check'],
['pres_empty_slide'],
['pres_banned_words_check'],
['theme_in_pres_check'],
['verify_git_link'],
]
BASE_REPORT_CRITERION = [
Expand All @@ -40,7 +40,8 @@
["header_check"],
["report_section_component"],
["main_text_check"],
["spelling_check"]
["spelling_check"],
["theme_in_report_check"],
]

DEFAULT_TYPE = 'pres'
Expand Down
1 change: 1 addition & 0 deletions app/main/checks/presentation_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
from .pres_right_words import PresRightWordsCheck
from .image_share import PresImageShareCheck
from .banned_words import PresBannedWordsCheck
from .find_theme_in_pres import FindThemeInPres
from .verify_git_link import PresVerifyGitLinkCheck
from .empty_slide_check import PresEmptySlideCheck
18 changes: 9 additions & 9 deletions app/main/checks/presentation_checks/find_def_sld.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

class FindDefSld(BasePresCriterion):
label = "Поиск ключевого слова в заголовках"
description = 'Ключевые слова: "Апробация", "Цели и задачи", "Заключение"'
description = 'Поиск ключевого слова в заголовках'
id = 'find_slides'

def __init__(self, file_info, key_slide):
Expand All @@ -12,18 +12,18 @@ def __init__(self, file_info, key_slide):
self.found_idxs = []

def check(self):
found_slides = []
for i, title in enumerate(self.file.get_titles(), 1):
if str(title).lower().find(str(self.type_of_slide).lower()) != -1:
found_slides.append(self.file.get_text_from_slides()[i - 1])
#found_slides.append(self.file.get_text_from_slides()[i - 1])
self.found_idxs.append(i)
if len(found_slides) == 0:
self.file.found_index[str(self.type_of_slide)] = None
return answer(False, 'Слайд не найден')

# save for future use
self.file.found_index[str(self.type_of_slide)] = self.found_idxs.copy()

if self.found_idxs:
return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, self.format_page_link(self.found_idxs)))))
else:
self.file.found_index[str(self.type_of_slide)] = ''.join(str(item) for item in self.found_idxs)
found_idxs_link = self.format_page_link(self.found_idxs)
return answer(True, 'Найден под номером: {}'.format(', '.join(map(str, found_idxs_link))))
return answer(False, 'Слайд не найден')

@property
def name(self):
Expand Down
64 changes: 64 additions & 0 deletions app/main/checks/presentation_checks/find_theme_in_pres.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@

from ..base_check import BasePresCriterion, answer
from .find_def_sld import FindDefSld
from app.nlp.stemming import Stemming

import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


MORPH_ANALYZER = MorphAnalyzer()


class FindThemeInPres(BasePresCriterion):
    """Check how many theme words are mentioned in the presentation text.

    The theme is taken from the first slide title that is not skipped; it is
    lemmatised and compared with the lemmatised text of every slide after the
    first one. The result is the percentage of theme lemmas found in the text.

    NOTE(review): with the default ``skip_slides_nums=(1,)`` the theme is read
    from the first title *after* the title slide - confirm this is intended.
    """

    label = "Проверка упоминания темы в заголовках презентации"
    description = """Проверка упоминания темы в заголовках презентации, не включая титульный слайд, слайды "Цели и задачи", "Заключение" """
    id = 'theme_in_pres_check'

    # ASCII punctuation plus typographic marks that are common in Russian
    # text; without them tokens such as "«" or "—" would count as words.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation + '«»“”„—–…')

    def __init__(self, file_info, skip_slides_nums=(1,), skip_slides_titles=("Заключение",), limit=60):
        """Collect the numbers of the slides whose titles must be ignored.

        :param file_info: passed unchanged to the base criterion and to
            ``FindDefSld``.
        :param skip_slides_nums: 1-based slide numbers to skip.
        :param skip_slides_titles: key titles; every slide found for them by
            ``FindDefSld`` is skipped as well.
        :param limit: percentage of theme lemmas required for a full pass.
        """
        super().__init__(file_info)
        self.skip_slides_title = skip_slides_titles
        slides = []
        for title in self.skip_slides_title:
            find_sld = FindDefSld(file_info=file_info, key_slide=title)
            find_sld.check()
            slides.extend(find_sld.found_idxs)
        self.skip_slides = [*skip_slides_nums, *slides]
        self.limit = limit

    @classmethod
    def _lemmas(cls, text, stop_words):
        """Return the set of normal forms of the non-stop-words in *text*."""
        tokens = word_tokenize(text.translate(cls._PUNCT_TABLE))
        return {
            MORPH_ANALYZER.parse(token)[0].normal_form
            for token in tokens
            if token.lower() not in stop_words
        }

    def check(self):
        """Return the verdict for the theme-mention percentage.

        ``answer(False, ...)`` when the theme cannot be determined or none of
        its words occur, a fractional score when the percentage is below
        ``self.limit``, ``answer(True, ...)`` otherwise.
        """
        stop_words = set(stopwords.words("russian"))

        titles = [
            title
            for page, title in enumerate(self.file.get_titles(), 1)
            if page not in self.skip_slides
        ]
        if not titles:
            # Previously an IndexError on ``titles[0]``.
            return answer(
                False,
                "Не пройдена! Не удалось определить тему презентации: не найдено ни одного подходящего заголовка."
            )

        lemma_theme = self._lemmas(''.join(titles[0]), stop_words)
        if not lemma_theme:
            # Previously a ZeroDivisionError when computing the percentage.
            return answer(
                False,
                "Не пройдена! Не удалось определить тему презентации: заголовок не содержит значимых слов."
            )

        slide_texts = [
            text
            for page, text in enumerate(self.file.get_text_from_slides(), 1)
            if page > 1
        ]
        # Join with a space so that the last word of one slide is not fused
        # with the first word of the next one.
        lemma_text = self._lemmas(' '.join(slide_texts), stop_words)

        value_intersection = len(lemma_theme & lemma_text) * 100 // len(lemma_theme)

        if value_intersection == 0:
            return answer(False, "Не пройдена! В презентации не упоминаются слова, заявленные в теме.")
        if value_intersection < self.limit:
            return answer(
                round(value_intersection / self.limit, 1),
                f"Частично пройдена! Процент упоминания темы в вашей презентации ({value_intersection} %) ниже требуемого ({self.limit} %)."
            )
        return answer(True, f'Пройдена! Процент упоминания темы в презентации: {value_intersection} %')
9 changes: 4 additions & 5 deletions app/main/checks/presentation_checks/verify_git_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class PresVerifyGitLinkCheck(BasePresCriterion):
description = ''
id = 'verify_git_link'

def __init__(self, file_info, deep_check=True):
def __init__(self, file_info, deep_check=False):
super().__init__(file_info)
self.deep_check = deep_check
self.wrong_repo_ref = []
Expand Down Expand Up @@ -59,9 +59,8 @@ def check(self):
link = requests.get(i[0])
if link.status_code != 200:
raise requests.exceptions.ConnectionError
else:
if self.deep_check:
self.deep_check_repo(i, link)
if self.deep_check:
self.deep_check_repo(i, link)
except (requests.exceptions.SSLError, requests.exceptions.ConnectionError):
self.wrong_repo_ref.append(i[0])
if self.wrong_repo_ref:
Expand All @@ -78,7 +77,7 @@ def check(self):
def deep_check_repo(self, repo, link):
if re.findall(r'github', repo[0]):
tree = html.fromstring(link.content)
if not tree.xpath("//a[@class ='js-navigation-open Link--primary']"):
if not tree.xpath("//a[@class ='Link--primary']"):
self.empty_repo_ref.append(repo[0])

# if re.findall(r'gitlab', i[0]):
Expand Down
1 change: 1 addition & 0 deletions app/main/checks/report_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .short_sections_check import ReportShortSectionsCheck
from .simple_check import ReportSimpleCheck
from .style_check_settings import StyleCheckSettings
from .find_theme_in_report import FindThemeInReport
from .headers_at_page_top_check import ReportHeadersAtPageTopCheck
from .sections_check import LRReportSectionCheck
from .style_check import ReportStyleCheck
Expand Down
75 changes: 75 additions & 0 deletions app/main/checks/report_checks/find_theme_in_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import re
import string

from ..base_check import BaseReportCriterion, answer

import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer


MORPH_ANALYZER = MorphAnalyzer()


class FindThemeInReport(BaseReportCriterion):

description = "Проверка упоминания темы в отчете"
id = 'theme_in_report_check'

def __init__(self, file_info, limit=40):
    """Prepare empty accumulators for the theme-mention check.

    :param file_info: passed unchanged to the base criterion initializer.
    :param limit: threshold that ``check`` compares the theme-mention
        percentage against.
    """
    super().__init__(file_info)
    self.limit = limit
    # Containers below start empty and are filled in while checking.
    self.chapters, self.text_par = [], []
    self.intro = {}
    self.full_text = set()

def late_init(self):
    """Build ``self.chapters`` for the report type stored in ``self.file_type``."""
    report_type = self.file_type['report_type']
    self.chapters = self.file.make_chapters(report_type)

def check(self):
stop_words = set(stopwords.words("russian"))
if self.file.page_counter() < 4:
return answer(False, "В отчете недостаточно страниц. Нечего проверять.")

self.late_init()
for intro in self.chapters:
header = intro["text"].lower()
if header not in ['заключение', "введение", "список использованных источников", "условные обозначения"]:
self.intro = intro
for intro_par in self.intro['child']:
par = intro_par['text'].lower()
self.text_par.append(par)
lemma_theme = self.find_theme()
Comment on lines +37 to +44
Copy link
Collaborator

@HadronCollider HadronCollider Jul 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я в процессе небольшой разработки/обновления представления текста отчета для проверок — простая структура вида:

{
    'chapter_name': '...',
    'text': 'full chapter text'
}

чтобы избавиться от всяких списков параграфов и прочего. Аналогично с презентациями (если там такого нет).

Если не успею сделать - пока оставим так


for text in self.text_par:
translator = str.maketrans('', '', string.punctuation)
theme_without_punct = text.translate(translator)
word_in_text = word_tokenize(theme_without_punct)
lemma_text = {MORPH_ANALYZER.parse(w)[0].normal_form for w in word_in_text if w.lower() not in stop_words}
self.full_text.update(lemma_text)

intersection = lemma_theme.intersection(self.full_text)
value_intersection = round(len(intersection)*100//len(lemma_theme))
if value_intersection == 0:
return answer(False, f"Не пройдена! В отчете не упоминаются слова, завяленные в теме отчета.")
elif 1 < value_intersection < self.limit:
return answer(False, f"Не пройдена! Процент упоминания темы в вашем отчете ({value_intersection} %) ниже требуемого ({self.limit} %).")
else:
return answer (True, f'Пройдена! Процент упоминания темы в ответе: {value_intersection} %.')

def find_theme(self):
    """Extract the lemmatised theme words from the first page of the PDF.

    The theme is the run of words between the marker word "тема" and the
    next marker word "студент" on page 1, with punctuation and Russian
    stop-words removed.

    :return: set of normal forms of the theme words. The set is empty when
        page 1 is missing or empty, a marker is not found, or no meaningful
        words lie between the markers; callers must treat an empty set as
        "theme not found".
    """
    pages = self.file.pdf_file.get_text_on_page()
    # Only the first page is relevant; stop at the first matching key.
    title_page = next((text for key, text in pages.items() if key == 1), None)
    if not title_page:
        # Always return a set: the caller applies set operations to the result.
        return set()

    words = title_page.lower().translate(str.maketrans('', '', string.punctuation)).split()
    try:
        start = words.index('тема') + 1
        # Look for the end marker only after the start marker, so an earlier
        # "студент" on the page cannot produce a wrong or empty slice.
        end = words.index('студент', start)
    except ValueError:
        return set()

    stop_words = set(stopwords.words("russian"))
    return {
        MORPH_ANALYZER.parse(word)[0].normal_form
        for word in words[start:end]
        if word not in stop_words
    }
1 change: 1 addition & 0 deletions app/main/presentations/odp/presentation_odp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(self, presentation_name):
self.prs = opendocument.load(presentation_name)
self.parse_styles()
self.add_slides()
self.found_index = {}

def add_slides(self):
for slide in self.prs.getElementsByType(draw.Page):
Expand Down
Loading