Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

466 criteria pack for md test #471

Merged
merged 25 commits into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,13 @@
["banned_words_in_literature"],
["page_counter"],
["image_share_check"],
# ["headers_at_page_top_check", {"headers": ["Приложение А Исходный код программы"]}],
["headers_at_page_top_check", {"headers": ["Приложение А Исходный код программы"]}],
["headers_at_page_top_check"],
["lr_sections_check"],
["style_check"],
["short_sections_check"],
["banned_words_check"],
["right_words_check"],
["banned_words_in_literature"],
["literature_references"],
["image_references"],
["table_references"],
Expand Down
8 changes: 5 additions & 3 deletions app/main/checks/report_checks/banned_words_in_literature.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ def __init__(self, file_info, banned_words=["wikipedia"]):
self.literature_header = []
self.banned_words = [morph.normal_forms(word)[0] for word in banned_words]
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
self.md_name_pattern = r'<h2>список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)</h2>'

def late_init_vkr(self):
self.literature_header = self.file.find_literature_vkr(self.file_type['report_type'])
self.headers_page = self.file.find_header_page(self.file_type['report_type'])
# self.headers_page = self.file.find_header_page(self.file_type['report_type'])
self.lit_page = self.file.find_literature_page(self.file_type['report_type'])

def check(self):
if self.file.page_counter() < 4:
Expand Down Expand Up @@ -52,7 +54,7 @@ def check(self):
for i in sorted(detected_words_dict.keys()):
result_str += f"Абзац {i}: {detected_words_dict[i]}.<br>"
return answer(False, f'Есть запрещенные слова в списке источников '
f'{self.format_page_link([self.headers_page])}:<br><br>{result_str}')
f'{self.format_page_link([self.lit_page])}:<br><br>{result_str}')
return answer(True, f"Пройдена!")

def find_banned_words(self, list_of_literature):
Expand Down Expand Up @@ -83,6 +85,6 @@ def start_of_literature_chapter(self, ):
start_index = 0
for i in range(len(self.file.paragraphs)):
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
if re.fullmatch(self.name_pattern, text_string):
if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А что будет, если загрузить не md-файл, простую ВКР в docx? это ведь общий критерий

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Удалила ненужный паттерн для md

start_index = i
return start_index
5 changes: 4 additions & 1 deletion app/main/checks/report_checks/image_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i])
else:
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
if detected_references:
for reference in detected_references:
for one_part in re.split(r'[Рр]ис\.|,| ', reference):
Expand Down
24 changes: 17 additions & 7 deletions app/main/checks/report_checks/literature_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000):
self.headers = []
self.literature_header = []
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
self.md_name_pattern = r"<h2>(Список использованных источников|Список использованной литературы)<\/h2>"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Почему тут другой паттерн? https://github.com/moevm/mse_auto_checking_slides_vaganov/pull/471/files#diff-4a9d734369983549e0972bd59f7620e6ec4e30bed37af49ced2fcdc06ab558fbR16

Возможно, его (их / другие regexp) стоит вынести в класс файла (или uploader'а), чтобы использовать в нужных местах проверок и не теряться в их обилии

self.min_ref = min_ref
self.max_ref = max_ref

Expand Down Expand Up @@ -77,7 +78,10 @@ def check(self):
def search_references(self, start_par):
array_of_references = set()
for i in range(0, start_par):
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i])
else:
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А в app/main/checks/report_checks/image_references.py используется self.file.paragraphs[i].paragraph_text - в чем разница?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Исправила на "paragraph_text", работает корректно

if detected_references:
for reference in detected_references:
for one_part in re.split(r'[\[\],]', reference):
Expand All @@ -86,16 +90,22 @@ def search_references(self, start_par):
for k in range(int(start), int(end) + 1):
array_of_references.add(k)
elif one_part != '':
array_of_references.add(int(one_part))
array_of_references.add(int(one_part))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Пустые изменения

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Исправила

return array_of_references

def find_start_paragraph(self):
start_index = 0
for i in range(len(self.file.paragraphs)):
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
if re.fullmatch(self.name_pattern, text_string):
start_index = i
break
if isinstance(self.file.paragraphs[i], str):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • В app/main/checks/report_checks/banned_words_in_literature.py логики для name_pattern нет, а тут есть

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Исправила, работает корректно

text_string = self.file.paragraphs[i].lower()
if re.fullmatch(self.md_name_pattern, text_string):
start_index = i
break
else:
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
if re.fullmatch(self.name_pattern, text_string):
start_index = i
break
return start_index

def count_sources_vkr(self, header):
Expand Down Expand Up @@ -142,4 +152,4 @@ def search_literature_start_pdf(self):
if re.search('приложение а[\n .]', lowercase_str):
end_page = i
break
return start_page, end_page
return start_page, end_page
4 changes: 2 additions & 2 deletions app/main/checks/report_checks/section_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@


class ReportSectionComponent(BaseReportCriterion):
description = "Проверка наличия необходимых компонент указанного раздела"
description = "Проверка наличия необходимых компонентов указанного раздела"
id = 'report_section_component'

def __init__(self, file_info, chapter='Введение', patterns=('цель', 'задачи', 'объект', 'предмет')):
def __init__(self, file_info, chapter='Введение', patterns=('цель', 'задач', 'объект', 'предмет')):
super().__init__(file_info)
self.intro = {}
self.chapter = chapter
Expand Down
5 changes: 4 additions & 1 deletion app/main/checks/report_checks/table_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i])
else:
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Аналогично про

А в `app/main/checks/report_checks/image_references.py` используется
`self.file.paragraphs[i].paragraph_text` - в чем разница?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Оставила "paragraph_text"

if detected_references:
for reference in detected_references:
for one_part in re.split(r'таблиц[аеыу]| ', reference):
Expand Down
12 changes: 11 additions & 1 deletion app/main/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from main.presentations import PresentationPPTX
from main.reports.docx_uploader import DocxUploader
from main.reports.md_uploader import MdUpload
from utils import convert_to

logger = logging.getLogger('root_logger')
Expand All @@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)
elif tmp_filepath.endswith(('.doc', '.odt', '.docx')):
elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.")
new_filepath = convert_to(filepath, target_format='docx')

docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)
docx.parse()
file_object = docx

elif tmp_filepath.endswith('.md' ):
new_filepath = filepath
doc = MdUpload(new_filepath)
md_text = doc.upload()
doc.parse(md_text)
file_object = doc

else:
raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath)
# Если была конвертация, то удаляем временный файл.
Expand Down
2 changes: 0 additions & 2 deletions app/main/reports/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,10 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру
```bash
$ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file
```

## `MD`

Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout.

```bash
$ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file
```

31 changes: 31 additions & 0 deletions app/main/reports/document_uploader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from abc import ABC, abstractmethod

class DocumentUploader(ABC):
    """Abstract interface shared by the report uploaders.

    Report checks talk to the uploaded file only through these methods, so
    every concrete uploader must provide all abstract methods below before
    it can be instantiated.

    NOTE(review): concrete uploaders may accept extra arguments (for example
    the docx uploader's ``upload`` takes a file path and an optional PDF
    path); the signatures here only fix the method names.
    """

    @abstractmethod
    def upload(self):
        """Load the source document into the uploader."""

    @abstractmethod
    def parse(self):
        """Parse the previously uploaded document."""

    @abstractmethod
    def parse_effective_styles(self):
        """Compute the effective styles of the parsed document.

        NOTE(review): the exact result shape is uploader-specific -- confirm
        against the concrete implementation.
        """

    @abstractmethod
    def page_counter(self):
        """Return the number of pages in the document.

        Checks compare the result with an integer threshold.
        """

    @abstractmethod
    def make_chapters(self, work_type):
        """Build the chapter structure of the document for *work_type*."""

    @abstractmethod
    def find_header_page(self, work_type):
        """Return the page on which the headers of *work_type* are found."""

    @abstractmethod
    def find_literature_vkr(self, work_type):
        """Return the literature (list of sources) header for *work_type*."""

    def find_literature_page(self, work_type):
        """Return the page on which the list of sources starts.

        Checks call this method on the uploaded file, so it is declared here
        to keep the interface in step with its callers.  It is deliberately
        NOT abstract: uploaders written before it was added to the interface
        can still be instantiated, and fail only if the method is actually
        used.

        Raises:
            NotImplementedError: if the concrete uploader does not override
                this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement find_literature_page()"
        )
14 changes: 13 additions & 1 deletion app/main/reports/docx_uploader/docx_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from .style import Style
from .table import Table, Cell
from ..pdf_document.pdf_document_manager import PdfDocumentManager
from ..document_uploader import DocumentUploader


class DocxUploader:
class DocxUploader(DocumentUploader):
def __init__(self):
self.inline_shapes = []
self.core_properties = None
Expand All @@ -27,6 +28,7 @@ def __init__(self):
self.first_lines = []
self.literature_header = []
self.headers_page = 0
self.literature_page = 0

def upload(self, file, pdf_filepath=''):
self.file = docx.Document(file)
Expand Down Expand Up @@ -122,6 +124,16 @@ def find_header_page(self, work_type):
self.headers_page = header["page"]
break
return self.headers_page

def find_literature_page(self, work_type):
if not self.literature_page:
for k, v in self.pdf_file.text_on_page.items():
line = v[:40] if len(v) > 21 else v
if re.search('СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ', line.strip()):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В некоторых критериях используется regexp с более широким спектром возможных вариаций названия - а почему тут в таком виде?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Исправлено на более широкое выражение:
'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)', line.strip().lower()

break
self.literature_page += 1
self.literature_page += 1
return self.literature_page

def find_literature_vkr(self, work_type):
if not self.literature_header:
Expand Down
Loading