-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
466 criteria pack for md test #471
Changes from 15 commits
423c3bf
be66bf9
e20e27a
595487c
45c78d4
dc92232
fc87018
7f666ce
9510244
3c0ee06
3ae0696
5c7074b
21a7fb2
568b6eb
0d408dd
0940f2a
541cd03
a87c53b
c95b2b5
e0489c3
371b283
9aa2130
928bf03
6245561
905295d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000): | |
self.headers = [] | ||
self.literature_header = [] | ||
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' | ||
self.md_name_pattern = r"<h2>(Список использованных источников|Список использованной литературы)<\/h2>" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Почему тут другой паттерн? https://github.com/moevm/mse_auto_checking_slides_vaganov/pull/471/files#diff-4a9d734369983549e0972bd59f7620e6ec4e30bed37af49ced2fcdc06ab558fbR16 Возможно, его (их / другие regexp) стоит вынести в класс файла (или uploader'а), чтобы использовать в нужных местах проверок и не теряться в их обилии |
||
self.min_ref = min_ref | ||
self.max_ref = max_ref | ||
|
||
|
@@ -77,7 +78,10 @@ def check(self): | |
def search_references(self, start_par): | ||
array_of_references = set() | ||
for i in range(0, start_par): | ||
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) | ||
if isinstance(self.file.paragraphs[i], str): | ||
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i]) | ||
else: | ||
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А в There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Исправила на "paragraph_text", работает корректно |
||
if detected_references: | ||
for reference in detected_references: | ||
for one_part in re.split(r'[\[\],]', reference): | ||
|
@@ -86,16 +90,22 @@ def search_references(self, start_par): | |
for k in range(int(start), int(end) + 1): | ||
array_of_references.add(k) | ||
elif one_part != '': | ||
array_of_references.add(int(one_part)) | ||
array_of_references.add(int(one_part)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Пустые изменения There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Исправила |
||
return array_of_references | ||
|
||
def find_start_paragraph(self): | ||
start_index = 0 | ||
for i in range(len(self.file.paragraphs)): | ||
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] | ||
if re.fullmatch(self.name_pattern, text_string): | ||
start_index = i | ||
break | ||
if isinstance(self.file.paragraphs[i], str): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Исправила, работает корректно |
||
text_string = self.file.paragraphs[i].lower() | ||
if re.fullmatch(self.md_name_pattern, text_string): | ||
start_index = i | ||
break | ||
else: | ||
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] | ||
if re.fullmatch(self.name_pattern, text_string): | ||
start_index = i | ||
break | ||
return start_index | ||
|
||
def count_sources_vkr(self, header): | ||
|
@@ -142,4 +152,4 @@ def search_literature_start_pdf(self): | |
if re.search('приложение а[\n .]', lowercase_str): | ||
end_page = i | ||
break | ||
return start_page, end_page | ||
return start_page, end_page |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,7 +63,10 @@ def check(self): | |
def search_references(self): | ||
array_of_references = set() | ||
for i in range(0, self.last_child_number): | ||
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) | ||
if isinstance(self.file.paragraphs[i], str): | ||
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i]) | ||
else: | ||
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Аналогично про
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Оставила "paragraph_text" |
||
if detected_references: | ||
for reference in detected_references: | ||
for one_part in re.split(r'таблиц[аеыу]| ', reference): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
class DocumentUploader(ABC): | ||
|
||
@abstractmethod | ||
def upload(self): | ||
pass | ||
|
||
@abstractmethod | ||
def parse(self): | ||
pass | ||
|
||
@abstractmethod | ||
def parse_effective_styles(self): | ||
pass | ||
|
||
@abstractmethod | ||
def page_counter(self): | ||
pass | ||
|
||
@abstractmethod | ||
def make_chapters(self, work_type): | ||
pass | ||
|
||
@abstractmethod | ||
def find_header_page(self, work_type): | ||
pass | ||
|
||
@abstractmethod | ||
def find_literature_vkr(self, work_type): | ||
pass |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,9 +9,10 @@ | |
from .style import Style | ||
from .table import Table, Cell | ||
from ..pdf_document.pdf_document_manager import PdfDocumentManager | ||
from ..document_uploader import DocumentUploader | ||
|
||
|
||
class DocxUploader: | ||
class DocxUploader(DocumentUploader): | ||
def __init__(self): | ||
self.inline_shapes = [] | ||
self.core_properties = None | ||
|
@@ -27,6 +28,7 @@ def __init__(self): | |
self.first_lines = [] | ||
self.literature_header = [] | ||
self.headers_page = 0 | ||
self.literature_page = 0 | ||
|
||
def upload(self, file, pdf_filepath=''): | ||
self.file = docx.Document(file) | ||
|
@@ -122,6 +124,16 @@ def find_header_page(self, work_type): | |
self.headers_page = header["page"] | ||
break | ||
return self.headers_page | ||
|
||
def find_literature_page(self, work_type): | ||
if not self.literature_page: | ||
for k, v in self.pdf_file.text_on_page.items(): | ||
line = v[:40] if len(v) > 21 else v | ||
if re.search('СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ', line.strip()): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. В некоторых критериях используется regexp с более широким спектром возможных вариаций названия - а почему тут в таком виде? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Исправлено на более широкое выражение: |
||
break | ||
self.literature_page += 1 | ||
self.literature_page += 1 | ||
return self.literature_page | ||
|
||
def find_literature_vkr(self, work_type): | ||
if not self.literature_header: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
А что будет, если загрузить не md-файл, простую ВКР в docx? это ведь общий критерий
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Удалила ненужный паттерн для md