From 20fec484d67d7744a855f291fa3bfe6796fd699a Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 19 Nov 2024 12:31:14 +0000 Subject: [PATCH 1/3] fix typo [skip ci] --- tests/schema/groups/test_list_grouping.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/schema/groups/test_list_grouping.py b/tests/schema/groups/test_list_grouping.py index 1a50cc4..27a60f7 100644 --- a/tests/schema/groups/test_list_grouping.py +++ b/tests/schema/groups/test_list_grouping.py @@ -9,11 +9,10 @@ def test_list_grouping(pdf_document): structure = StructureBuilder() structure(pdf_document) - page = pdf_document.pags[0] + page = pdf_document.pages[0] list_groups = [] for block in page.children: if block.block_type == BlockTypes.ListGroup: list_groups.append(block) assert len(list_groups) == 1 - From d4564acaf06f8403995c32d7d6971f815aa038ec Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 19 Nov 2024 12:45:06 +0000 Subject: [PATCH 2/3] fix block_type in BaseProcessor [skip ci] --- marker/v2/processors/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marker/v2/processors/__init__.py b/marker/v2/processors/__init__.py index 2794772..f9bcc43 100644 --- a/marker/v2/processors/__init__.py +++ b/marker/v2/processors/__init__.py @@ -8,7 +8,7 @@ class BaseProcessor: - block_types: Tuple[str] | None = None # What block types this processor is responsible for + block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) From efcfc24d53ac083534591e8053ff5574be7567f3 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Tue, 19 Nov 2024 14:22:12 +0000 Subject: [PATCH 3/3] fixes to debug processors and pdf converter [skip ci] --- marker/v2/converters/pdf.py | 20 +++++++++----------- marker/v2/processors/debug.py | 9 +++++---- marker/v2/processors/sectionheader.py | 5 +++-- marker/v2/schema/__init__.py | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 55caad9..604bc26 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -50,17 +50,15 @@ def __call__(self, filepath: str): document = DocumentBuilder(self.config)(pdf_provider, layout_builder, ocr_builder) StructureBuilder(self.config)(document) - equation_processor = EquationProcessor(self.texify_model, self.config) - equation_processor(document) + processor_list = [ + EquationProcessor(self.texify_model, self.config), + TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config), + SectionHeaderProcessor(self.config), + DebugProcessor(self.config), + ] - table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model, self.config) - table_processor(document) - - section_header_processor = SectionHeaderProcessor(self.config) - section_header_processor(document) - - debug_processor = DebugProcessor(self.config) - debug_processor(document) + for processor in processor_list: + processor(document) renderer = MarkdownRenderer(self.config) return renderer(document) @@ -86,7 +84,7 @@ def main(output: str, fname: str, debug: bool): temp_pdf.write(dataset['pdf'][idx]) temp_pdf.flush() - converter = PdfConverter() + converter = PdfConverter(config) rendered = converter(temp_pdf.name) with open(os.path.join(output, out_filename), "w+") as f: diff --git a/marker/v2/processors/debug.py b/marker/v2/processors/debug.py index 91f8f34..9324606 100644 --- a/marker/v2/processors/debug.py +++ b/marker/v2/processors/debug.py @@ -23,7 +23,8 @@ def __call__(self, document: Document): # Remove extension from doc name doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0] self.debug_folder = os.path.join(self.debug_data_folder, doc_base) - os.makedirs(self.debug_folder, exist_ok=True) + if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]): + os.makedirs(self.debug_folder, exist_ok=True) if self.debug_layout_images: self.draw_layout_debug_images(document) @@ -37,7 +38,7 @@ def __call__(self, document: Document): self.dump_block_debug_data(document) print(f"Dumped block debug data to {self.debug_data_folder}") - def draw_layout_debug_images(self, document: Document, pdf_mode = False): + def draw_layout_debug_images(self, document: Document, pdf_mode=False): for idx, page in enumerate(document.pages): img_size = page.highres_image.size png_image = Image.new("RGB", img_size, color="white") @@ -112,7 +113,7 @@ def get_text_size(self, text, font): _, _, width, height = draw.textbbox((0, 0), text=text, font=font) return width, height - def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True): + def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list = 'red', draw_bbox=True): draw = ImageDraw.Draw(image) font_path = self.get_font_path() label_font = ImageFont.truetype(font_path, label_font_size) @@ -145,4 +146,4 @@ def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font font=label_font ) - return image \ No newline at end of file + return image diff --git a/marker/v2/processors/sectionheader.py b/marker/v2/processors/sectionheader.py index a100495..ea9cc74 100644 --- a/marker/v2/processors/sectionheader.py +++ b/marker/v2/processors/sectionheader.py @@ -2,6 +2,7 @@ from marker.v2.schema import BlockTypes from marker.v2.schema.document import Document +from typing import Dict, List import numpy as np from sklearn.cluster import KMeans from sklearn.exceptions import ConvergenceWarning @@ -19,7 +20,7 @@ class SectionHeaderProcessor(BaseProcessor): height_tolerance = .99 def __call__(self, document: Document): - line_heights = {} + line_heights: Dict[int, List[float]] = {} for page in document.pages: for block in page.children: if block.block_type not in self.block_types: @@ -46,7 +47,7 @@ def __call__(self, document: Document): if block.heading_level is None: block.heading_level = self.default_level - def bucket_headings(self, line_heights, num_levels=4): + def bucket_headings(self, line_heights: List[float], num_levels=4): if len(line_heights) <= self.level_count: return [] diff --git a/marker/v2/schema/__init__.py b/marker/v2/schema/__init__.py index 28046c2..7312c0f 100644 --- a/marker/v2/schema/__init__.py +++ b/marker/v2/schema/__init__.py @@ -1,7 +1,7 @@ -from enum import Enum, auto +from enum import auto, IntEnum -class BlockTypes(Enum): +class BlockTypes(IntEnum): Line = auto() Span = auto() FigureGroup = auto()