diff --git a/marker/v2/builders/structure.py b/marker/v2/builders/structure.py
index 2b9ac99..90f950c 100644
--- a/marker/v2/builders/structure.py
+++ b/marker/v2/builders/structure.py
@@ -10,7 +10,7 @@
 
 
 class StructureBuilder(BaseBuilder):
-    gap_threshold: int = 10
+    gap_threshold: float = .05
 
     def __init__(self, config=None):
         super().__init__(config)
@@ -21,6 +21,7 @@ def __call__(self, document: Document):
             self.group_lists(page)
 
     def group_caption_blocks(self, page: PageGroup):
+        gap_threshold_px = self.gap_threshold * page.polygon.height
         for i, block_id in enumerate(page.structure):
             block = page.get_block(block_id)
             if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
@@ -32,10 +33,10 @@ def group_caption_blocks(self, page: PageGroup):
                 prev_block = page.get_block(prev_block_id)
                 if all([
                     prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
-                    prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    prev_block.polygon.minimum_gap(selected_polygons[0]) < gap_threshold_px
                 ]):
                     block_structure.insert(0, prev_block_id)
-                    selected_polygons.append(prev_block.polygon)
+                    selected_polygons.insert(0, prev_block.polygon)
                 else:
                     break
 
@@ -43,7 +44,7 @@ def group_caption_blocks(self, page: PageGroup):
                 next_block = page.get_block(next_block_id)
                 if all([
                     next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
-                    next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                 ]):
                     block_structure.append(next_block_id)
                     selected_polygons.append(next_block.polygon)
@@ -62,6 +63,7 @@ def group_caption_blocks(self, page: PageGroup):
                 page.remove_structure_items(block_structure)
 
     def group_lists(self, page: PageGroup):
+        gap_threshold_px = self.gap_threshold * page.polygon.height
         for i, block_id in enumerate(page.structure):
             block = page.get_block(block_id)
             if block.block_type not in [BlockTypes.ListItem]:
@@ -73,7 +75,7 @@ def group_lists(self, page: PageGroup):
                 next_block = page.get_block(next_block_id)
                 if all([
                     next_block.block_type == BlockTypes.ListItem,
-                    next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                 ]):
                     block_structure.append(next_block_id)
                     selected_polygons.append(next_block.polygon)
diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
index cf4fbbf..4713581 100644
--- a/marker/v2/converters/pdf.py
+++ b/marker/v2/converters/pdf.py
@@ -1,14 +1,13 @@
 import os
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
 
 from marker.v2.processors.sectionheader import SectionHeaderProcessor
 from marker.v2.providers.pdf import PdfProvider
 import tempfile
-from typing import List, Optional
 
 import click
 import datasets
-from pydantic import BaseModel
 
 from marker.v2.builders.document import DocumentBuilder
 from marker.v2.builders.layout import LayoutBuilder
@@ -20,6 +19,7 @@
 from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
     setup_detection_model
 from marker.v2.renderers.markdown import MarkdownRenderer
+from marker.v2.processors.debug import DebugProcessor
 
 
 class PdfConverter(BaseConverter):
@@ -49,6 +49,9 @@ def __call__(self, filepath: str):
         section_header_processor = SectionHeaderProcessor(self.config)
         section_header_processor(document)
 
+        debug_processor = DebugProcessor(self.config)
+        debug_processor(document)
+
         renderer = MarkdownRenderer(self.config)
         return renderer(document)
 
@@ -56,12 +59,19 @@ def __call__(self, filepath: str):
 @click.command()
 @click.option("--output", type=click.Path(exists=False), required=False, default="temp")
 @click.option("--fname", type=str, default="adversarial.pdf")
-def main(output: str, fname: str):
+@click.option("--debug", is_flag=True)
+def main(output: str, fname: str, debug: bool):
     dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
     idx = dataset['filename'].index(fname)
     out_filename = fname.rsplit(".", 1)[0] + ".md"
     os.makedirs(output, exist_ok=True)
 
+    config = {}
+    if debug:
+        config["debug_pdf_images"] = True
+        config["debug_layout_images"] = True
+        config["debug_json"] = True
+
     with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
         temp_pdf.write(dataset['pdf'][idx])
         temp_pdf.flush()
diff --git a/marker/v2/processors/debug.py b/marker/v2/processors/debug.py
new file mode 100644
index 0000000..91f8f34
--- /dev/null
+++ b/marker/v2/processors/debug.py
@@ -0,0 +1,170 @@
+import json
+import os
+
+import requests
+from PIL import Image, ImageDraw, ImageFont
+
+from marker.settings import settings
+from marker.v2.processors import BaseProcessor
+from marker.v2.schema import BlockTypes
+from marker.v2.schema.document import Document
+
+
+class DebugProcessor(BaseProcessor):
+    """Write debug artifacts (annotated page images, block JSON) for a document."""
+    block_types = tuple()
+    debug_data_folder: str = "debug_data"
+    debug_layout_images: bool = False
+    debug_pdf_images: bool = False
+    debug_json: bool = False
+    render_font: str = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
+    font_dl_path: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
+
+    def __call__(self, document: Document):
+        """Dump every enabled debug output into a per-document folder."""
+        # Remove extension from doc name
+        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
+        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)
+
+        # Nothing enabled: return before creating an empty output folder
+        if not (self.debug_layout_images or self.debug_pdf_images or self.debug_json):
+            return
+
+        os.makedirs(self.debug_folder, exist_ok=True)
+
+        if self.debug_layout_images:
+            self.draw_layout_debug_images(document)
+            print(f"Dumped layout debug images to {self.debug_data_folder}")
+
+        if self.debug_pdf_images:
+            self.draw_layout_debug_images(document, pdf_mode=True)
+            print(f"Dumped PDF debug images to {self.debug_data_folder}")
+
+        if self.debug_json:
+            self.dump_block_debug_data(document)
+            print(f"Dumped block debug data to {self.debug_data_folder}")
+
+    def draw_layout_debug_images(self, document: Document, pdf_mode: bool = False):
+        """Save one PNG per page with line text, labelled layout boxes and block indices.
+
+        With pdf_mode, the page's high-res image is the background and line text is
+        not drawn; otherwise the background is plain white.
+        """
+        for idx, page in enumerate(document.pages):
+            img_size = page.highres_image.size
+            png_image = Image.new("RGB", img_size, color="white")
+            if pdf_mode:
+                png_image = page.highres_image.copy()
+
+            line_bboxes = []
+            line_text = []
+            for child in page.children:
+                if child.block_type != BlockTypes.Line:
+                    continue
+
+                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
+                line_bboxes.append(bbox)
+                line_text.append(child.raw_text(document))
+
+            if pdf_mode:
+                line_text = None
+
+            self.render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False, label_font_size=24)
+
+            layout_bboxes = []
+            layout_labels = []
+            for child in page.children:
+                if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
+                    continue
+
+                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
+                layout_bboxes.append(bbox)
+                layout_labels.append(str(child.block_type))
+
+            self.render_on_image(layout_bboxes, png_image, labels=layout_labels, color="red", label_font_size=24)
+
+            order_labels = [str(i) for i in range(len(layout_bboxes))]
+            self.render_on_image(
+                layout_bboxes,
+                png_image,
+                labels=order_labels,
+                color="green",
+                draw_bbox=False,
+                label_offset=5
+            )
+
+            filecomp = "pdf" if pdf_mode else "layout"
+            debug_file = os.path.join(self.debug_folder, f"{filecomp}_page_{idx}.png")
+            png_image.save(debug_file)
+
+    def dump_block_debug_data(self, document: Document):
+        """Write the model dump of every page (page images excluded) to blocks.json."""
+        debug_file = os.path.join(self.debug_folder, "blocks.json")
+        debug_data = [
+            page.model_dump(exclude=["lowres_image", "highres_image"])
+            for page in document.pages
+        ]
+
+        with open(debug_file, "w+") as f:
+            json.dump(debug_data, f)
+
+    def get_font_path(self) -> str:
+        """Return the label font path, downloading the font first if it is not on disk."""
+        if not os.path.exists(self.render_font):
+            os.makedirs(os.path.dirname(self.render_font), exist_ok=True)
+            font_dl_path = f"{self.font_dl_path}/{os.path.basename(self.render_font)}"
+            with requests.get(font_dl_path, stream=True) as r:
+                # Check the status before opening the target file, so a failed request
+                # does not leave an empty font file that later calls would treat as present
+                r.raise_for_status()
+                with open(self.render_font, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+        return self.render_font
+
+    def get_text_size(self, text, font):
+        """Return the right and bottom edges of text's bounding box when drawn at (0, 0)."""
+        im = Image.new(mode="P", size=(0, 0))
+        draw = ImageDraw.Draw(im)
+        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
+        return width, height
+
+    def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
+        """Draw bboxes and/or their labels onto image in place and return it.
+
+        color is either one color for everything or a list indexed in step with bboxes.
+        """
+        draw = ImageDraw.Draw(image)
+        font_path = self.get_font_path()
+        label_font = ImageFont.truetype(font_path, label_font_size)
+
+        for i, bbox in enumerate(bboxes):
+            bbox = [int(p) for p in bbox]
+            if draw_bbox:
+                draw.rectangle(bbox, outline=color[i] if isinstance(color, list) else color, width=1)
+
+            if labels is not None:
+                label = labels[i]
+                text_position = (
+                    bbox[0] + label_offset,
+                    bbox[1] + label_offset
+                )
+                text_size = self.get_text_size(label, label_font)
+                if text_size[0] <= 0 or text_size[1] <= 0:
+                    continue
+                box_position = (
+                    text_position[0],
+                    text_position[1],
+                    text_position[0] + text_size[0],
+                    text_position[1] + text_size[1]
+                )
+                draw.rectangle(box_position, fill="white")
+                draw.text(
+                    text_position,
+                    label,
+                    fill=color[i] if isinstance(color, list) else color,
+                    font=label_font
+                )
+
+        return image
\ No newline at end of file
diff --git a/marker/v2/processors/ignoretext.py b/marker/v2/processors/ignoretext.py
new file mode 100644
index 0000000..067cb1a
--- /dev/null
+++ b/marker/v2/processors/ignoretext.py
@@ -0,0 +1,54 @@
+from collections import Counter
+
+from marker.v2.processors import BaseProcessor
+from marker.v2.schema import BlockTypes
+from marker.v2.schema.document import Document
+
+
+class IgnoreTextProcessor(BaseProcessor):
+    """Flag text repeated as the first or last matching block of most pages as header/footer."""
+    block_types = (BlockTypes.Text,)
+    common_element_threshold = .6
+    max_blocks = 1
+
+    def __call__(self, document: Document):
+        """Collect each page's first and last matching block and flag the repeated ones."""
+        first_blocks = []
+        last_blocks = []
+        for page in document.pages:
+            initial_block = None
+            last_block = None
+            for block in page.children:
+                if block.block_type not in self.block_types:
+                    continue
+
+                if initial_block is None:
+                    initial_block = block
+
+                # Track the last block that passed the type filter, not simply
+                # the last child on the page
+                last_block = block
+
+            if initial_block is not None:
+                first_blocks.append(initial_block)
+            if last_block is not None:
+                last_blocks.append(last_block)
+
+        self.filter_common_elements(document, first_blocks)
+        self.filter_common_elements(document, last_blocks)
+
+    def filter_common_elements(self, document, lines):
+        """Set is_header_footer on every block whose raw text is common among lines.
+
+        Text is common when it occurs in more than common_element_threshold of lines.
+        """
+        # We can't filter if we don't have enough pages to find common elements
+        if len(lines) < 3:
+            return []
+
+        text = [b.raw_text(document) for b in lines]
+        counter = Counter(text)
+        common = {k for k, v in counter.items() if v > len(lines) * self.common_element_threshold}
+        for b, b_text in zip(lines, text):
+            if b_text in common:
+                b.is_header_footer = True
diff --git a/marker/v2/schema/__init__.py b/marker/v2/schema/__init__.py
index 9cbd658..28046c2 100644
--- a/marker/v2/schema/__init__.py
+++ b/marker/v2/schema/__init__.py
@@ -26,3 +26,6 @@ class BlockTypes(Enum):
     Text = auto()
     TableOfContents = auto()
     Document = auto()
+
+    def __str__(self):
+        return self.name
diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py
index 932254b..a3d2e5a 100644
--- a/marker/v2/schema/blocks/listitem.py
+++ b/marker/v2/schema/blocks/listitem.py
@@ -4,11 +4,16 @@
 from marker.v2.schema.blocks import Block
 
 
-def replace_bullets(text):
+def replace_bullets(child_blocks):
     # Replace bullet characters with a -
-    bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
-    replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
-    return replaced_string
+    first_block = None
+    while len(child_blocks) > 0:
+        first_block = child_blocks[0]
+        child_blocks = first_block.children
+
+    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
+        bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )"
+        first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html, count=1)
 
 
 class ListItem(Block):
@@ -17,5 +22,6 @@ class ListItem(Block):
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
-        template = replace_bullets(template)
+        # Remove the first bullet character
+        replace_bullets(child_blocks)
         return f"