Merge pull request #367 from VikParuchuri/vik_v2

Add debug utils, fix output quality issues
VikParuchuri · Nov 18, 2024 · 3e7c4f3 · 3e7c4f3
2 parents 3dbbf74 + 8bd872b
commit 3e7c4f3
Show file tree

Hide file tree

Showing 8 changed files with 256 additions and 22 deletions.
diff --git a/marker/v2/builders/structure.py b/marker/v2/builders/structure.py
@@ -10,7 +10,7 @@
 
 
 class StructureBuilder(BaseBuilder):
-    gap_threshold: int = 10
+    gap_threshold: float = .05
 
     def __init__(self, config=None):
         super().__init__(config)
@@ -21,6 +21,7 @@ def __call__(self, document: Document):
             self.group_lists(page)
 
     def group_caption_blocks(self, page: PageGroup):
+        gap_threshold_px = self.gap_threshold * page.polygon.height
         for i, block_id in enumerate(page.structure):
             block = page.get_block(block_id)
             if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
@@ -32,18 +33,18 @@ def group_caption_blocks(self, page: PageGroup):
                 prev_block = page.get_block(prev_block_id)
                 if all([
                     prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
-                    prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px
                 ]):
                     block_structure.insert(0, prev_block_id)
                     selected_polygons.append(prev_block.polygon)
                 else:
                     break
 
             for j, next_block_id in enumerate(page.structure[i + 1:]):
                 next_block = page.get_block(next_block_id)
                 if all([
                     next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
-                    next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                 ]):
                     block_structure.append(next_block_id)
                     selected_polygons.append(next_block.polygon)
@@ -62,6 +63,7 @@ def group_caption_blocks(self, page: PageGroup):
                 page.remove_structure_items(block_structure)
 
     def group_lists(self, page: PageGroup):
+        gap_threshold_px = self.gap_threshold * page.polygon.height
         for i, block_id in enumerate(page.structure):
             block = page.get_block(block_id)
             if block.block_type not in [BlockTypes.ListItem]:
@@ -73,7 +75,7 @@ def group_lists(self, page: PageGroup):
                 next_block = page.get_block(next_block_id)
                 if all([
                     next_block.block_type == BlockTypes.ListItem,
-                    next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
+                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                 ]):
                     block_structure.append(next_block_id)
                     selected_polygons.append(next_block.polygon)

diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
@@ -1,14 +1,13 @@
 import os
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
 
 from marker.v2.processors.sectionheader import SectionHeaderProcessor
 from marker.v2.providers.pdf import PdfProvider
 import tempfile
-from typing import List, Optional
 
 import click
 import datasets
-from pydantic import BaseModel
 
 from marker.v2.builders.document import DocumentBuilder
 from marker.v2.builders.layout import LayoutBuilder
@@ -20,6 +19,7 @@
 from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
     setup_detection_model
 from marker.v2.renderers.markdown import MarkdownRenderer
+from marker.v2.processors.debug import DebugProcessor
 
 
 class PdfConverter(BaseConverter):
@@ -49,19 +49,29 @@ def __call__(self, filepath: str):
         section_header_processor = SectionHeaderProcessor(self.config)
         section_header_processor(document)
 
+        debug_processor = DebugProcessor(self.config)
+        debug_processor(document)
+
         renderer = MarkdownRenderer(self.config)
         return renderer(document)
 
 
 @click.command()
 @click.option("--output", type=click.Path(exists=False), required=False, default="temp")
 @click.option("--fname", type=str, default="adversarial.pdf")
-def main(output: str, fname: str):
+@click.option("--debug", is_flag=True)
+def main(output: str, fname: str, debug: bool):
     dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
     idx = dataset['filename'].index(fname)
     out_filename = fname.rsplit(".", 1)[0] + ".md"
     os.makedirs(output, exist_ok=True)
 
+    config = {}
+    if debug:
+        config["debug_pdf_images"] = True
+        config["debug_layout_images"] = True
+        config["debug_json"] = True
+
     with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
         temp_pdf.write(dataset['pdf'][idx])
         temp_pdf.flush()

diff --git a/marker/v2/processors/debug.py b/marker/v2/processors/debug.py
@@ -0,0 +1,148 @@
+import json
+import os
+
+import requests
+from PIL import Image, ImageDraw, ImageFont
+
+from marker.settings import settings
+from marker.v2.processors import BaseProcessor
+from marker.v2.schema import BlockTypes
+from marker.v2.schema.document import Document
+
+
+class DebugProcessor(BaseProcessor):
+    block_types = tuple()
+    debug_data_folder: str = "debug_data"
+    debug_layout_images: bool = False
+    debug_pdf_images: bool = False
+    debug_json: bool = False
+    render_font: str = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
+    font_dl_path: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"
+
+    def __call__(self, document: Document):
+        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]  # doc name without extension
+        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)
+        if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]):
+            os.makedirs(self.debug_folder, exist_ok=True)  # no folder unless some debug output is on
+
+        if self.debug_layout_images:
+            self.draw_layout_debug_images(document)
+            print(f"Dumped layout debug images to {self.debug_data_folder}")
+
+        if self.debug_pdf_images:
+            self.draw_layout_debug_images(document, pdf_mode=True)
+            print(f"Dumped PDF debug images to {self.debug_data_folder}")
+
+        if self.debug_json:
+            self.dump_block_debug_data(document)
+            print(f"Dumped block debug data to {self.debug_data_folder}")
+
+    def draw_layout_debug_images(self, document: Document, pdf_mode = False):
+        for idx, page in enumerate(document.pages):
+            img_size = page.highres_image.size
+            png_image = Image.new("RGB", img_size, color="white")
+            if pdf_mode:
+                png_image = page.highres_image.copy()
+
+            line_bboxes = []
+            line_text = []
+            for child in page.children:
+                if child.block_type != BlockTypes.Line:
+                    continue
+
+                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
+                line_bboxes.append(bbox)
+                line_text.append(child.raw_text(document))
+
+            if pdf_mode:
+                line_text = None
+
+            self.render_on_image(line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False, label_font_size=24)
+
+            layout_bboxes = []
+            layout_labels = []
+            for child in page.children:
+                if child.block_type in [BlockTypes.Line, BlockTypes.Span]:
+                    continue
+
+                bbox = child.polygon.rescale(page.polygon.size, img_size).bbox
+                layout_bboxes.append(bbox)
+                layout_labels.append(str(child.block_type))
+
+            self.render_on_image(layout_bboxes, png_image, labels=layout_labels, color="red", label_font_size=24)
+
+            order_labels = [str(i) for i in range(len(layout_bboxes))]
+            self.render_on_image(
+                layout_bboxes,
+                png_image,
+                labels=order_labels,
+                color="green",
+                draw_bbox=False,
+                label_offset=5
+            )
+
+            filecomp = "pdf" if pdf_mode else "layout"
+            debug_file = os.path.join(self.debug_folder, f"{filecomp}_page_{idx}.png")
+            png_image.save(debug_file)
+
+    def dump_block_debug_data(self, document: Document):
+        debug_file = os.path.join(self.debug_folder, f"blocks.json")
+        debug_data = []
+        for idx, page in enumerate(document.pages):
+            page_data = page.model_dump(exclude=["lowres_image", "highres_image"])
+            debug_data.append(page_data)
+
+        with open(debug_file, "w+") as f:
+            json.dump(debug_data, f)
+
+    def get_font_path(self) -> str:
+        if not os.path.exists(self.render_font):
+            os.makedirs(os.path.dirname(self.render_font), exist_ok=True)
+            font_dl_path = f"{self.font_dl_path}/{os.path.basename(self.render_font)}"
+            with requests.get(font_dl_path, stream=True) as r:
+                r.raise_for_status()  # check the HTTP status before any file is created
+                with open(f"{self.render_font}.part", 'wb') as f:
+                    f.writelines(r.iter_content(chunk_size=8192))
+            os.replace(f"{self.render_font}.part", self.render_font)  # publish only a complete download
+        return self.render_font
+
+    def get_text_size(self, text, font):
+        im = Image.new(mode="P", size=(0, 0))
+        draw = ImageDraw.Draw(im)
+        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
+        return width, height
+
+    def render_on_image(self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list='red', draw_bbox=True):
+        draw = ImageDraw.Draw(image)
+        font_path = self.get_font_path()
+        label_font = ImageFont.truetype(font_path, label_font_size)
+
+        for i, bbox in enumerate(bboxes):
+            bbox = [int(p) for p in bbox]
+            if draw_bbox:
+                draw.rectangle(bbox, outline=color[i] if isinstance(color, list) else color, width=1)
+
+            if labels is not None:
+                label = labels[i]
+                text_position = (
+                    bbox[0] + label_offset,
+                    bbox[1] + label_offset
+                )
+                text_size = self.get_text_size(label, label_font)
+                if text_size[0] <= 0 or text_size[1] <= 0:
+                    continue
+                box_position = (
+                    text_position[0],
+                    text_position[1],
+                    text_position[0] + text_size[0],
+                    text_position[1] + text_size[1]
+                )
+                draw.rectangle(box_position, fill="white")
+                draw.text(
+                    text_position,
+                    label,
+                    fill=color[i] if isinstance(color, list) else color,
+                    font=label_font
+                )
+
+        return image
diff --git a/marker/v2/processors/ignoretext.py b/marker/v2/processors/ignoretext.py
@@ -0,0 +1,48 @@
+from collections import Counter
+
+from marker.v2.processors import BaseProcessor
+from marker.v2.schema import BlockTypes
+from marker.v2.schema.document import Document
+
+
+class IgnoreTextProcessor(BaseProcessor):
+    block_types = (BlockTypes.Text,)
+    common_element_threshold = .6
+    max_blocks = 1
+
+    def __call__(self, document: Document):
+        first_blocks = []
+        last_blocks = []
+        for page in document.pages:
+            initial_block = None
+            last_block = None
+            for block in page.children:
+                if block.block_type not in self.block_types:
+                    continue
+
+                if initial_block is None:
+                    initial_block = block
+
+                # Only a block that passed the type filter may become the page's last block;
+                # the bare loop variable would also pick up trailing children of other types.
+                last_block = block
+
+            if initial_block is not None:
+                first_blocks.append(initial_block)
+            if last_block is not None:
+                last_blocks.append(last_block)
+
+        self.filter_common_elements(document, first_blocks)
+        self.filter_common_elements(document, last_blocks)
+
+    def filter_common_elements(self, document, lines):
+        # We can't filter if we don't have enough pages to find common elements
+        if len(lines) < 3:
+            return []
+
+        text = [b.raw_text(document) for b in lines]
+        counter = Counter(text)
+        common = [k for k, v in counter.items() if v > len(lines) * self.common_element_threshold]
+        for b in lines:
+            if b.raw_text(document) in common:
+                b.is_header_footer = True
diff --git a/marker/v2/schema/__init__.py b/marker/v2/schema/__init__.py
@@ -26,3 +26,6 @@ class BlockTypes(Enum):
     Text = auto()
     TableOfContents = auto()
     Document = auto()
+
+    def __str__(self):
+        return self.name
diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py
@@ -4,11 +4,16 @@
 from marker.v2.schema.blocks import Block
 
 
-def replace_bullets(text):
-    # Replace bullet characters with a -
-    bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
-    replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
-    return replaced_string
+def replace_bullets(child_blocks):
+    # Strip the bullet from the first (deepest, left-most) child block when it is a Line
+    first_block = None
+    while len(child_blocks) > 0:
+        first_block = child_blocks[0]
+        child_blocks = first_block.children
+
+    if first_block is not None and first_block.id.block_type == BlockTypes.Line:
+        bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )"
+        first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html)
 
 
 class ListItem(Block):
@@ -17,5 +22,6 @@ class ListItem(Block):
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
         template = template.replace("\n", " ")
-        template = replace_bullets(template)
+        # Remove the first bullet character
+        replace_bullets(child_blocks)
         return f"<li>{template}</li>"
diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py
@@ -1,6 +1,7 @@
 from marker.v2.schema import BlockTypes
 from marker.v2.schema.blocks import Block
 
+
 class Text(Block):
     block_type: BlockTypes = BlockTypes.Text
 

diff --git a/marker/v2/schema/polygon.py b/marker/v2/schema/polygon.py
@@ -77,15 +77,31 @@ def minimum_gap(self, other: PolygonBox):
         if self.intersection_pct(other) > 0:
             return 0
 
-        x_dist = min(abs(self.bbox[0] - other.bbox[2]), abs(self.bbox[2] - other.bbox[0]))
-        y_dist = min(abs(self.bbox[1] - other.bbox[3]), abs(self.bbox[3] - other.bbox[1]))
-
-        if x_dist == 0 or self.overlap_x(other) > 0:
-            return y_dist
-        if y_dist == 0 or self.overlap_y(other) > 0:
-            return x_dist
-
-        return (x_dist ** 2 + y_dist ** 2) ** 0.5
+        def dist(p1, p2):
+            return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5
+
+        left = other.bbox[2] < self.bbox[0]
+        right = self.bbox[2] < other.bbox[0]
+        bottom = other.bbox[3] < self.bbox[1]
+        top = self.bbox[3] < other.bbox[1]
+        if top and left:
+            return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1]))
+        elif left and bottom:
+            return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3]))
+        elif bottom and right:
+            return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3]))
+        elif right and top:
+            return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1]))
+        elif left:
+            return self.bbox[0] - other.bbox[2]
+        elif right:
+            return other.bbox[0] - self.bbox[2]
+        elif bottom:
+            return self.bbox[1] - other.bbox[3]
+        elif top:
+            return other.bbox[1] - self.bbox[3]
+        else:
+            return 0
 
     def center_distance(self, other: PolygonBox):
         return ((self.center[0] - other.center[0]) ** 2 + (self.center[1] - other.center[1]) ** 2) ** 0.5