Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/v2' into dev-mose/marker-v2
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 18, 2024
2 parents c479d53 + 3e7c4f3 commit 8c71b35
Show file tree
Hide file tree
Showing 8 changed files with 255 additions and 20 deletions.
12 changes: 7 additions & 5 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


class StructureBuilder(BaseBuilder):
gap_threshold: int = 10
gap_threshold: int = .05

def __init__(self, config=None):
super().__init__(config)
Expand All @@ -18,6 +18,7 @@ def __call__(self, document: Document):
self.group_lists(page)

def group_caption_blocks(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
Expand All @@ -29,18 +30,18 @@ def group_caption_blocks(self, page: PageGroup):
prev_block = page.get_block(prev_block_id)
if all([
prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px
]):
block_structure.insert(0, prev_block_id)
selected_polygons.append(prev_block.polygon)
selected_polygons.append(selected_polygons[0])
else:
break

for j, next_block_id in enumerate(page.structure[i + 1:]):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
]):
block_structure.append(next_block_id)
selected_polygons.append(next_block.polygon)
Expand All @@ -59,6 +60,7 @@ def group_caption_blocks(self, page: PageGroup):
page.remove_structure_items(block_structure)

def group_lists(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.ListItem]:
Expand All @@ -70,7 +72,7 @@ def group_lists(self, page: PageGroup):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type == BlockTypes.ListItem,
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
]):
block_structure.append(next_block_id)
selected_polygons.append(next_block.polygon)
Expand Down
13 changes: 12 additions & 1 deletion marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
from marker.v2.schema.registry import BLOCK_REGISTRY
from marker.v2.processors.debug import DebugProcessor


class PdfConverter(BaseConverter):
Expand Down Expand Up @@ -58,19 +59,29 @@ def __call__(self, filepath: str):
section_header_processor = SectionHeaderProcessor(self.config)
section_header_processor(document)

debug_processor = DebugProcessor(self.config)
debug_processor(document)

renderer = MarkdownRenderer(self.config)
return renderer(document)


@click.command()
@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
@click.option("--fname", type=str, default="adversarial.pdf")
def main(output: str, fname: str):
@click.option("--debug", is_flag=True)
def main(output: str, fname: str, debug: bool):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index(fname)
out_filename = fname.rsplit(".", 1)[0] + ".md"
os.makedirs(output, exist_ok=True)

config = {}
if debug:
config["debug_pdf_images"] = True
config["debug_layout_images"] = True
config["debug_json"] = True

with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()
Expand Down
148 changes: 148 additions & 0 deletions marker/v2/processors/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import json
import os

import requests
from PIL import Image, ImageDraw, ImageFont

from marker.settings import settings
from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class DebugProcessor(BaseProcessor):
    """Dump debugging artifacts (annotated page PNGs and a JSON block dump).

    Each kind of output is gated by its own boolean flag; all flags default to
    ``False``. Outputs for a document are written under
    ``<debug_data_folder>/<document file name without extension>``.
    """

    block_types = tuple()
    # Root folder that holds one sub-folder of debug output per document.
    debug_data_folder: str = "debug_data"
    # Write layout_page_<n>.png: boxes and text drawn on a blank white canvas.
    debug_layout_images: bool = False
    # Write pdf_page_<n>.png: boxes drawn on top of the page's high-res image.
    debug_pdf_images: bool = False
    # Write blocks.json with the dumped data of every page.
    debug_json: bool = False
    # Local path of the font used for labels; downloaded on demand if missing.
    render_font: str = os.path.join(settings.FONT_DIR, "GoNotoCurrent-Regular.ttf")
    # Base URL the font file name is appended to when downloading.
    font_dl_path: str = "https://github.com/satbyy/go-noto-universal/releases/download/v7.0"

    def __call__(self, document: Document):
        """Write every enabled debug output for ``document``.

        Sets ``self.debug_folder`` as a side effect. Nothing is created on
        disk when all debug flags are off.
        """
        # Remove extension from doc name
        doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0]
        self.debug_folder = os.path.join(self.debug_data_folder, doc_base)

        # This processor may be run unconditionally by a converter, so avoid
        # leaving empty debug folders behind when debugging is not requested.
        if not (self.debug_layout_images or self.debug_pdf_images or self.debug_json):
            return

        os.makedirs(self.debug_folder, exist_ok=True)

        if self.debug_layout_images:
            self.draw_layout_debug_images(document)
            print(f"Dumped layout debug images to {self.debug_folder}")

        if self.debug_pdf_images:
            self.draw_layout_debug_images(document, pdf_mode=True)
            print(f"Dumped PDF debug images to {self.debug_folder}")

        if self.debug_json:
            self.dump_block_debug_data(document)
            print(f"Dumped block debug data to {self.debug_folder}")

    def draw_layout_debug_images(self, document: Document, pdf_mode=False):
        """Render one annotated PNG per page into ``self.debug_folder``.

        Args:
            document: Document whose pages are rendered.
            pdf_mode: When true, draw on a copy of ``page.highres_image`` and
                omit the line text; otherwise draw on a white canvas and
                write each line's raw text at its position.
        """
        for idx, page in enumerate(document.pages):
            img_size = page.highres_image.size
            if pdf_mode:
                png_image = page.highres_image.copy()
            else:
                png_image = Image.new("RGB", img_size, color="white")

            line_blocks = [
                child for child in page.children
                if child.block_type == BlockTypes.Line
            ]
            line_bboxes = [
                child.polygon.rescale(page.polygon.size, img_size).bbox
                for child in line_blocks
            ]
            # The page image already shows the text in PDF mode, so the line
            # text is neither drawn nor extracted there.
            line_text = None if pdf_mode else [
                child.raw_text(document) for child in line_blocks
            ]
            self.render_on_image(
                line_bboxes,
                png_image,
                labels=line_text,
                color="black",
                draw_bbox=False,
                label_font_size=24
            )

            layout_blocks = [
                child for child in page.children
                if child.block_type not in (BlockTypes.Line, BlockTypes.Span)
            ]
            layout_bboxes = [
                child.polygon.rescale(page.polygon.size, img_size).bbox
                for child in layout_blocks
            ]
            layout_labels = [str(child.block_type) for child in layout_blocks]
            self.render_on_image(
                layout_bboxes,
                png_image,
                labels=layout_labels,
                color="red",
                label_font_size=24
            )

            # Second pass over the same boxes: label each with its index in
            # page.children order (after filtering), without redrawing boxes.
            order_labels = [str(i) for i in range(len(layout_bboxes))]
            self.render_on_image(
                layout_bboxes,
                png_image,
                labels=order_labels,
                color="green",
                draw_bbox=False,
                label_offset=5
            )

            filecomp = "pdf" if pdf_mode else "layout"
            debug_file = os.path.join(self.debug_folder, f"{filecomp}_page_{idx}.png")
            png_image.save(debug_file)

    def dump_block_debug_data(self, document: Document):
        """Write ``blocks.json`` with every page's dumped data, images excluded."""
        debug_file = os.path.join(self.debug_folder, "blocks.json")
        debug_data = [
            page.model_dump(exclude=["lowres_image", "highres_image"])
            for page in document.pages
        ]

        with open(debug_file, "w+") as f:
            json.dump(debug_data, f)

    def get_font_path(self) -> str:
        """Return the path of the label font, downloading it first if missing.

        The download is streamed to a temporary sibling file and moved into
        place only after it completes, so a failed or interrupted download
        never leaves a truncated file at ``render_font`` that a later call
        would mistake for a usable font.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        if not os.path.exists(self.render_font):
            os.makedirs(os.path.dirname(self.render_font), exist_ok=True)
            font_dl_path = f"{self.font_dl_path}/{os.path.basename(self.render_font)}"
            partial_path = f"{self.render_font}.part"
            try:
                with requests.get(font_dl_path, stream=True) as r:
                    # Check the status before any file is created.
                    r.raise_for_status()
                    with open(partial_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, self.render_font)
            finally:
                # Only present here if the download failed part-way through.
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        return self.render_font

    def get_text_size(self, text, font):
        """Return ``(width, height)`` of ``text`` drawn with ``font`` at the origin."""
        # A throwaway zero-sized image is enough to obtain a drawing context.
        im = Image.new(mode="P", size=(0, 0))
        draw = ImageDraw.Draw(im)
        _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
        return width, height

    def render_on_image(
        self,
        bboxes,
        image,
        labels=None,
        label_offset=1,
        label_font_size=10,
        color: str | list = 'red',
        draw_bbox=True
    ):
        """Draw boxes and/or text labels onto ``image`` in place.

        Args:
            bboxes: Iterable of 4-value boxes; values are truncated to int.
            image: PIL image to draw on (modified in place).
            labels: Optional sequence indexed in parallel with ``bboxes``.
            label_offset: Pixel offset of the label from the box's first corner.
            label_font_size: Font size used for the labels.
            color: One color for everything, or a list indexed per box.
            draw_bbox: Whether to outline each box.

        Returns:
            The same ``image`` object that was passed in.
        """
        draw = ImageDraw.Draw(image)
        font_path = self.get_font_path()
        label_font = ImageFont.truetype(font_path, label_font_size)

        for i, bbox in enumerate(bboxes):
            bbox = [int(p) for p in bbox]
            item_color = color[i] if isinstance(color, list) else color
            if draw_bbox:
                draw.rectangle(bbox, outline=item_color, width=1)

            if labels is None:
                continue

            label = labels[i]
            text_position = (
                bbox[0] + label_offset,
                bbox[1] + label_offset
            )
            text_size = self.get_text_size(label, label_font)
            # Nothing visible to draw (e.g. empty label text).
            if text_size[0] <= 0 or text_size[1] <= 0:
                continue
            box_position = (
                text_position[0],
                text_position[1],
                text_position[0] + text_size[0],
                text_position[1] + text_size[1]
            )
            # White backing rectangle keeps the label readable over content.
            draw.rectangle(box_position, fill="white")
            draw.text(
                text_position,
                label,
                fill=item_color,
                font=label_font
            )

        return image
48 changes: 48 additions & 0 deletions marker/v2/processors/ignoretext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from collections import Counter

from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class IgnoreTextProcessor(BaseProcessor):
    """Flag text that repeats at the start or end of many pages as header/footer.

    For every page the first and the last child whose type is in
    ``block_types`` are collected; a collected block is marked with
    ``is_header_footer = True`` when its raw text occurs in more than
    ``common_element_threshold`` of the collected blocks.
    """

    block_types = (BlockTypes.Text,)
    # Fraction of collected blocks a text must exceed to count as repeated.
    common_element_threshold = .6
    max_blocks = 1

    def __call__(self, document: Document):
        """Collect per-page first/last matching blocks and flag repeated ones."""
        first_blocks = []
        last_blocks = []
        for page in document.pages:
            # Track the last *matching* block explicitly: reading the loop
            # variable after iterating page.children would pick up the page's
            # final child even when its type is not in block_types.
            matching = [
                block for block in page.children
                if block.block_type in self.block_types
            ]
            if not matching:
                continue

            first_blocks.append(matching[0])
            last_blocks.append(matching[-1])

        self.filter_common_elements(document, first_blocks)
        self.filter_common_elements(document, last_blocks)

    def filter_common_elements(self, document, lines):
        """Mark blocks in ``lines`` whose raw text repeats across most of them.

        Args:
            document: Document passed through to each block's ``raw_text``.
            lines: Blocks to compare, one candidate per page.

        Returns:
            ``[]`` when fewer than three blocks are given; otherwise ``None``
            (matching blocks are updated in place).
        """
        # We can't filter if we don't have enough pages to find common elements
        if len(lines) < 3:
            return []

        # Extract each block's text once and reuse it for counting and flagging.
        texts = [b.raw_text(document) for b in lines]
        cutoff = len(lines) * self.common_element_threshold
        common = {text for text, count in Counter(texts).items() if count > cutoff}
        for b, text in zip(lines, texts):
            if text in common:
                b.is_header_footer = True
3 changes: 3 additions & 0 deletions marker/v2/schema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ class BlockTypes(Enum):
Text = auto()
TableOfContents = auto()
Document = auto()

def __str__(self):
    # Return only the member name, so str(BlockTypes.Text) is "Text" rather
    # than Enum's default "BlockTypes.Text".
    return self.name
16 changes: 11 additions & 5 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
from marker.v2.schema.blocks import Block


def replace_bullets(text):
def replace_bullets(child_blocks):
# Replace bullet characters with a -
bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
return replaced_string
first_block = None
while len(child_blocks) > 0:
first_block = child_blocks[0]
child_blocks = first_block.children

if first_block.id.block_type == BlockTypes.Line:
bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○■▪▫–—-]( )"
first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html)


class ListItem(Block):
Expand All @@ -17,5 +22,6 @@ class ListItem(Block):
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
template = replace_bullets(template)
# Remove the first bullet character
replace_bullets(child_blocks)
return f"<li>{template}</li>"
1 change: 1 addition & 0 deletions marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Text(Block):
block_type: BlockTypes = BlockTypes.Text

Expand Down
34 changes: 25 additions & 9 deletions marker/v2/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,31 @@ def minimum_gap(self, other: PolygonBox):
if self.intersection_pct(other) > 0:
return 0

x_dist = min(abs(self.bbox[0] - other.bbox[2]), abs(self.bbox[2] - other.bbox[0]))
y_dist = min(abs(self.bbox[1] - other.bbox[3]), abs(self.bbox[3] - other.bbox[1]))

if x_dist == 0 or self.overlap_x(other) > 0:
return y_dist
if y_dist == 0 or self.overlap_y(other) > 0:
return x_dist

return (x_dist ** 2 + y_dist ** 2) ** 0.5
def dist(p1, p2):
return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5

left = other.bbox[2] < self.bbox[0]
right = self.bbox[2] < other.bbox[0]
bottom = other.bbox[3] < self.bbox[1]
top = self.bbox[3] < other.bbox[1]
if top and left:
return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1]))
elif left and bottom:
return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3]))
elif bottom and right:
return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3]))
elif right and top:
return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1]))
elif left:
return self.bbox[0] - other.bbox[2]
elif right:
return other.bbox[0] - self.bbox[2]
elif bottom:
return self.bbox[1] - other.bbox[3]
elif top:
return other.bbox[1] - self.bbox[3]
else:
return 0

def center_distance(self, other: PolygonBox):
    """Return the Euclidean distance between this box's center and ``other``'s."""
    delta_x = self.center[0] - other.center[0]
    delta_y = self.center[1] - other.center[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return squared_length ** 0.5
Expand Down

0 comments on commit 8c71b35

Please sign in to comment.