From be91572245749637bcb37227ab59555196dd778b Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Mon, 18 Nov 2024 06:54:17 -0500
Subject: [PATCH 1/2] Output images, clean up other output formats
---
.gitignore | 1 +
marker/v2/converters/pdf.py | 20 ++++++++++---
marker/v2/processors/__init__.py | 4 +--
marker/v2/processors/equation.py | 4 +--
marker/v2/processors/table.py | 4 +--
marker/v2/renderers/__init__.py | 1 +
marker/v2/renderers/html.py | 43 ++++++++++++++++++++++-----
marker/v2/renderers/markdown.py | 25 ++++++++++++----
marker/v2/schema/blocks/base.py | 25 ++++++++++++++++
marker/v2/schema/blocks/equation.py | 2 +-
marker/v2/schema/blocks/figure.py | 2 +-
marker/v2/schema/blocks/form.py | 9 ++++++
marker/v2/schema/blocks/pagefooter.py | 5 ++++
marker/v2/schema/blocks/pageheader.py | 5 ++++
marker/v2/schema/blocks/picture.py | 2 +-
marker/v2/schema/blocks/text.py | 1 -
marker/v2/schema/blocks/toc.py | 9 ++++++
marker/v2/schema/document.py | 7 ++++-
marker/v2/schema/groups/list.py | 2 +-
marker/v2/schema/text/line.py | 3 +-
20 files changed, 144 insertions(+), 30 deletions(-)
diff --git a/.gitignore b/.gitignore
index 0c6bc44..36d4690 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ report.json
benchmark_data
debug_data
temp.md
+temp
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
index 8627a36..7e045f9 100644
--- a/marker/v2/converters/pdf.py
+++ b/marker/v2/converters/pdf.py
@@ -1,6 +1,8 @@
+import os
import tempfile
from typing import List, Optional
+import click
import datasets
from pydantic import BaseModel
@@ -43,9 +45,14 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
return renderer(document)
-if __name__ == "__main__":
+@click.command()
+@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
+@click.option("--fname", type=str, default="adversarial.pdf")
+def main(output: str, fname: str):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
- idx = dataset['filename'].index('adversarial.pdf')
+ idx = dataset['filename'].index(fname)
+ out_filename = fname.rsplit(".", 1)[0] + ".md"
+ os.makedirs(output, exist_ok=True)
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
temp_pdf.write(dataset['pdf'][idx])
@@ -54,7 +61,12 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
converter = PdfConverter()
rendered = converter(temp_pdf.name)
- with open("temp.md", "w+") as f:
- f.write(rendered)
+ with open(os.path.join(output, out_filename), "w+") as f:
+ f.write(rendered.markdown)
+
+ for img_name, img in rendered.images.items():
+ img.save(os.path.join(output, img_name))
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/marker/v2/processors/__init__.py b/marker/v2/processors/__init__.py
index 53dde66..caeef85 100644
--- a/marker/v2/processors/__init__.py
+++ b/marker/v2/processors/__init__.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Tuple
from pydantic import BaseModel
@@ -7,7 +7,7 @@
class BaseProcessor:
- block_type: str | None = None # What block type this processor is responsible for
+ block_types: Tuple[str] | None = None # What block types this processor is responsible for
def __init__(self, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
diff --git a/marker/v2/processors/equation.py b/marker/v2/processors/equation.py
index e3988dd..c424f5d 100644
--- a/marker/v2/processors/equation.py
+++ b/marker/v2/processors/equation.py
@@ -11,7 +11,7 @@
class EquationProcessor(BaseProcessor):
- block_type = "Equation"
+ block_types = ("Equation", )
model_max_length = 384
batch_size = None
token_buffer = 256
@@ -26,7 +26,7 @@ def __call__(self, document: Document):
for page in document.pages:
for block in page.children:
- if block.block_type != self.block_type:
+ if block.block_type not in self.block_types:
continue
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py
index 32c114c..31e7a2d 100644
--- a/marker/v2/processors/table.py
+++ b/marker/v2/processors/table.py
@@ -12,7 +12,7 @@
class TableProcessor(BaseProcessor):
- block_type = BlockTypes.Table
+ block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
detect_boxes = False
detector_batch_size = None
table_rec_batch_size = None
@@ -31,7 +31,7 @@ def __call__(self, document: Document):
table_data = []
for page in document.pages:
for block in page.children:
- if block.block_type != self.block_type:
+ if block.block_type not in self.block_types:
continue
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py
index 5288213..eda11a8 100644
--- a/marker/v2/renderers/__init__.py
+++ b/marker/v2/renderers/__init__.py
@@ -3,6 +3,7 @@
from pydantic import BaseModel
+
class BaseRenderer:
block_type: str | None = None
diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py
index c2f3743..703acc8 100644
--- a/marker/v2/renderers/html.py
+++ b/marker/v2/renderers/html.py
@@ -1,33 +1,60 @@
from bs4 import BeautifulSoup
+from pydantic import BaseModel
+
from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes
+from marker.v2.schema.blocks import BlockId
+
+
+class HTMLOutput(BaseModel):
+ html: str
+ images: dict
class HTMLRenderer(BaseRenderer):
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
+ def extract_image(self, document, image_id):
+ image_block = document.get_block(image_id)
+ page = document.get_page(image_block.page_id)
+ page_img = page.highres_image
+ image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
+ cropped = page_img.crop(image_box.bbox)
+ return cropped
+
def extract_html(self, document, document_output):
soup = BeautifulSoup(document_output.html, 'html.parser')
content_refs = soup.find_all('content-ref')
- ref_block_type = None
+ ref_block_id = None
+ images = {}
for ref in content_refs:
src = ref.get('src')
+ sub_images = {}
for item in document_output.children:
if item.id == src:
- content = self.extract_html(document, item)
- ref_block_type = item.id.block_type
+ content, sub_images = self.extract_html(document, item)
+ ref_block_id: BlockId = item.id
break
- if ref_block_type in self.remove_blocks:
+ if ref_block_id.block_type in self.remove_blocks:
ref.replace_with('')
+ elif ref_block_id.block_type in self.image_blocks:
+ image = self.extract_image(document, ref_block_id)
+ image_name = f"{ref_block_id.to_path()}.png"
+ images[image_name] = image
+ ref.replace_with(BeautifulSoup(f"", 'html.parser'))
else:
+ images.update(sub_images)
ref.replace_with(BeautifulSoup(f"{content}
", 'html.parser'))
- return str(soup)
+ return str(soup), images
- def __call__(self, document):
+ def __call__(self, document) -> HTMLOutput:
document_output = document.render()
- full_html = self.extract_html(document, document_output)
- return full_html
+ full_html, images = self.extract_html(document, document_output)
+ return HTMLOutput(
+ html=full_html,
+ images=images,
+ )
diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py
index 8610e56..7b557a2 100644
--- a/marker/v2/renderers/markdown.py
+++ b/marker/v2/renderers/markdown.py
@@ -1,17 +1,32 @@
-from markdownify import markdownify
+from markdownify import markdownify, MarkdownConverter
+from pydantic import BaseModel
+
from marker.v2.renderers.html import HTMLRenderer
+class Markdownify(MarkdownConverter):
+ pass
+
+
+class MarkdownOutput(BaseModel):
+ markdown: str
+ images: dict
+
+
class MarkdownRenderer(HTMLRenderer):
- def __call__(self, document):
+ def __call__(self, document) -> MarkdownOutput:
document_output = document.render()
- full_html = self.extract_html(document, document_output)
- return markdownify(
- full_html,
+ full_html, images = self.extract_html(document, document_output)
+ md_cls = Markdownify(
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
)
+ markdown = md_cls.convert(full_html)
+ return MarkdownOutput(
+ markdown=markdown,
+ images=images
+ )
diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
index b4e21e6..469c772 100644
--- a/marker/v2/schema/blocks/base.py
+++ b/marker/v2/schema/blocks/base.py
@@ -1,6 +1,7 @@
from __future__ import annotations
from typing import Optional, List, Any
+import re
from pydantic import BaseModel, ConfigDict, field_validator
@@ -44,6 +45,28 @@ def validate_block_type(cls, v):
raise ValueError(f"Invalid block type: {v}")
return v
+ def to_path(self):
+ return str(self).replace('/', '_')
+
+
+def merge_consecutive_tags(html, tag):
+ if not html:
+ return html
+
+ def replace_with_space(match):
+ closing_tag, whitespace, opening_tag = match.groups()
+ return whitespace if whitespace else ''
+
+ pattern = fr'{tag}>\s*<{tag}>'
+
+ while True:
+ new_merged = re.sub(pattern, replace_with_space, html)
+ if new_merged == html:
+ break
+ html = new_merged
+
+ return html
+
class Block(BaseModel):
polygon: PolygonBox
@@ -105,6 +128,8 @@ def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f""
+ template = merge_consecutive_tags(template, 'b')
+ template = merge_consecutive_tags(template, 'i')
return template
def render(self, document, parent_structure):
diff --git a/marker/v2/schema/blocks/equation.py b/marker/v2/schema/blocks/equation.py
index 74eefb7..0dcb709 100644
--- a/marker/v2/schema/blocks/equation.py
+++ b/marker/v2/schema/blocks/equation.py
@@ -6,4 +6,4 @@ class Equation(Block):
latex: str | None = None
def assemble_html(self, child_blocks, parent_structure=None):
- return f"{self.latex}
"
+ return f""
diff --git a/marker/v2/schema/blocks/figure.py b/marker/v2/schema/blocks/figure.py
index ac6c901..f9af2f0 100644
--- a/marker/v2/schema/blocks/figure.py
+++ b/marker/v2/schema/blocks/figure.py
@@ -5,4 +5,4 @@ class Figure(Block):
block_type: str = "Figure"
def assemble_html(self, child_blocks, parent_structure):
- return f"Image {self.block_id}"
+ return f"Image {self.block_id}
"
diff --git a/marker/v2/schema/blocks/form.py b/marker/v2/schema/blocks/form.py
index 6e62ad2..294c5d2 100644
--- a/marker/v2/schema/blocks/form.py
+++ b/marker/v2/schema/blocks/form.py
@@ -1,5 +1,14 @@
+from typing import List
+
+from tabled.formats import html_format
+from tabled.schema import SpanTableCell
+
from marker.v2.schema.blocks import Block
class Form(Block):
block_type: str = "Form"
+ cells: List[SpanTableCell] | None = None
+
+ def assemble_html(self, child_blocks, parent_structure=None):
+ return html_format(self.cells)
\ No newline at end of file
diff --git a/marker/v2/schema/blocks/pagefooter.py b/marker/v2/schema/blocks/pagefooter.py
index 329885c..d1b5ce4 100644
--- a/marker/v2/schema/blocks/pagefooter.py
+++ b/marker/v2/schema/blocks/pagefooter.py
@@ -3,3 +3,8 @@
class PageFooter(Block):
block_type: str = "PageFooter"
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/pageheader.py b/marker/v2/schema/blocks/pageheader.py
index 3215073..5c9f530 100644
--- a/marker/v2/schema/blocks/pageheader.py
+++ b/marker/v2/schema/blocks/pageheader.py
@@ -3,3 +3,8 @@
class PageHeader(Block):
block_type: str = "PageHeader"
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/picture.py b/marker/v2/schema/blocks/picture.py
index e885259..c3151b2 100644
--- a/marker/v2/schema/blocks/picture.py
+++ b/marker/v2/schema/blocks/picture.py
@@ -5,4 +5,4 @@ class Picture(Block):
block_type: str = "Picture"
def assemble_html(self, child_blocks, parent_structure):
- return f"Image {self.block_id}"
+ return f"Image {self.block_id}
"
diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py
index 5e0266b..0d0dd8e 100644
--- a/marker/v2/schema/blocks/text.py
+++ b/marker/v2/schema/blocks/text.py
@@ -1,6 +1,5 @@
from marker.v2.schema.blocks import Block
-
class Text(Block):
block_type: str = "Text"
diff --git a/marker/v2/schema/blocks/toc.py b/marker/v2/schema/blocks/toc.py
index 8bfeee1..f9eeb39 100644
--- a/marker/v2/schema/blocks/toc.py
+++ b/marker/v2/schema/blocks/toc.py
@@ -1,5 +1,14 @@
+from typing import List
+
+from tabled.formats import html_format
+from tabled.schema import SpanTableCell
+
from marker.v2.schema.blocks import Block
class TableOfContents(Block):
block_type: str = "TableOfContents"
+ cells: List[SpanTableCell] | None = None
+
+ def assemble_html(self, child_blocks, parent_structure=None):
+ return html_format(self.cells)
diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py
index ca9b667..8ebb67a 100644
--- a/marker/v2/schema/document.py
+++ b/marker/v2/schema/document.py
@@ -20,12 +20,17 @@ class Document(BaseModel):
block_type: str = "Document"
def get_block(self, block_id: BlockId):
- page = [p for p in self.pages if p.page_id == block_id.page_id][0]
+ page = self.get_page(block_id.page_id)
block = page.get_block(block_id)
if block:
return block
return None
+ def get_page(self, page_id):
+ page = self.pages[page_id]
+ assert page.page_id == page_id, "Mismatch between page_id and page index"
+ return page
+
def assemble_html(self, child_blocks):
template = ""
for c in child_blocks:
diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py
index 223bc70..5ae27be 100644
--- a/marker/v2/schema/groups/list.py
+++ b/marker/v2/schema/groups/list.py
@@ -6,4 +6,4 @@ class ListGroup(Block):
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
- return f""
\ No newline at end of file
+ return f"
"
\ No newline at end of file
diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py
index dc04987..e2f6205 100644
--- a/marker/v2/schema/text/line.py
+++ b/marker/v2/schema/text/line.py
@@ -25,7 +25,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)
if hyphen_regex.match(line_text) and next_line_starts_lowercase:
- return replace_last(line_html, rf'[{HYPHENS}]', "")
+ line_html = replace_last(line_html, rf'[{HYPHENS}]', "")
+
return line_html
From 706bda32d18fe687a86db8dd4c2bfb8ce3dc58e4 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Mon, 18 Nov 2024 08:13:58 -0500
Subject: [PATCH 2/2] Merge consecutive output tags
---
marker/v2/converters/pdf.py | 2 +-
marker/v2/renderers/html.py | 33 +++++++++++++++++++++++++++++----
marker/v2/schema/blocks/base.py | 21 ---------------------
3 files changed, 30 insertions(+), 26 deletions(-)
diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
index 7e045f9..382cca9 100644
--- a/marker/v2/converters/pdf.py
+++ b/marker/v2/converters/pdf.py
@@ -65,7 +65,7 @@ def main(output: str, fname: str):
f.write(rendered.markdown)
for img_name, img in rendered.images.items():
- img.save(os.path.join(output, img_name))
+ img.save(os.path.join(output, img_name), "PNG")
if __name__ == "__main__":
diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py
index 703acc8..06b82b6 100644
--- a/marker/v2/renderers/html.py
+++ b/marker/v2/renderers/html.py
@@ -1,3 +1,5 @@
+import re
+
from bs4 import BeautifulSoup
from pydantic import BaseModel
@@ -11,6 +13,24 @@ class HTMLOutput(BaseModel):
images: dict
+def merge_consecutive_tags(html, tag):
+ if not html:
+ return html
+
+ def replace_whitespace(match):
+ return match.group(1)
+
+ pattern = fr'{tag}>(\s*)<{tag}>'
+
+ while True:
+ new_merged = re.sub(pattern, replace_whitespace, html)
+ if new_merged == html:
+ break
+ html = new_merged
+
+ return html
+
+
class HTMLRenderer(BaseRenderer):
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
@@ -23,7 +43,7 @@ def extract_image(self, document, image_id):
cropped = page_img.crop(image_box.bbox)
return cropped
- def extract_html(self, document, document_output):
+ def extract_html(self, document, document_output, level=0):
soup = BeautifulSoup(document_output.html, 'html.parser')
content_refs = soup.find_all('content-ref')
@@ -34,7 +54,7 @@ def extract_html(self, document, document_output):
sub_images = {}
for item in document_output.children:
if item.id == src:
- content, sub_images = self.extract_html(document, item)
+ content, sub_images = self.extract_html(document, item, level + 1)
ref_block_id: BlockId = item.id
break
@@ -47,9 +67,14 @@ def extract_html(self, document, document_output):
ref.replace_with(BeautifulSoup(f"", 'html.parser'))
else:
images.update(sub_images)
- ref.replace_with(BeautifulSoup(f"{content}
", 'html.parser'))
+ ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
+
+ output = str(soup)
+ if level == 0:
+ output = merge_consecutive_tags(output, 'b')
+ output = merge_consecutive_tags(output, 'i')
- return str(soup), images
+ return output, images
def __call__(self, document) -> HTMLOutput:
document_output = document.render()
diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
index 469c772..6148d0f 100644
--- a/marker/v2/schema/blocks/base.py
+++ b/marker/v2/schema/blocks/base.py
@@ -49,25 +49,6 @@ def to_path(self):
return str(self).replace('/', '_')
-def merge_consecutive_tags(html, tag):
- if not html:
- return html
-
- def replace_with_space(match):
- closing_tag, whitespace, opening_tag = match.groups()
- return whitespace if whitespace else ''
-
- pattern = fr'{tag}>\s*<{tag}>'
-
- while True:
- new_merged = re.sub(pattern, replace_with_space, html)
- if new_merged == html:
- break
- html = new_merged
-
- return html
-
-
class Block(BaseModel):
polygon: PolygonBox
block_type: Optional[str] = None
@@ -128,8 +109,6 @@ def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f""
- template = merge_consecutive_tags(template, 'b')
- template = merge_consecutive_tags(template, 'i')
return template
def render(self, document, parent_structure):