Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output images, clean up other output formats #362

Merged
merged 3 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ report.json
benchmark_data
debug_data
temp.md
temp

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
25 changes: 21 additions & 4 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os
import tempfile
from typing import List, Optional

import click
import datasets
from pydantic import BaseModel

from marker.v2.providers.pdf import PdfProvider

Expand Down Expand Up @@ -46,9 +51,14 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
return renderer(document)


if __name__ == "__main__":
@click.command()
@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
@click.option("--fname", type=str, default="adversarial.pdf")
def main(output: str, fname: str):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index('adversarial.pdf')
idx = dataset['filename'].index(fname)
out_filename = fname.rsplit(".", 1)[0] + ".md"
os.makedirs(output, exist_ok=True)

with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
temp_pdf.write(dataset['pdf'][idx])
Expand All @@ -57,5 +67,12 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
converter = PdfConverter()
rendered = converter(temp_pdf.name)

with open("temp.md", "w+") as f:
f.write(rendered)
with open(os.path.join(output, out_filename), "w+") as f:
f.write(rendered.markdown)

for img_name, img in rendered.images.items():
img.save(os.path.join(output, img_name), "PNG")


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions marker/v2/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Tuple

from pydantic import BaseModel

Expand All @@ -8,7 +8,7 @@


class BaseProcessor:
block_type: BlockTypes | None = None # What block type this processor is responsible for
block_types: Tuple[str] | None = None # What block types this processor is responsible for

def __init__(self, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class EquationProcessor(BaseProcessor):
block_type = BlockTypes.Equation
block_types = (BlockTypes.Equation, )
model_max_length = 384
batch_size = None
token_buffer = 256
Expand All @@ -26,7 +26,7 @@ def __call__(self, document: Document):

for page in document.pages:
for block in page.children:
if block.block_type != self.block_type:
if block.block_type not in self.block_types:
continue
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


class TableProcessor(BaseProcessor):
block_type = BlockTypes.Table
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
detect_boxes = False
detector_batch_size = None
table_rec_batch_size = None
Expand All @@ -31,7 +31,7 @@ def __call__(self, document: Document):
table_data = []
for page in document.pages:
for block in page.children:
if block.block_type != self.block_type:
if block.block_type not in self.block_types:
continue

image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
Expand Down
1 change: 1 addition & 0 deletions marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from marker.v2.schema import BlockTypes



class BaseRenderer:
block_type: BlockTypes | None = None

Expand Down
72 changes: 62 additions & 10 deletions marker/v2/renderers/html.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,85 @@
import re

from bs4 import BeautifulSoup
from pydantic import BaseModel

from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import BlockId


class HTMLOutput(BaseModel):
    """Result model returned by HTMLRenderer: rendered HTML plus its images."""
    # Full HTML string produced by extract_html.
    html: str
    # Image mapping collected by extract_html; keys are "<block path>.png"
    # names and values are the cropped page images.
    images: dict


def merge_consecutive_tags(html, tag):
    """Merge adjacent ``</tag>...<tag>`` pairs into a single element.

    A closing ``tag`` followed only by whitespace and then an attribute-less
    opening ``tag`` is replaced by that whitespace, so ``<b>a</b> <b>b</b>``
    becomes ``<b>a b</b>``.

    Args:
        html: Markup to process. Falsy values (``None``, ``""``) are returned
            unchanged.
        tag: Tag name to merge, e.g. ``"b"``. It is matched literally.

    Returns:
        The markup with every such close/open pair removed.
    """
    if not html:
        return html

    # Escape the name so it is matched literally instead of being read as
    # regex syntax, and compile once rather than on every pass.
    name = re.escape(tag)
    boundary = re.compile(rf"</{name}>(\s*)<{name}>")

    # Removing one boundary can bring another close/open pair together
    # (e.g. "</b></b><b><b>"), so repeat until a pass replaces nothing.
    replaced = 1
    while replaced:
        html, replaced = boundary.subn(r"\1", html)

    return html


class HTMLRenderer(BaseRenderer):
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]

def extract_html(self, document, document_output):
def extract_image(self, document, image_id):
    """Crop a block's region out of its page's high-resolution image.

    Args:
        document: Object providing ``get_block`` and ``get_page``.
        image_id: Id of the block to crop; passed to ``document.get_block``.

    Returns:
        The result of cropping the page's ``highres_image`` to the block's
        rescaled bounding box.
    """
    block = document.get_block(image_id)
    source_page = document.get_page(block.page_id)
    highres = source_page.highres_image
    # Rescale the block polygon from the page polygon's size to the image's
    # size before cropping.
    scaled = block.polygon.rescale(source_page.polygon.size, highres.size)
    return highres.crop(scaled.bbox)

def extract_html(self, document, document_output, level=0):
soup = BeautifulSoup(document_output.html, 'html.parser')

content_refs = soup.find_all('content-ref')
ref_block_type = None
ref_block_id = None
images = {}
for ref in content_refs:
src = ref.get('src')
sub_images = {}
for item in document_output.children:
if item.id == src:
content = self.extract_html(document, item)
ref_block_type = item.id.block_type
content, sub_images = self.extract_html(document, item, level + 1)
ref_block_id: BlockId = item.id
break

if ref_block_type in self.remove_blocks:
if ref_block_id.block_type in self.remove_blocks:
ref.replace_with('')
elif ref_block_id.block_type in self.image_blocks:
image = self.extract_image(document, ref_block_id)
image_name = f"{ref_block_id.to_path()}.png"
images[image_name] = image
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
else:
ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
images.update(sub_images)
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))

output = str(soup)
if level == 0:
output = merge_consecutive_tags(output, 'b')
output = merge_consecutive_tags(output, 'i')

return str(soup)
return output, images

def __call__(self, document):
def __call__(self, document) -> HTMLOutput:
document_output = document.render()
full_html = self.extract_html(document, document_output)
return full_html
full_html, images = self.extract_html(document, document_output)
return HTMLOutput(
html=full_html,
images=images,
)
25 changes: 20 additions & 5 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
from markdownify import markdownify
from markdownify import markdownify, MarkdownConverter
from pydantic import BaseModel

from marker.v2.renderers.html import HTMLRenderer
from marker.v2.schema.document import Document


class Markdownify(MarkdownConverter):
    """MarkdownConverter subclass that currently overrides nothing."""
    pass


class MarkdownOutput(BaseModel):
    """Result model returned by MarkdownRenderer: markdown text plus images."""
    # Markdown produced by converting the rendered HTML.
    markdown: str
    # Image mapping returned by extract_html, passed through unchanged.
    images: dict


class MarkdownRenderer(HTMLRenderer):
def __call__(self, document: Document):
def __call__(self, document: Document) -> MarkdownOutput:
document_output = document.render()
full_html = self.extract_html(document, document_output)
return markdownify(
full_html,
full_html, images = self.extract_html(document, document_output)
md_cls = Markdownify(
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
)
markdown = md_cls.convert(full_html)
return MarkdownOutput(
markdown=markdown,
images=images
)
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def validate_block_type(cls, v):
raise ValueError(f"Invalid block type: {v}")
return v

def to_path(self):
    """Return the string form of this id with every '/' replaced by '_'."""
    return "_".join(str(self).split("/"))


class Block(BaseModel):
polygon: PolygonBox
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class Equation(Block):
latex: str | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return f"<div class='math'>{self.latex}</div>"
return f"<p><math>{self.latex}</math></p>"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ class Figure(Block):
block_type: BlockTypes = BlockTypes.Figure

def assemble_html(self, child_blocks, parent_structure):
return f"Image {self.block_id}"
return f"<p>Image {self.block_id}</p>"
12 changes: 11 additions & 1 deletion marker/v2/schema/blocks/form.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Form(Block):
block_type: BlockTypes = BlockTypes.Form
block_type: str = BlockTypes.Form
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return html_format(self.cells)

7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/pagefooter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,9 @@


class PageFooter(Block):
block_type: BlockTypes = BlockTypes.PageFooter
block_type: str = BlockTypes.PageFooter

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/pageheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,9 @@


class PageHeader(Block):
block_type: BlockTypes = BlockTypes.PageHeader
block_type: str = BlockTypes.PageHeader

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/picture.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ class Picture(Block):
block_type: BlockTypes = BlockTypes.Picture

def assemble_html(self, child_blocks, parent_structure):
return f"Image {self.block_id}"
return f"<p>Image {self.block_id}</p>"
1 change: 0 additions & 1 deletion marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Text(Block):
block_type: BlockTypes = BlockTypes.Text

Expand Down
11 changes: 10 additions & 1 deletion marker/v2/schema/blocks/toc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class TableOfContents(Block):
block_type: BlockTypes = BlockTypes.TableOfContents
block_type: str = BlockTypes.TableOfContents
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return html_format(self.cells)
8 changes: 7 additions & 1 deletion marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@ class Document(BaseModel):
block_type: BlockTypes = BlockTypes.Document

def get_block(self, block_id: BlockId):
block = self.pages[block_id.page_id].get_block(block_id)
page = self.get_page(block_id.page_id)
block = page.get_block(block_id)
if block:
return block
return None

def get_page(self, page_id):
    """Return the page stored at index ``page_id`` in ``self.pages``.

    Args:
        page_id: Index into ``self.pages``. It must also equal the
            ``page_id`` attribute of the page found there.

    Returns:
        The page object at that index.

    Raises:
        AssertionError: If the stored page's ``page_id`` differs from the
            index it was looked up by.
    """
    page = self.pages[page_id]
    # Raised explicitly instead of via ``assert`` so the consistency check
    # still runs under ``python -O``; the exception type is unchanged.
    if page.page_id != page_id:
        raise AssertionError("Mismatch between page_id and page index")
    return page

def assemble_html(self, child_blocks):
template = ""
for c in child_blocks:
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class ListGroup(Block):

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<ul>{template}</ul>"
return f"<p><ul>{template}</ul></p>"
3 changes: 2 additions & 1 deletion marker/v2/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)

if hyphen_regex.match(line_text) and next_line_starts_lowercase:
return replace_last(line_html, rf'[{HYPHENS}]', "")
line_html = replace_last(line_html, rf'[{HYPHENS}]', "")

return line_html


Expand Down
Loading