Skip to content

Commit

Permalink
Merge pull request #362 from VikParuchuri/vik_v2
Browse files Browse the repository at this point in the history
Output images, clean up other output formats
  • Loading branch information
VikParuchuri authored Nov 18, 2024
2 parents e662972 + 35dbba1 commit f4ff48b
Show file tree
Hide file tree
Showing 20 changed files with 160 additions and 36 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ report.json
benchmark_data
debug_data
temp.md
temp

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
25 changes: 21 additions & 4 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os
from marker.v2.providers.pdf import PdfProvider
import tempfile
from typing import List, Optional

import click
import datasets
from pydantic import BaseModel

Expand Down Expand Up @@ -46,9 +51,14 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
return renderer(document)


if __name__ == "__main__":
@click.command()
@click.option("--output", type=click.Path(exists=False), required=False, default="temp")
@click.option("--fname", type=str, default="adversarial.pdf")
def main(output: str, fname: str):
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index('adversarial.pdf')
idx = dataset['filename'].index(fname)
out_filename = fname.rsplit(".", 1)[0] + ".md"
os.makedirs(output, exist_ok=True)

with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
temp_pdf.write(dataset['pdf'][idx])
Expand All @@ -57,5 +67,12 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
converter = PdfConverter()
rendered = converter(temp_pdf.name)

with open("temp.md", "w+") as f:
f.write(rendered)
with open(os.path.join(output, out_filename), "w+") as f:
f.write(rendered.markdown)

for img_name, img in rendered.images.items():
img.save(os.path.join(output, img_name), "PNG")


if __name__ == "__main__":
main()
4 changes: 2 additions & 2 deletions marker/v2/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Tuple

from pydantic import BaseModel

Expand All @@ -8,7 +8,7 @@


class BaseProcessor:
block_type: BlockTypes | None = None # What block type this processor is responsible for
block_types: Tuple[str] | None = None # What block types this processor is responsible for

def __init__(self, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/processors/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class EquationProcessor(BaseProcessor):
block_type = BlockTypes.Equation
block_types = (BlockTypes.Equation, )
model_max_length = 384
batch_size = None
token_buffer = 256
Expand All @@ -26,7 +26,7 @@ def __call__(self, document: Document):

for page in document.pages:
for block in page.children:
if block.block_type != self.block_type:
if block.block_type not in self.block_types:
continue
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size)
image = page.lowres_image.crop(image_poly.bbox).convert("RGB")
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


class TableProcessor(BaseProcessor):
block_type = BlockTypes.Table
block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form)
detect_boxes = False
detector_batch_size = None
table_rec_batch_size = None
Expand All @@ -31,7 +31,7 @@ def __call__(self, document: Document):
table_data = []
for page in document.pages:
for block in page.children:
if block.block_type != self.block_type:
if block.block_type not in self.block_types:
continue

image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
Expand Down
1 change: 1 addition & 0 deletions marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from marker.v2.schema import BlockTypes



class BaseRenderer:
block_type: BlockTypes | None = None

Expand Down
72 changes: 62 additions & 10 deletions marker/v2/renderers/html.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,85 @@
import re

from bs4 import BeautifulSoup
from pydantic import BaseModel

from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import BlockId


class HTMLOutput(BaseModel):
html: str
images: dict


def merge_consecutive_tags(html, tag):
    """Merge adjacent ``<tag>`` elements into a single element.

    Every ``</tag>`` that is followed, optionally after whitespace, by an
    attribute-less ``<tag>`` is removed together with that opening tag; the
    whitespace between them is kept. For example
    ``<b>a</b> <b>b</b>`` becomes ``<b>a b</b>``.

    Args:
        html: The HTML string to process. Falsy values (``""``, ``None``)
            are returned unchanged.
        tag: The tag name to merge, e.g. ``"b"`` or ``"i"``. It is matched
            literally.

    Returns:
        The HTML string with all such close/open boundaries removed.
    """
    if not html:
        return html

    # Escape the tag name so any regex metacharacter in it is matched
    # literally, and compile once instead of on every pass of the loop.
    escaped = re.escape(tag)
    boundary = re.compile(fr'</{escaped}>(\s*)<{escaped}>')

    # Run to a fixed point: removing one boundary can bring a new close/open
    # pair together (e.g. "</b></b><b><b>"), which a single pass would miss.
    while True:
        merged = boundary.sub(lambda match: match.group(1), html)
        if merged == html:
            return html
        html = merged


class HTMLRenderer(BaseRenderer):
    """Flatten a rendered document tree into one HTML string plus its images.

    The HTML of each rendered node contains ``<content-ref src=...>``
    placeholders whose ``src`` is compared against the ids of the node's
    children; ``extract_html`` replaces every placeholder recursively.
    """

    # Blocks of these types are removed from the output.
    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
    # Blocks of these types are replaced by an <img> tag that points at a
    # crop of the page image.
    image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]

    def extract_image(self, document, image_id):
        """Return the crop of the page's high-res image covered by a block.

        Args:
            document: The document that owns the block and its page.
            image_id: Id of the block to crop.

        Returns:
            The cropped image returned by ``page.highres_image.crop``.
        """
        image_block = document.get_block(image_id)
        page = document.get_page(image_block.page_id)
        page_img = page.highres_image
        # Rescale the block polygon from page coordinates to image pixels.
        image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
        cropped = page_img.crop(image_box.bbox)
        return cropped

    def extract_html(self, document, document_output, level=0):
        """Resolve all ``<content-ref>`` placeholders of one rendered node.

        Args:
            document: The document being rendered (used to crop images).
            document_output: Rendered node exposing ``html`` and ``children``.
            level: Recursion depth; ``0`` for the document root.

        Returns:
            A ``(html, images)`` tuple, where ``images`` maps each generated
            image file name to its cropped image.
        """
        soup = BeautifulSoup(document_output.html, 'html.parser')
        images = {}

        for ref in soup.find_all('content-ref'):
            src = ref.get('src')
            # Look the child up afresh for every placeholder, so a miss can
            # never reuse the block or content matched by an earlier one.
            child = next(
                (item for item in document_output.children if item.id == src),
                None,
            )
            if child is None:
                # No rendered child for this placeholder: leave it in the
                # output instead of crashing or substituting stale content.
                continue

            ref_block_id: BlockId = child.id
            if ref_block_id.block_type in self.remove_blocks:
                ref.replace_with('')
            elif ref_block_id.block_type in self.image_blocks:
                image = self.extract_image(document, ref_block_id)
                image_name = f"{ref_block_id.to_path()}.png"
                images[image_name] = image
                ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
            else:
                # Recurse only here: removed and image blocks never use the
                # nested HTML or the nested images.
                content, sub_images = self.extract_html(document, child, level + 1)
                images.update(sub_images)
                ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))

        output = str(soup)
        if level == 0:
            # Merge adjacent bold/italic runs once, on the assembled document.
            output = merge_consecutive_tags(output, 'b')
            output = merge_consecutive_tags(output, 'i')

        return output, images

    def __call__(self, document) -> HTMLOutput:
        """Render ``document`` and return its HTML together with its images."""
        document_output = document.render()
        full_html, images = self.extract_html(document, document_output)
        return HTMLOutput(
            html=full_html,
            images=images,
        )
25 changes: 20 additions & 5 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
from markdownify import markdownify
from markdownify import markdownify, MarkdownConverter
from pydantic import BaseModel

from marker.v2.renderers.html import HTMLRenderer
from marker.v2.schema.document import Document


class Markdownify(MarkdownConverter):
pass


class MarkdownOutput(BaseModel):
markdown: str
images: dict


class MarkdownRenderer(HTMLRenderer):
def __call__(self, document: Document):
def __call__(self, document: Document) -> MarkdownOutput:
document_output = document.render()
full_html = self.extract_html(document, document_output)
return markdownify(
full_html,
full_html, images = self.extract_html(document, document_output)
md_cls = Markdownify(
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
)
markdown = md_cls.convert(full_html)
return MarkdownOutput(
markdown=markdown,
images=images
)
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def validate_block_type(cls, v):
raise ValueError(f"Invalid block type: {v}")
return v

def to_path(self):
return str(self).replace('/', '_')


class Block(BaseModel):
polygon: PolygonBox
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class Equation(Block):
latex: str | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return f"<div class='math'>{self.latex}</div>"
return f"<p><math>{self.latex}</math></p>"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ class Figure(Block):
block_type: BlockTypes = BlockTypes.Figure

def assemble_html(self, child_blocks, parent_structure):
return f"Image {self.block_id}"
return f"<p>Image {self.block_id}</p>"
12 changes: 11 additions & 1 deletion marker/v2/schema/blocks/form.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Form(Block):
block_type: BlockTypes = BlockTypes.Form
block_type: str = BlockTypes.Form
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return html_format(self.cells)

7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/pagefooter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,9 @@


class PageFooter(Block):
block_type: BlockTypes = BlockTypes.PageFooter
block_type: str = BlockTypes.PageFooter

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
7 changes: 6 additions & 1 deletion marker/v2/schema/blocks/pageheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,9 @@


class PageHeader(Block):
block_type: BlockTypes = BlockTypes.PageHeader
block_type: str = BlockTypes.PageHeader

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/picture.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ class Picture(Block):
block_type: BlockTypes = BlockTypes.Picture

def assemble_html(self, child_blocks, parent_structure):
return f"Image {self.block_id}"
return f"<p>Image {self.block_id}</p>"
1 change: 0 additions & 1 deletion marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class Text(Block):
block_type: BlockTypes = BlockTypes.Text

Expand Down
11 changes: 10 additions & 1 deletion marker/v2/schema/blocks/toc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class TableOfContents(Block):
block_type: BlockTypes = BlockTypes.TableOfContents
block_type: str = BlockTypes.TableOfContents
cells: List[SpanTableCell] | None = None

def assemble_html(self, child_blocks, parent_structure=None):
return html_format(self.cells)
8 changes: 7 additions & 1 deletion marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@ class Document(BaseModel):
block_type: BlockTypes = BlockTypes.Document

def get_block(self, block_id: BlockId):
block = self.pages[block_id.page_id].get_block(block_id)
page = self.get_page(block_id.page_id)
block = page.get_block(block_id)
if block:
return block
return None

def get_page(self, page_id):
    """Return the page stored at index ``page_id`` of ``self.pages``.

    Args:
        page_id: Index into ``self.pages``; it must equal the ``page_id``
            attribute of the page stored there.

    Returns:
        The page object at that index.

    Raises:
        AssertionError: If the stored page's ``page_id`` differs from the
            index it was looked up by.
    """
    page = self.pages[page_id]
    # Raise explicitly instead of using `assert`, so the consistency check
    # still runs when Python is started with -O (which strips asserts).
    if page.page_id != page_id:
        raise AssertionError("Mismatch between page_id and page index")
    return page

def assemble_html(self, child_blocks):
template = ""
for c in child_blocks:
Expand Down
2 changes: 1 addition & 1 deletion marker/v2/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class ListGroup(Block):

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<ul>{template}</ul>"
return f"<p><ul>{template}</ul></p>"
3 changes: 2 additions & 1 deletion marker/v2/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)

if hyphen_regex.match(line_text) and next_line_starts_lowercase:
return replace_last(line_html, rf'[{HYPHENS}]', "")
line_html = replace_last(line_html, rf'[{HYPHENS}]', "")

return line_html


Expand Down

0 comments on commit f4ff48b

Please sign in to comment.