From be91572245749637bcb37227ab59555196dd778b Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 18 Nov 2024 06:54:17 -0500 Subject: [PATCH 1/2] Output images, clean up other output formats --- .gitignore | 1 + marker/v2/converters/pdf.py | 20 ++++++++++--- marker/v2/processors/__init__.py | 4 +-- marker/v2/processors/equation.py | 4 +-- marker/v2/processors/table.py | 4 +-- marker/v2/renderers/__init__.py | 1 + marker/v2/renderers/html.py | 43 ++++++++++++++++++++++----- marker/v2/renderers/markdown.py | 25 ++++++++++++---- marker/v2/schema/blocks/base.py | 25 ++++++++++++++++ marker/v2/schema/blocks/equation.py | 2 +- marker/v2/schema/blocks/figure.py | 2 +- marker/v2/schema/blocks/form.py | 9 ++++++ marker/v2/schema/blocks/pagefooter.py | 5 ++++ marker/v2/schema/blocks/pageheader.py | 5 ++++ marker/v2/schema/blocks/picture.py | 2 +- marker/v2/schema/blocks/text.py | 1 - marker/v2/schema/blocks/toc.py | 9 ++++++ marker/v2/schema/document.py | 7 ++++- marker/v2/schema/groups/list.py | 2 +- marker/v2/schema/text/line.py | 3 +- 20 files changed, 144 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 0c6bc44..36d4690 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ report.json benchmark_data debug_data temp.md +temp # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 8627a36..7e045f9 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -1,6 +1,8 @@ +import os import tempfile from typing import List, Optional +import click import datasets from pydantic import BaseModel @@ -43,9 +45,14 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): return renderer(document) -if __name__ == "__main__": +@click.command() +@click.option("--output", type=click.Path(exists=False), required=False, default="temp") +@click.option("--fname", type=str, default="adversarial.pdf") +def main(output: str, fname: str): dataset = datasets.load_dataset("datalab-to/pdfs", split="train") - idx = dataset['filename'].index('adversarial.pdf') + idx = dataset['filename'].index(fname) + out_filename = fname.rsplit(".", 1)[0] + ".md" + os.makedirs(output, exist_ok=True) with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf: temp_pdf.write(dataset['pdf'][idx]) @@ -54,7 +61,12 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): converter = PdfConverter() rendered = converter(temp_pdf.name) - with open("temp.md", "w+") as f: - f.write(rendered) + with open(os.path.join(output, out_filename), "w+") as f: + f.write(rendered.markdown) + + for img_name, img in rendered.images.items(): + img.save(os.path.join(output, img_name)) +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/marker/v2/processors/__init__.py b/marker/v2/processors/__init__.py index 53dde66..caeef85 100644 --- a/marker/v2/processors/__init__.py +++ b/marker/v2/processors/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple from pydantic import BaseModel @@ -7,7 +7,7 @@ class BaseProcessor: - block_type: str | None = None # What block type this processor is responsible for + block_types: Tuple[str] | None = None # What block types this processor is responsible for def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) diff --git a/marker/v2/processors/equation.py b/marker/v2/processors/equation.py index e3988dd..c424f5d 100644 --- a/marker/v2/processors/equation.py +++ b/marker/v2/processors/equation.py @@ -11,7 +11,7 @@ class EquationProcessor(BaseProcessor): - block_type = "Equation" + block_types = ("Equation", ) model_max_length = 384 batch_size = None token_buffer = 256 @@ -26,7 +26,7 @@ def __call__(self, document: Document): for page in document.pages: for block in page.children: - if block.block_type != self.block_type: + if block.block_type not in self.block_types: continue image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.lowres_image.size) image = page.lowres_image.crop(image_poly.bbox).convert("RGB") diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py index 32c114c..31e7a2d 100644 --- a/marker/v2/processors/table.py +++ b/marker/v2/processors/table.py @@ -12,7 +12,7 @@ class TableProcessor(BaseProcessor): - block_type = BlockTypes.Table + block_types = (BlockTypes.Table, BlockTypes.TableOfContents, BlockTypes.Form) detect_boxes = False detector_batch_size = None table_rec_batch_size = None @@ -31,7 +31,7 @@ def __call__(self, document: Document): table_data = [] for page in document.pages: for block in page.children: - if block.block_type != self.block_type: + if block.block_type not in self.block_types: continue image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size) diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 5288213..eda11a8 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -3,6 +3,7 @@ from pydantic import BaseModel + class BaseRenderer: block_type: str | None = None diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py index c2f3743..703acc8 100644 --- a/marker/v2/renderers/html.py +++ b/marker/v2/renderers/html.py @@ -1,33 +1,60 @@ from bs4 import BeautifulSoup +from pydantic import BaseModel + from marker.v2.renderers import BaseRenderer from marker.v2.schema import BlockTypes +from marker.v2.schema.blocks import BlockId + + +class HTMLOutput(BaseModel): + html: str + images: dict class HTMLRenderer(BaseRenderer): remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] + def extract_image(self, document, image_id): + image_block = document.get_block(image_id) + page = document.get_page(image_block.page_id) + page_img = page.highres_image + image_box = image_block.polygon.rescale(page.polygon.size, page_img.size) + cropped = page_img.crop(image_box.bbox) + return cropped + def extract_html(self, document, document_output): soup = BeautifulSoup(document_output.html, 'html.parser') content_refs = soup.find_all('content-ref') - ref_block_type = None + ref_block_id = None + images = {} for ref in content_refs: src = ref.get('src') + sub_images = {} for item in document_output.children: if item.id == src: - content = self.extract_html(document, item) - ref_block_type = item.id.block_type + content, sub_images = self.extract_html(document, item) + ref_block_id: BlockId = item.id break - if ref_block_type in self.remove_blocks: + if ref_block_id.block_type in self.remove_blocks: ref.replace_with('') + elif ref_block_id.block_type in self.image_blocks: + image = self.extract_image(document, ref_block_id) + image_name = f"{ref_block_id.to_path()}.png" + images[image_name] = image + ref.replace_with(BeautifulSoup(f"

", 'html.parser')) else: + images.update(sub_images) ref.replace_with(BeautifulSoup(f"
{content}
", 'html.parser')) - return str(soup) + return str(soup), images - def __call__(self, document): + def __call__(self, document) -> HTMLOutput: document_output = document.render() - full_html = self.extract_html(document, document_output) - return full_html + full_html, images = self.extract_html(document, document_output) + return HTMLOutput( + html=full_html, + images=images, + ) diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py index 8610e56..7b557a2 100644 --- a/marker/v2/renderers/markdown.py +++ b/marker/v2/renderers/markdown.py @@ -1,17 +1,32 @@ -from markdownify import markdownify +from markdownify import markdownify, MarkdownConverter +from pydantic import BaseModel + from marker.v2.renderers.html import HTMLRenderer +class Markdownify(MarkdownConverter): + pass + + +class MarkdownOutput(BaseModel): + markdown: str + images: dict + + class MarkdownRenderer(HTMLRenderer): - def __call__(self, document): + def __call__(self, document) -> MarkdownOutput: document_output = document.render() - full_html = self.extract_html(document, document_output) - return markdownify( - full_html, + full_html, images = self.extract_html(document, document_output) + md_cls = Markdownify( heading_style="ATX", bullets="-", escape_misc=False, escape_underscores=False ) + markdown = md_cls.convert(full_html) + return MarkdownOutput( + markdown=markdown, + images=images + ) diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index b4e21e6..469c772 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Optional, List, Any +import re from pydantic import BaseModel, ConfigDict, field_validator @@ -44,6 +45,28 @@ def validate_block_type(cls, v): raise ValueError(f"Invalid block type: {v}") return v + def to_path(self): + return str(self).replace('/', '_') + + +def merge_consecutive_tags(html, tag): + if not html: + return html + + def replace_with_space(match): + closing_tag, whitespace, opening_tag = match.groups() + return whitespace if whitespace else '' + + pattern = fr'\s*<{tag}>' + + while True: + new_merged = re.sub(pattern, replace_with_space, html) + if new_merged == html: + break + html = new_merged + + return html + class Block(BaseModel): polygon: PolygonBox @@ -105,6 +128,8 @@ def assemble_html(self, child_blocks, parent_structure=None): template = "" for c in child_blocks: template += f"" + template = merge_consecutive_tags(template, 'b') + template = merge_consecutive_tags(template, 'i') return template def render(self, document, parent_structure): diff --git a/marker/v2/schema/blocks/equation.py b/marker/v2/schema/blocks/equation.py index 74eefb7..0dcb709 100644 --- a/marker/v2/schema/blocks/equation.py +++ b/marker/v2/schema/blocks/equation.py @@ -6,4 +6,4 @@ class Equation(Block): latex: str | None = None def assemble_html(self, child_blocks, parent_structure=None): - return f"
{self.latex}
" + return f"

{self.latex}

" diff --git a/marker/v2/schema/blocks/figure.py b/marker/v2/schema/blocks/figure.py index ac6c901..f9af2f0 100644 --- a/marker/v2/schema/blocks/figure.py +++ b/marker/v2/schema/blocks/figure.py @@ -5,4 +5,4 @@ class Figure(Block): block_type: str = "Figure" def assemble_html(self, child_blocks, parent_structure): - return f"Image {self.block_id}" + return f"

Image {self.block_id}

" diff --git a/marker/v2/schema/blocks/form.py b/marker/v2/schema/blocks/form.py index 6e62ad2..294c5d2 100644 --- a/marker/v2/schema/blocks/form.py +++ b/marker/v2/schema/blocks/form.py @@ -1,5 +1,14 @@ +from typing import List + +from tabled.formats import html_format +from tabled.schema import SpanTableCell + from marker.v2.schema.blocks import Block class Form(Block): block_type: str = "Form" + cells: List[SpanTableCell] | None = None + + def assemble_html(self, child_blocks, parent_structure=None): + return html_format(self.cells) \ No newline at end of file diff --git a/marker/v2/schema/blocks/pagefooter.py b/marker/v2/schema/blocks/pagefooter.py index 329885c..d1b5ce4 100644 --- a/marker/v2/schema/blocks/pagefooter.py +++ b/marker/v2/schema/blocks/pagefooter.py @@ -3,3 +3,8 @@ class PageFooter(Block): block_type: str = "PageFooter" + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

{template}

" diff --git a/marker/v2/schema/blocks/pageheader.py b/marker/v2/schema/blocks/pageheader.py index 3215073..5c9f530 100644 --- a/marker/v2/schema/blocks/pageheader.py +++ b/marker/v2/schema/blocks/pageheader.py @@ -3,3 +3,8 @@ class PageHeader(Block): block_type: str = "PageHeader" + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

{template}

" diff --git a/marker/v2/schema/blocks/picture.py b/marker/v2/schema/blocks/picture.py index e885259..c3151b2 100644 --- a/marker/v2/schema/blocks/picture.py +++ b/marker/v2/schema/blocks/picture.py @@ -5,4 +5,4 @@ class Picture(Block): block_type: str = "Picture" def assemble_html(self, child_blocks, parent_structure): - return f"Image {self.block_id}" + return f"

Image {self.block_id}

" diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py index 5e0266b..0d0dd8e 100644 --- a/marker/v2/schema/blocks/text.py +++ b/marker/v2/schema/blocks/text.py @@ -1,6 +1,5 @@ from marker.v2.schema.blocks import Block - class Text(Block): block_type: str = "Text" diff --git a/marker/v2/schema/blocks/toc.py b/marker/v2/schema/blocks/toc.py index 8bfeee1..f9eeb39 100644 --- a/marker/v2/schema/blocks/toc.py +++ b/marker/v2/schema/blocks/toc.py @@ -1,5 +1,14 @@ +from typing import List + +from tabled.formats import html_format +from tabled.schema import SpanTableCell + from marker.v2.schema.blocks import Block class TableOfContents(Block): block_type: str = "TableOfContents" + cells: List[SpanTableCell] | None = None + + def assemble_html(self, child_blocks, parent_structure=None): + return html_format(self.cells) diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index ca9b667..8ebb67a 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -20,12 +20,17 @@ class Document(BaseModel): block_type: str = "Document" def get_block(self, block_id: BlockId): - page = [p for p in self.pages if p.page_id == block_id.page_id][0] + page = self.get_page(block_id.page_id) block = page.get_block(block_id) if block: return block return None + def get_page(self, page_id): + page = self.pages[page_id] + assert page.page_id == page_id, "Mismatch between page_id and page index" + return page + def assemble_html(self, child_blocks): template = "" for c in child_blocks: diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index 223bc70..5ae27be 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -6,4 +6,4 @@ class ListGroup(Block): def assemble_html(self, child_blocks, parent_structure): template = super().assemble_html(child_blocks, parent_structure) - return f"" \ No newline at end of file + return f"

" \ No newline at end of file diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index dc04987..e2f6205 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -25,7 +25,8 @@ def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str: next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text) if hyphen_regex.match(line_text) and next_line_starts_lowercase: - return replace_last(line_html, rf'[{HYPHENS}]', "") + line_html = replace_last(line_html, rf'[{HYPHENS}]', "") + return line_html From 706bda32d18fe687a86db8dd4c2bfb8ce3dc58e4 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 18 Nov 2024 08:13:58 -0500 Subject: [PATCH 2/2] Merge consecutive output tags --- marker/v2/converters/pdf.py | 2 +- marker/v2/renderers/html.py | 33 +++++++++++++++++++++++++++++---- marker/v2/schema/blocks/base.py | 21 --------------------- 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 7e045f9..382cca9 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -65,7 +65,7 @@ def main(output: str, fname: str): f.write(rendered.markdown) for img_name, img in rendered.images.items(): - img.save(os.path.join(output, img_name)) + img.save(os.path.join(output, img_name), "PNG") if __name__ == "__main__": diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py index 703acc8..06b82b6 100644 --- a/marker/v2/renderers/html.py +++ b/marker/v2/renderers/html.py @@ -1,3 +1,5 @@ +import re + from bs4 import BeautifulSoup from pydantic import BaseModel @@ -11,6 +13,24 @@ class HTMLOutput(BaseModel): images: dict +def merge_consecutive_tags(html, tag): + if not html: + return html + + def replace_whitespace(match): + return match.group(1) + + pattern = fr'(\s*)<{tag}>' + + while True: + new_merged = re.sub(pattern, replace_whitespace, html) + if new_merged == html: + break + html = new_merged + + return html + + class HTMLRenderer(BaseRenderer): remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] @@ -23,7 +43,7 @@ def extract_image(self, document, image_id): cropped = page_img.crop(image_box.bbox) return cropped - def extract_html(self, document, document_output): + def extract_html(self, document, document_output, level=0): soup = BeautifulSoup(document_output.html, 'html.parser') content_refs = soup.find_all('content-ref') @@ -34,7 +54,7 @@ def extract_html(self, document, document_output): sub_images = {} for item in document_output.children: if item.id == src: - content, sub_images = self.extract_html(document, item) + content, sub_images = self.extract_html(document, item, level + 1) ref_block_id: BlockId = item.id break @@ -47,9 +67,14 @@ def extract_html(self, document, document_output): ref.replace_with(BeautifulSoup(f"

", 'html.parser')) else: images.update(sub_images) - ref.replace_with(BeautifulSoup(f"
{content}
", 'html.parser')) + ref.replace_with(BeautifulSoup(f"{content}", 'html.parser')) + + output = str(soup) + if level == 0: + output = merge_consecutive_tags(output, 'b') + output = merge_consecutive_tags(output, 'i') - return str(soup), images + return output, images def __call__(self, document) -> HTMLOutput: document_output = document.render() diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index 469c772..6148d0f 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -49,25 +49,6 @@ def to_path(self): return str(self).replace('/', '_') -def merge_consecutive_tags(html, tag): - if not html: - return html - - def replace_with_space(match): - closing_tag, whitespace, opening_tag = match.groups() - return whitespace if whitespace else '' - - pattern = fr'\s*<{tag}>' - - while True: - new_merged = re.sub(pattern, replace_with_space, html) - if new_merged == html: - break - html = new_merged - - return html - - class Block(BaseModel): polygon: PolygonBox block_type: Optional[str] = None @@ -128,8 +109,6 @@ def assemble_html(self, child_blocks, parent_structure=None): template = "" for c in child_blocks: template += f"" - template = merge_consecutive_tags(template, 'b') - template = merge_consecutive_tags(template, 'i') return template def render(self, document, parent_structure):