-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #362 from VikParuchuri/vik_v2
Output images, clean up other output formats
- Loading branch information
Showing
20 changed files
with
160 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
from marker.v2.schema import BlockTypes | ||
|
||
|
||
|
||
class BaseRenderer: | ||
block_type: BlockTypes | None = None | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,85 @@ | ||
import re | ||
|
||
from bs4 import BeautifulSoup | ||
from pydantic import BaseModel | ||
|
||
from marker.v2.renderers import BaseRenderer | ||
from marker.v2.schema import BlockTypes | ||
from marker.v2.schema.blocks import BlockId | ||
|
||
|
||
class HTMLOutput(BaseModel):
    """Result of HTML rendering: the markup plus the images it references."""

    # Rendered HTML for the document.
    html: str
    # Image file name (as used in <img src=...>) -> extracted image object.
    images: dict
|
||
|
||
def merge_consecutive_tags(html, tag):
    """Merge adjacent ``<tag>...</tag>`` runs separated only by whitespace.

    Each ``</tag>`` that is followed (optionally after whitespace) by an
    attribute-less ``<tag>`` is collapsed, keeping the whitespace between
    them, so ``<b>a</b> <b>b</b>`` becomes ``<b>a b</b>``.

    Args:
        html: HTML text to process. Falsy values (``""``, ``None``) are
            returned unchanged.
        tag: Bare tag name such as ``"b"`` or ``"i"``. It is matched
            literally, never interpreted as a regular expression.

    Returns:
        The HTML text with consecutive tags merged.
    """
    if not html:
        return html

    escaped = re.escape(tag)
    pattern = re.compile(fr'</{escaped}>(\s*)<{escaped}>')

    def keep_whitespace(match):
        # A callable replacement avoids backslash processing of the
        # captured whitespace.
        return match.group(1)

    # Re-run until a pass leaves the text unchanged.
    while True:
        merged = pattern.sub(keep_whitespace, html)
        if merged == html:
            return html
        html = merged
|
||
|
||
class HTMLRenderer(BaseRenderer):
    """Render a document to HTML by resolving ``<content-ref>`` placeholders."""

    # Block types that are dropped from the output entirely.
    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
    # Block types that are emitted as <img> tags backed by a page-image crop.
    image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]

    def extract_image(self, document, image_id):
        """Return the crop of the page's high-res image covering block ``image_id``."""
        image_block = document.get_block(image_id)
        page = document.get_page(image_block.page_id)
        page_img = page.highres_image
        # Rescale the block polygon from the page's size to the image's size.
        image_box = image_block.polygon.rescale(page.polygon.size, page_img.size)
        return page_img.crop(image_box.bbox)

    def extract_html(self, document, document_output, level=0):
        """Recursively expand ``<content-ref>`` tags in ``document_output.html``.

        Args:
            document: Document used to look up blocks and pages for images.
            document_output: Rendered node exposing ``html`` and ``children``.
            level: Recursion depth; tag merging runs only at depth 0.

        Returns:
            A ``(html, images)`` tuple, where ``images`` maps generated image
            file names to the extracted images.
        """
        soup = BeautifulSoup(document_output.html, 'html.parser')
        images = {}

        for ref in soup.find_all('content-ref'):
            src = ref.get('src')
            child = next(
                (item for item in document_output.children if item.id == src),
                None,
            )
            if child is None:
                # No rendered child matches this reference. Leave the
                # placeholder in place rather than reusing state from a
                # previous reference or failing on a missing block id.
                continue

            content, sub_images = self.extract_html(document, child, level + 1)
            ref_block_id: BlockId = child.id

            if ref_block_id.block_type in self.remove_blocks:
                ref.replace_with('')
            elif ref_block_id.block_type in self.image_blocks:
                image = self.extract_image(document, ref_block_id)
                image_name = f"{ref_block_id.to_path()}.png"
                images[image_name] = image
                ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
            else:
                images.update(sub_images)
                ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))

        output = str(soup)
        if level == 0:
            # Merge only once, on the fully assembled document.
            output = merge_consecutive_tags(output, 'b')
            output = merge_consecutive_tags(output, 'i')

        return output, images

    def __call__(self, document) -> HTMLOutput:
        """Render ``document`` and return its HTML together with its images."""
        document_output = document.render()
        full_html, images = self.extract_html(document, document_output)
        return HTMLOutput(
            html=full_html,
            images=images,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,31 @@ | ||
from markdownify import markdownify | ||
from markdownify import markdownify, MarkdownConverter | ||
from pydantic import BaseModel | ||
|
||
from marker.v2.renderers.html import HTMLRenderer | ||
from marker.v2.schema.document import Document | ||
|
||
|
||
class Markdownify(MarkdownConverter):
    """Project-local ``MarkdownConverter`` subclass; currently adds no overrides."""
|
||
|
||
class MarkdownOutput(BaseModel):
    """Result of Markdown rendering: the text plus the images collected with it."""

    # Markdown text converted from the rendered HTML.
    markdown: str
    # Images dict as returned by the HTML extraction step.
    images: dict
|
||
|
||
class MarkdownRenderer(HTMLRenderer):
    """Render a document to Markdown by converting the HTML renderer's output."""

    def __call__(self, document: Document) -> MarkdownOutput:
        """Render ``document`` to HTML, convert it to Markdown, and return both
        the Markdown text and the images gathered during HTML extraction."""
        document_output = document.render()
        full_html, images = self.extract_html(document, document_output)
        md_cls = Markdownify(
            heading_style="ATX",
            bullets="-",
            escape_misc=False,
            escape_underscores=False
        )
        markdown = md_cls.convert(full_html)
        return MarkdownOutput(
            markdown=markdown,
            images=images
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,16 @@ | ||
from typing import List | ||
|
||
from tabled.formats import html_format | ||
from tabled.schema import SpanTableCell | ||
|
||
from marker.v2.schema import BlockTypes | ||
from marker.v2.schema.blocks import Block | ||
|
||
|
||
class Form(Block):
    """Form block whose HTML is produced from its table cells."""

    block_type: str = BlockTypes.Form
    # Cells passed to html_format; defaults to None.
    cells: List[SpanTableCell] | None = None

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the HTML produced by ``html_format`` for ``self.cells``.

        ``child_blocks`` and ``parent_structure`` are accepted but not used.
        """
        return html_format(self.cells)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,15 @@ | ||
from typing import List | ||
|
||
from tabled.formats import html_format | ||
from tabled.schema import SpanTableCell | ||
|
||
from marker.v2.schema import BlockTypes | ||
from marker.v2.schema.blocks import Block | ||
|
||
|
||
class TableOfContents(Block):
    """Table-of-contents block whose HTML is produced from its table cells."""

    block_type: str = BlockTypes.TableOfContents
    # Cells passed to html_format; defaults to None.
    cells: List[SpanTableCell] | None = None

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the HTML produced by ``html_format`` for ``self.cells``.

        ``child_blocks`` and ``parent_structure`` are accepted but not used.
        """
        return html_format(self.cells)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters