Skip to content

Commit

Permalink
Merge consecutive output tags
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 18, 2024
1 parent be91572 commit 706bda3
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 26 deletions.
2 changes: 1 addition & 1 deletion marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def main(output: str, fname: str):
f.write(rendered.markdown)

for img_name, img in rendered.images.items():
img.save(os.path.join(output, img_name))
img.save(os.path.join(output, img_name), "PNG")


if __name__ == "__main__":
Expand Down
33 changes: 29 additions & 4 deletions marker/v2/renderers/html.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from bs4 import BeautifulSoup
from pydantic import BaseModel

Expand All @@ -11,6 +13,24 @@ class HTMLOutput(BaseModel):
images: dict


def merge_consecutive_tags(html, tag):
    """Merge adjacent ``<tag>`` elements that are separated only by whitespace.

    Every ``</tag>`` followed by optional whitespace and then ``<tag>`` is
    replaced by just that whitespace, so ``<b>a</b> <b>b</b>`` becomes
    ``<b>a b</b>``.

    Args:
        html: Markup to process. Falsy values (``None``, ``""``) are
            returned unchanged.
        tag: Bare tag name, e.g. ``'b'`` or ``'i'``. Only attribute-less
            opening tags (``<tag>``) are matched.

    Returns:
        The markup with all such close/open pairs removed.
    """
    if not html:
        return html

    # Escape the tag name so regex metacharacters in it are matched
    # literally ('.' is legal in custom element names, for instance).
    escaped = re.escape(tag)
    pattern = re.compile(fr'</{escaped}>(\s*)<{escaped}>')

    # One substitution pass can expose new close/open pairs, e.g.
    # "</b></b><b><b>" -> "</b><b>", so repeat until nothing changes.
    while True:
        new_merged = pattern.sub(lambda match: match.group(1), html)
        if new_merged == html:
            break
        html = new_merged

    return html


class HTMLRenderer(BaseRenderer):
remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
Expand All @@ -23,7 +43,7 @@ def extract_image(self, document, image_id):
cropped = page_img.crop(image_box.bbox)
return cropped

def extract_html(self, document, document_output):
def extract_html(self, document, document_output, level=0):
soup = BeautifulSoup(document_output.html, 'html.parser')

content_refs = soup.find_all('content-ref')
Expand All @@ -34,7 +54,7 @@ def extract_html(self, document, document_output):
sub_images = {}
for item in document_output.children:
if item.id == src:
content, sub_images = self.extract_html(document, item)
content, sub_images = self.extract_html(document, item, level + 1)
ref_block_id: BlockId = item.id
break

Expand All @@ -47,9 +67,14 @@ def extract_html(self, document, document_output):
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
else:
images.update(sub_images)
ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))

output = str(soup)
if level == 0:
output = merge_consecutive_tags(output, 'b')
output = merge_consecutive_tags(output, 'i')

return str(soup), images
return output, images

def __call__(self, document) -> HTMLOutput:
document_output = document.render()
Expand Down
21 changes: 0 additions & 21 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,25 +49,6 @@ def to_path(self):
return str(self).replace('/', '_')


def merge_consecutive_tags(html, tag):
    """Join neighbouring ``<tag>`` runs, keeping the whitespace between them.

    Each ``</tag>``, optional whitespace, ``<tag>`` sequence is replaced by
    the whitespace alone (or nothing when there is none).

    Args:
        html: Markup to process; falsy input is returned as-is.
        tag: Bare tag name such as ``'b'`` or ``'i'``.

    Returns:
        The markup with consecutive close/open pairs of ``tag`` collapsed.
    """
    if not html:
        return html

    def replace_with_space(match):
        # Only the whitespace between the two tags survives the merge.
        return match.group(1)

    # The whitespace must be a capture group: the callback reads it back,
    # and without any group there is nothing for it to retrieve.
    escaped = re.escape(tag)
    pattern = fr'</{escaped}>(\s*)<{escaped}>'

    # Repeat until stable, since removing one pair can bring another
    # closing/opening pair next to each other.
    while True:
        new_merged = re.sub(pattern, replace_with_space, html)
        if new_merged == html:
            break
        html = new_merged

    return html


class Block(BaseModel):
polygon: PolygonBox
block_type: Optional[str] = None
Expand Down Expand Up @@ -128,8 +109,6 @@ def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
template = merge_consecutive_tags(template, 'b')
template = merge_consecutive_tags(template, 'i')
return template

def render(self, document, parent_structure):
Expand Down

0 comments on commit 706bda3

Please sign in to comment.