Merge consecutive output tags

VikParuchuri · Nov 18, 2024 · 706bda3 · 706bda3
1 parent be91572
commit 706bda3
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 26 deletions.
diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
@@ -65,7 +65,7 @@ def main(output: str, fname: str):
             f.write(rendered.markdown)
 
         for img_name, img in rendered.images.items():
-            img.save(os.path.join(output, img_name))
+            img.save(os.path.join(output, img_name), "PNG")
 
 
 if __name__ == "__main__":

diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py
@@ -1,3 +1,5 @@
+import re
+
 from bs4 import BeautifulSoup
 from pydantic import BaseModel
 
@@ -11,6 +13,24 @@ class HTMLOutput(BaseModel):
     images: dict
 
 
+def merge_consecutive_tags(html, tag):
+    """Collapse back-to-back ``<tag>`` elements into a single element.
+
+    Every ``</tag>`` that is followed, after optional whitespace, by a bare
+    ``<tag>`` is replaced with just that whitespace, so
+    ``<b>a</b> <b>b</b>`` becomes ``<b>a b</b>``.
+
+    Args:
+        html: Markup to process. Falsy values (``None``, ``""``) are returned
+            unchanged.
+        tag: Tag name to merge, e.g. ``"b"``. It is matched literally and
+            only against opening tags that carry no attributes.
+
+    Returns:
+        The markup with consecutive ``tag`` elements merged.
+    """
+    if not html:
+        return html
+
+    # Escape the name so regex metacharacters in it cannot widen the match,
+    # and compile once instead of rebuilding the pattern on every pass.
+    name = re.escape(tag)
+    pattern = re.compile(fr'</{name}>(\s*)<{name}>')
+
+    # A single pass can expose a new close/open pair (for example
+    # "</b></b><b><b>"), so repeat until the text stops changing.
+    while True:
+        new_merged = pattern.sub(r'\1', html)
+        if new_merged == html:
+            break
+        html = new_merged
+
+    return html
+
+
 class HTMLRenderer(BaseRenderer):
     remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
     image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
@@ -23,7 +43,7 @@ def extract_image(self, document, image_id):
         cropped = page_img.crop(image_box.bbox)
         return cropped
 
-    def extract_html(self, document, document_output):
+    def extract_html(self, document, document_output, level=0):
         soup = BeautifulSoup(document_output.html, 'html.parser')
 
         content_refs = soup.find_all('content-ref')
@@ -34,7 +54,7 @@ def extract_html(self, document, document_output):
             sub_images = {}
             for item in document_output.children:
                 if item.id == src:
-                    content, sub_images = self.extract_html(document, item)
+                    content, sub_images = self.extract_html(document, item, level + 1)
                     ref_block_id: BlockId = item.id
                     break
 
@@ -47,9 +67,14 @@ def extract_html(self, document, document_output):
                 ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
             else:
                 images.update(sub_images)
-                ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
+                ref.replace_with(BeautifulSoup(f"{content}", 'html.parser'))
+
+        output = str(soup)
+        if level == 0:
+            output = merge_consecutive_tags(output, 'b')
+            output = merge_consecutive_tags(output, 'i')
 
-        return str(soup), images
+        return output, images
 
     def __call__(self, document) -> HTMLOutput:
         document_output = document.render()

diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
@@ -49,25 +49,6 @@ def to_path(self):
         return str(self).replace('/', '_')
 
 
-def merge_consecutive_tags(html, tag):
-    """Merge adjacent ``<tag>`` elements by dropping the close/open pair.
-
-    A ``</tag>`` followed, after optional whitespace, by a bare ``<tag>`` is
-    replaced with only that whitespace.
-
-    Args:
-        html: Markup to process; falsy values are returned unchanged.
-        tag: Tag name to merge, matched literally.
-
-    Returns:
-        The markup with consecutive ``tag`` elements merged.
-    """
-    if not html:
-        return html
-
-    def replace_with_space(match):
-        # Keep only the whitespace that separated the two tags.
-        return match.group(1)
-
-    # The whitespace run has to be a capture group: without one,
-    # match.groups() is empty and unpacking it raises ValueError.
-    escaped = re.escape(tag)
-    pattern = fr'</{escaped}>(\s*)<{escaped}>'
-
-    # Repeat until stable, since one pass can expose a new adjacent pair.
-    while True:
-        new_merged = re.sub(pattern, replace_with_space, html)
-        if new_merged == html:
-            break
-        html = new_merged
-
-    return html
-
-
 class Block(BaseModel):
     polygon: PolygonBox
     block_type: Optional[str] = None
@@ -128,8 +109,6 @@ def assemble_html(self, child_blocks, parent_structure=None):
         template = ""
         for c in child_blocks:
             template += f"<content-ref src='{c.id}'></content-ref>"
-        template = merge_consecutive_tags(template, 'b')
-        template = merge_consecutive_tags(template, 'i')
         return template
 
     def render(self, document, parent_structure):