Clean up renderers, fix output

VikParuchuri · Nov 15, 2024 · 81092a6 · 81092a6
1 parent a748e23
commit 81092a6
Show file tree

Hide file tree

Showing 18 changed files with 101 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,7 @@ wandb
 report.json
 benchmark_data
 debug_data
+temp.md
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
@@ -37,8 +37,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
         equation_processor(document)
 
         # TODO: re-enable once we add OCR method
-        #table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
-        #table_processor(document)
+        table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
+        table_processor(document)
 
         renderer = MarkdownRenderer()
         document_output = document.render()

diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py
@@ -33,6 +33,7 @@ def __call__(self, document: Document):
             for block in page.children:
                 if block.block_type != self.block_type:
                     continue
+
                 image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
                 image = page.highres_image.crop(image_poly.bbox).convert("RGB")
 
@@ -42,9 +43,9 @@ def __call__(self, document: Document):
                     text_lines = get_page_text_lines(
                         filepath,
                         [page.page_id],
-                        page.highres_image.size,
+                        [page.highres_image.size],
                         flatten_pdf=True
-                    )
+                    )[0]
 
                 table_data.append({
                     "block_id": block.id,
@@ -54,10 +55,7 @@ def __call__(self, document: Document):
                     "img_size": page.highres_image.size
                 })
 
-        lst_format = zip(*(
-            [t[key] for t in table_data]
-            for key in ["table_image", "table_bbox", "img_size", "text_lines"]
-        ))
+        lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]]
 
         cells, needs_ocr = get_cells(
             *lst_format,

diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py
@@ -0,0 +1,30 @@
+from bs4 import BeautifulSoup
+from marker.v2.renderers import BaseRenderer
+from marker.v2.schema import BlockTypes
+
+
+class HTMLRenderer(BaseRenderer):
+    """Render block output as one HTML string.
+
+    Each <content-ref> is replaced by the rendered HTML of the child whose id
+    equals its src; blocks whose type is in remove_blocks are dropped.
+    """
+    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
+
+    def extract_html(self, document_output):
+        """Return document_output.html with all of its content-refs resolved recursively."""
+        soup = BeautifulSoup(document_output.html, 'html.parser')
+        for ref in soup.find_all('content-ref'):
+            src = ref.get('src')
+            # Look up per ref so an unmatched ref cannot reuse an earlier ref's content or type.
+            child = next((c for c in document_output.children if c.id == src), None)
+            if child is None or child.id.block_type in self.remove_blocks:
+                ref.replace_with('')  # unresolved or filtered out: emit nothing
+                continue
+            content = self.extract_html(child)
+            ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))
+        return str(soup)
+
+    def __call__(self, document_output):
+        """Render document_output to a complete HTML string."""
+        return self.extract_html(document_output)
diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py
@@ -1,26 +1,15 @@
-from bs4 import BeautifulSoup
 from markdownify import markdownify
-from marker.v2.renderers import BaseRenderer
-
-
-class MarkdownRenderer(BaseRenderer):
-    def extract_html(self, document_output):
-        soup = BeautifulSoup(document_output.html, 'html.parser')
-
-        content_refs = soup.find_all('content-ref')
-        for ref in content_refs:
-            src = ref.get('src')
-            for item in document_output.children:
-                if item.id == src:
-                    content = self.extract_html(item)
-                    break
-
-            ref.replace_with(BeautifulSoup(content, 'html.parser'))
-
-        return str(soup)
+from marker.v2.renderers.html import HTMLRenderer
 
+class MarkdownRenderer(HTMLRenderer):
     def __call__(self, document_output):
         full_html = self.extract_html(document_output)
-        return markdownify(full_html)
+        # "#"-style (ATX) headings and "-" bullets; underscores and other
+        # punctuation in the text are passed through without backslash escapes.
+        markdown_options = {
+            "heading_style": "ATX", "bullets": "-",
+            "escape_misc": False, "escape_underscores": False,
+        }
+        return markdownify(full_html, **markdown_options)
 
 
diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
@@ -52,6 +52,7 @@ class Block(BaseModel):
     page_id: Optional[int] = None
     structure: List[BlockId] | None = None  # The top-level page structure, which is the block ids in order
     rendered: Any | None = None # The rendered output of the block
+    text_extraction_method: str = "pdftext"
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 

diff --git a/marker/v2/schema/blocks/caption.py b/marker/v2/schema/blocks/caption.py
@@ -3,3 +3,8 @@
 
 class Caption(Block):
     block_type: str = "Caption"
+
+    def assemble_html(self, child_blocks, parent_structure):
+        """Return the base class's assembled HTML as one <p>, with newlines turned into spaces."""
+        flattened = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")
+        return f"<p>{flattened}</p>"
diff --git a/marker/v2/schema/blocks/code.py b/marker/v2/schema/blocks/code.py
@@ -3,3 +3,7 @@
 
 class Code(Block):
     block_type: str = "Code"
+
+    def assemble_html(self, child_blocks, parent_structure):
+        # Newlines are left untouched here; the assembled HTML is only wrapped in <pre>.
+        return f"<pre>{super().assemble_html(child_blocks, parent_structure)}</pre>"
diff --git a/marker/v2/schema/blocks/equation.py b/marker/v2/schema/blocks/equation.py
@@ -4,3 +4,6 @@
 class Equation(Block):
     block_type: str = "Equation"
     latex: str | None = None
+
+    def assemble_html(self, child_blocks, parent_structure=None):  # renders self.latex only; both arguments are ignored
+        return f"<div class='math'>{self.latex or ''}</div>"  # latex defaults to None: emit an empty div, not the text "None"
diff --git a/marker/v2/schema/blocks/figure.py b/marker/v2/schema/blocks/figure.py
@@ -3,3 +3,6 @@
 
 class Figure(Block):
     block_type: str = "Figure"
+
+    def assemble_html(self, child_blocks, parent_structure):  # both arguments are ignored
+        return f"Image {self.block_id}"  # plain-text placeholder naming the block; no tag or image data is emitted
diff --git a/marker/v2/schema/blocks/footnote.py b/marker/v2/schema/blocks/footnote.py
@@ -3,3 +3,8 @@
 
 class Footnote(Block):
     block_type: str = "Footnote"
+
+    def assemble_html(self, child_blocks, parent_structure):
+        """Render the footnote as a single <p>, replacing every newline with a space."""
+        one_line = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")
+        return f"<p>{one_line}</p>"
diff --git a/marker/v2/schema/blocks/inlinemath.py b/marker/v2/schema/blocks/inlinemath.py
@@ -3,3 +3,9 @@
 
 class InlineMath(Block):
     block_type: str = "TextInlineMath"
+
+    def assemble_html(self, child_blocks, parent_structure):
+        """Wrap the base class's assembled HTML in a <p> after swapping newlines for spaces."""
+        joined = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")
+        return f"<p>{joined}</p>"
+
diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py
@@ -1,9 +1,19 @@
+import re
+
 from marker.v2.schema.blocks import Block
 
 
+def replace_bullets(text):
+    """Return text with bullet glyphs (at the start, or after a newline or space) replaced by "-"."""
+    # The glyph must be followed by a space; groups 1 and 2 put the surrounding whitespace back.
+    bullet_re = re.compile(r"(^|[\n ])[•●○■▪▫–—]( )")
+    return bullet_re.sub(r"\1-\2", text)
+
 class ListItem(Block):
     block_type: str = "ListItem"
 
     def assemble_html(self, child_blocks, parent_structure):
         template = super().assemble_html(child_blocks, parent_structure)
+        # Flatten line breaks to spaces first, then turn bullet glyphs into "-".
+        template = replace_bullets(template.replace("\n", " "))
         return f"<li>{template}</li>"
diff --git a/marker/v2/schema/blocks/picture.py b/marker/v2/schema/blocks/picture.py
@@ -3,3 +3,6 @@
 
 class Picture(Block):
     block_type: str = "Picture"
+
+    def assemble_html(self, child_blocks, parent_structure):  # both arguments are ignored
+        return f"Image {self.block_id}"  # plain-text placeholder naming the block; no tag or image data is emitted
diff --git a/marker/v2/schema/blocks/sectionheader.py b/marker/v2/schema/blocks/sectionheader.py
@@ -3,3 +3,8 @@
 
 class SectionHeader(Block):
     block_type: str = "SectionHeader"
+
+    def assemble_html(self, child_blocks, parent_structure):
+        """Render the header on one line inside <h2>; the heading level is fixed at 2 here."""
+        heading_text = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")
+        return f"<h2>{heading_text}</h2>"
diff --git a/marker/v2/schema/blocks/table.py b/marker/v2/schema/blocks/table.py
@@ -1,10 +1,14 @@
 from typing import List
 
+from tabled.formats import html_format
 from tabled.schema import SpanTableCell
 
 from marker.v2.schema.blocks import Block
 
 
 class Table(Block):
     block_type: str = "Table"
-    cells: List[SpanTableCell] | None = None
+    cells: List[SpanTableCell] | None = None
+
+    def assemble_html(self, child_blocks, parent_structure=None):  # both arguments are ignored
+        return html_format(self.cells)  # NOTE(review): cells defaults to None — confirm html_format accepts that
diff --git a/marker/v2/schema/groups/table.py b/marker/v2/schema/groups/table.py
@@ -1,5 +1,4 @@
 from marker.v2.schema.blocks import Block
 
-
 class TableGroup(Block):
     block_type: str = "TableGroup"
diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py
@@ -26,11 +26,16 @@ def assemble_html(self, child_blocks, parent_structure):
         text = self.text
         text = text.replace("-\n", "")  # Remove hyphenated line breaks
 
+        # Remove trailing newlines
         replaced_newline = False
         while len(text) > 0 and text[-1] in ["\n", "\r"]:
             text = text[:-1]
             replaced_newline = True
 
+        # Remove leading newlines
+        while len(text) > 0 and text[0] in ["\n", "\r"]:
+            text = text[1:]
+
         if replaced_newline:
             text += " "