Skip to content

Commit

Permalink
Clean up renderers, fix output
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 15, 2024
1 parent a748e23 commit 81092a6
Show file tree
Hide file tree
Showing 18 changed files with 101 additions and 30 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ wandb
report.json
benchmark_data
debug_data
temp.md

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
equation_processor(document)

# TODO: re-enable once we add OCR method
#table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
#table_processor(document)
table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
table_processor(document)

renderer = MarkdownRenderer()
document_output = document.render()
Expand Down
10 changes: 4 additions & 6 deletions marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __call__(self, document: Document):
for block in page.children:
if block.block_type != self.block_type:
continue

image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
image = page.highres_image.crop(image_poly.bbox).convert("RGB")

Expand All @@ -42,9 +43,9 @@ def __call__(self, document: Document):
text_lines = get_page_text_lines(
filepath,
[page.page_id],
page.highres_image.size,
[page.highres_image.size],
flatten_pdf=True
)
)[0]

table_data.append({
"block_id": block.id,
Expand All @@ -54,10 +55,7 @@ def __call__(self, document: Document):
"img_size": page.highres_image.size
})

lst_format = zip(*(
[t[key] for t in table_data]
for key in ["table_image", "table_bbox", "img_size", "text_lines"]
))
lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]]

cells, needs_ocr = get_cells(
*lst_format,
Expand Down
30 changes: 30 additions & 0 deletions marker/v2/renderers/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup
from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes


class HTMLRenderer(BaseRenderer):
    # Renders a document output tree into one HTML string by recursively
    # inlining every <content-ref> placeholder with the HTML of the child
    # it points at.

    # Block types whose references are dropped from the output entirely.
    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]

    def extract_html(self, document_output):
        """Return ``document_output.html`` with its content references expanded.

        Each ``<content-ref src=...>`` tag is matched against
        ``document_output.children`` by comparing ``child.id`` to the tag's
        ``src`` attribute.

        * A matching child whose ``id.block_type`` is listed in
          ``remove_blocks`` has its reference removed.
        * Any other matching child is rendered recursively and the result is
          inserted wrapped in a ``<div>``.
        * A reference with no matching child is left untouched.

        Args:
            document_output: Object exposing ``html`` (markup string) and
                ``children`` (iterable of objects of the same shape, each
                carrying an ``id``).

        Returns:
            The expanded markup as a string.
        """
        soup = BeautifulSoup(document_output.html, 'html.parser')

        for ref in soup.find_all('content-ref'):
            src = ref.get('src')

            # Resolve every reference independently so that a miss can never
            # pick up the content or block type of a previous reference.
            target = next(
                (item for item in (document_output.children or ()) if item.id == src),
                None,
            )
            if target is None:
                # Nothing to inline; keep the placeholder rather than guess.
                continue

            if target.id.block_type in self.remove_blocks:
                # Dropped blocks do not need to be rendered at all.
                ref.replace_with('')
                continue

            content = self.extract_html(target)
            ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))

        return str(soup)

    def __call__(self, document_output):
        """Render ``document_output`` to a complete HTML string."""
        return self.extract_html(document_output)
29 changes: 9 additions & 20 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,15 @@
from bs4 import BeautifulSoup
from markdownify import markdownify
from marker.v2.renderers import BaseRenderer


class MarkdownRenderer(BaseRenderer):
def extract_html(self, document_output):
soup = BeautifulSoup(document_output.html, 'html.parser')

content_refs = soup.find_all('content-ref')
for ref in content_refs:
src = ref.get('src')
for item in document_output.children:
if item.id == src:
content = self.extract_html(item)
break

ref.replace_with(BeautifulSoup(content, 'html.parser'))

return str(soup)
from marker.v2.renderers.html import HTMLRenderer

class MarkdownRenderer(HTMLRenderer):
    # Converts the HTML produced by the inherited extract_html into Markdown.

    def __call__(self, document_output):
        """Render ``document_output`` to a Markdown string.

        The document is first expanded to HTML via ``extract_html`` and then
        passed to ``markdownify`` with ATX (``#``) headings, ``-`` bullets,
        and escaping of underscores and miscellaneous characters disabled.
        """
        full_html = self.extract_html(document_output)
        # A single return: an earlier bare ``markdownify(full_html)`` return
        # would make the configured call below unreachable.
        return markdownify(
            full_html,
            heading_style="ATX",
            bullets="-",
            escape_misc=False,
            escape_underscores=False,
        )


1 change: 1 addition & 0 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class Block(BaseModel):
page_id: Optional[int] = None
structure: List[BlockId] | None = None # The top-level page structure, which is the block ids in order
rendered: Any | None = None # The rendered output of the block
text_extraction_method: str = "pdftext"

model_config = ConfigDict(arbitrary_types_allowed=True)

Expand Down
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@

class Caption(Block):
    # Caption block; rendered as a single-line paragraph.
    block_type: str = "Caption"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML on one line, wrapped in ``<p>``.

        Newlines in the markup produced by the parent implementation are
        replaced with spaces before wrapping.
        """
        single_line = super().assemble_html(child_blocks, parent_structure).replace("\n", " ")
        return f"<p>{single_line}</p>"
4 changes: 4 additions & 0 deletions marker/v2/schema/blocks/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@

class Code(Block):
    # Code block; rendered inside <pre> with its newlines kept as-is.
    block_type: str = "Code"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML wrapped in a ``<pre>`` element."""
        body = super().assemble_html(child_blocks, parent_structure)
        return f"<pre>{body}</pre>"
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
class Equation(Block):
    # Equation block; rendered from its LaTeX source when available.
    block_type: str = "Equation"
    latex: str | None = None  # LaTeX source; None until it has been set

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the HTML for this equation.

        When ``latex`` is set, it is HTML-escaped and wrapped in
        ``<div class='math'>``. When ``latex`` is ``None`` the base-class
        assembly is used instead, so the literal text "None" is never
        emitted.
        """
        if self.latex is None:
            return super().assemble_html(child_blocks, parent_structure)

        # Imported locally so this block does not depend on a file-level import.
        import html

        # LaTeX routinely contains ``<`` and ``&``; escape them so the markup
        # survives being parsed as HTML again. Quotes are left untouched.
        return f"<div class='math'>{html.escape(self.latex, quote=False)}</div>"
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@

class Figure(Block):
    # Figure block; rendered as a text placeholder rather than markup.
    block_type: str = "Figure"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the placeholder text ``Image <block_id>``.

        ``child_blocks`` and ``parent_structure`` are accepted for interface
        compatibility and are not used.
        """
        placeholder = "Image {}".format(self.block_id)
        return placeholder
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/footnote.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@

class Footnote(Block):
    # Footnote block; rendered as a single-line paragraph.
    block_type: str = "Footnote"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML on one line, wrapped in ``<p>``."""
        assembled = super().assemble_html(child_blocks, parent_structure)
        # Splitting on "\n" and re-joining with " " replaces every newline
        # with exactly one space.
        flattened = " ".join(assembled.split("\n"))
        return f"<p>{flattened}</p>"
6 changes: 6 additions & 0 deletions marker/v2/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@

class InlineMath(Block):
    # Text block with inline math; rendered as a single-line paragraph.
    block_type: str = "TextInlineMath"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML on one line, wrapped in ``<p>``."""
        raw = super().assemble_html(child_blocks, parent_structure)
        one_line = raw.replace("\n", " ")
        return "<p>{}</p>".format(one_line)

10 changes: 10 additions & 0 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
import re

from marker.v2.schema.blocks import Block


def replace_bullets(text):
    """Replace bullet glyphs in ``text`` with a plain ``-``.

    A glyph (one of ``•●○■▪▫–—``) is replaced only when it sits at the very
    start of the string or directly after a newline or space, and is
    followed by a space. The surrounding characters are kept.
    """
    bullet_re = re.compile(r"(^|[\n ])[•●○■▪▫–—]( )")
    return bullet_re.sub(r"\1-\2", text)

class ListItem(Block):
    # List entry; rendered as a single-line <li> element.
    block_type: str = "ListItem"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML as one ``<li>`` element.

        Newlines are replaced with spaces first, then bullet glyphs are
        normalised with ``replace_bullets``.
        """
        assembled = super().assemble_html(child_blocks, parent_structure)
        item_text = replace_bullets(assembled.replace("\n", " "))
        return f"<li>{item_text}</li>"
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/picture.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@

class Picture(Block):
    # Picture block; rendered as a text placeholder rather than markup.
    block_type: str = "Picture"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the placeholder text ``Image <block_id>``.

        ``child_blocks`` and ``parent_structure`` are accepted for interface
        compatibility and are not used.
        """
        return "Image {}".format(self.block_id)
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@

class SectionHeader(Block):
    # Section heading; always rendered as a single-line <h2>.
    block_type: str = "SectionHeader"

    def assemble_html(self, child_blocks, parent_structure):
        """Return the base-assembled HTML on one line, wrapped in ``<h2>``."""
        heading = super().assemble_html(child_blocks, parent_structure)
        heading = " ".join(heading.split("\n"))
        return f"<h2>{heading}</h2>"
6 changes: 5 additions & 1 deletion marker/v2/schema/blocks/table.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema.blocks import Block


class Table(Block):
    # Table block; rendered from its cell list when one is present.
    block_type: str = "Table"
    cells: List[SpanTableCell] | None = None  # None until cells have been set

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the HTML for this table.

        When ``cells`` is set, the markup comes from ``html_format``. When
        ``cells`` is ``None`` the base-class assembly is used instead, so
        ``html_format`` is never called with ``None``.
        """
        if self.cells is None:
            return super().assemble_html(child_blocks, parent_structure)
        return html_format(self.cells)
1 change: 0 additions & 1 deletion marker/v2/schema/groups/table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from marker.v2.schema.blocks import Block


class TableGroup(Block):
    # Declares no fields or behaviour beyond its block type tag.
    block_type: str = "TableGroup"
5 changes: 5 additions & 0 deletions marker/v2/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ def assemble_html(self, child_blocks, parent_structure):
text = self.text
text = text.replace("-\n", "") # Remove hyphenated line breaks

# Remove trailing newlines
replaced_newline = False
while len(text) > 0 and text[-1] in ["\n", "\r"]:
text = text[:-1]
replaced_newline = True

# Remove leading newlines
while len(text) > 0 and text[0] in ["\n", "\r"]:
text = text[1:]

if replaced_newline:
text += " "

Expand Down

0 comments on commit 81092a6

Please sign in to comment.