Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/vik_v2' into dev-mose/marker-v2
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 16, 2024
2 parents f32ea04 + 7556d53 commit 70c4734
Show file tree
Hide file tree
Showing 25 changed files with 199 additions and 74 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ wandb
report.json
benchmark_data
debug_data
temp.md

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
11 changes: 5 additions & 6 deletions marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,11 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
equation_processor = EquationProcessor(self.texify_model)
equation_processor(document)

# TODO: re-enable once we add OCR method
# table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
# table_processor(document)
table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
table_processor(document)

renderer = MarkdownRenderer()
document_output = document.render()
return renderer(document_output)
return renderer(document)


if __name__ == "__main__":
Expand All @@ -57,4 +55,5 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
converter = PdfConverter()
rendered = converter(temp_pdf.name)

print(rendered)
with open("temp.md", "w+") as f:
f.write(rendered)
10 changes: 4 additions & 6 deletions marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __call__(self, document: Document):
for block in page.children:
if block.block_type != self.block_type:
continue

image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
image = page.highres_image.crop(image_poly.bbox).convert("RGB")

Expand All @@ -42,9 +43,9 @@ def __call__(self, document: Document):
text_lines = get_page_text_lines(
filepath,
[page.page_id],
page.highres_image.size,
[page.highres_image.size],
flatten_pdf=True
)
)[0]

table_data.append({
"block_id": block.id,
Expand All @@ -54,10 +55,7 @@ def __call__(self, document: Document):
"img_size": page.highres_image.size
})

lst_format = zip(*(
[t[key] for t in table_data]
for key in ["table_image", "table_bbox", "img_size", "text_lines"]
))
lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]]

cells, needs_ocr = get_cells(
*lst_format,
Expand Down
21 changes: 15 additions & 6 deletions marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,10 @@ def merge_lines(

return page_lines, page_spans

def font_flags_to_format(self, flags: int) -> Set[str]:
def font_flags_to_format(self, flags: int | None) -> Set[str]:
if flags is None:
return {"plain"}

flag_map = {
1: "FixedPitch",
2: "Serif",
Expand Down Expand Up @@ -200,8 +203,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]:
formats.add("plain")
return formats

def font_names_to_format(self, font_name: str) -> Set[str]:
def font_names_to_format(self, font_name: str | None) -> Set[str]:
formats = set()
if font_name is None:
return formats

if "bold" in font_name.lower():
formats.add("bold")
if "ital" in font_name.lower():
Expand All @@ -226,16 +232,19 @@ def pdftext_extraction(self) -> Tuple[PageLines, PageSpans]:
for line in block["lines"]:
spans: List[Span] = []
for span in line["spans"]:
if not span["text"].strip():
if not span["text"]:
continue
font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"]))
font_name = span["font"]["name"] or "Unknown"
font_weight = span["font"]["weight"] or 0
font_size = span["font"]["size"] or 0
spans.append(
Span(
polygon=PolygonBox.from_bbox(span["bbox"]),
text=span["text"],
font=span["font"]["name"],
font_weight=span["font"]["weight"],
font_size=span["font"]["size"],
font=font_name,
font_weight=font_weight,
font_size=font_size,
minimum_position=span["char_start_idx"],
maximum_position=span["char_end_idx"],
formats=list(font_formats),
Expand Down
8 changes: 1 addition & 7 deletions marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
from enum import Enum
from typing import Optional

from pydantic import BaseModel

from marker.v2.schema import BlockTypes


class RenderFormat(str, Enum):
json = "json"
markdown = "markdown"


class BaseRenderer:
block_type: BlockTypes | None = None

Expand All @@ -19,6 +13,6 @@ def __init__(self, config: Optional[BaseModel | dict] = None):
for k in config.model_fields:
setattr(self, k, config[k])

def __call__(self, document_output):
def __call__(self, document):
# Children are in reading order
raise NotImplementedError
33 changes: 33 additions & 0 deletions marker/v2/renderers/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from bs4 import BeautifulSoup
from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes


class HTMLRenderer(BaseRenderer):
    """Render a document to a single HTML string.

    Each rendered block's HTML contains ``<content-ref src='...'>``
    placeholders for its children; this renderer resolves those
    placeholders recursively into the children's HTML.
    """

    # Block types whose content is dropped from the output entirely.
    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
    # Block types that represent images (not consulted by this class itself).
    image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]

    def extract_html(self, document, document_output):
        """Return ``document_output.html`` with every content-ref expanded.

        Args:
            document: The source document; only passed through to the
                recursive calls.
            document_output: A rendered output node exposing ``html`` and
                ``children`` (each child exposing ``id``).

        Returns:
            The fully expanded HTML as a string.
        """
        soup = BeautifulSoup(document_output.html, 'html.parser')

        for ref in soup.find_all('content-ref'):
            src = ref.get('src')
            # Resolve the child afresh for every ref so that neither the
            # block type nor the content of a previous ref can leak into
            # this one.
            child = next(
                (item for item in document_output.children if item.id == src),
                None,
            )

            # A dangling ref (no matching child) used to raise on the first
            # ref or silently reuse the previous ref's content; drop it.
            if child is None or child.id.block_type in self.remove_blocks:
                ref.replace_with('')
                continue

            content = self.extract_html(document, child)
            ref.replace_with(BeautifulSoup(f"<div>{content}</div>", 'html.parser'))

        return str(soup)

    def __call__(self, document):
        """Render ``document`` and return its expanded HTML string."""
        document_output = document.render()
        return self.extract_html(document, document_output)
37 changes: 14 additions & 23 deletions marker/v2/renderers/markdown.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,17 @@
from bs4 import BeautifulSoup
from markdownify import markdownify
from marker.v2.renderers import BaseRenderer


class MarkdownRenderer(BaseRenderer):
def extract_html(self, document_output):
soup = BeautifulSoup(document_output.html, 'html.parser')

content_refs = soup.find_all('content-ref')
for ref in content_refs:
src = ref.get('src')
for item in document_output.children:
if item.id == src:
content = self.extract_html(item)
break

ref.replace_with(BeautifulSoup(content, 'html.parser'))

return str(soup)

def __call__(self, document_output):
full_html = self.extract_html(document_output)
return markdownify(full_html)
from marker.v2.renderers.html import HTMLRenderer


class MarkdownRenderer(HTMLRenderer):
def __call__(self, document):
document_output = document.render()
full_html = self.extract_html(document, document_output)
return markdownify(
full_html,
heading_style="ATX",
bullets="-",
escape_misc=False,
escape_underscores=False
)


8 changes: 4 additions & 4 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,21 +103,21 @@ def raw_text(self, document: Document) -> str:
text += "\n"
return text

def assemble_html(self, child_blocks):
def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
return template

def render(self, document):
def render(self, document, parent_structure):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
child_content.append(block.render(document))
child_content.append(block.render(document, self.structure))

return BlockOutput(
html=self.assemble_html(child_content),
html=self.assemble_html(child_content, parent_structure),
polygon=self.polygon,
id=self.id,
children=child_content
Expand Down
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@

class Caption(Block):
    """Caption block, rendered as an HTML paragraph."""

    block_type: BlockTypes = BlockTypes.Caption

    def assemble_html(self, child_blocks, parent_structure=None):
        """Wrap the children's content-ref template in ``<p>`` tags.

        ``parent_structure`` defaults to ``None`` to match the base
        ``Block.assemble_html`` signature; it is only forwarded to the base
        implementation.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        # Captions are emitted on a single line.
        template = template.replace("\n", " ")
        return f"<p>{template}</p>"
4 changes: 4 additions & 0 deletions marker/v2/schema/blocks/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@

class Code(Block):
    """Code block, rendered inside ``<pre>`` tags."""

    block_type: BlockTypes = BlockTypes.Code

    def assemble_html(self, child_blocks, parent_structure=None):
        """Wrap the children's content-ref template in ``<pre>`` tags.

        ``parent_structure`` defaults to ``None`` to match the base
        ``Block.assemble_html`` signature; it is only forwarded to the base
        implementation. Newlines are left untouched.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        return f"<pre>{template}</pre>"
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/equation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@
class Equation(Block):
    """Equation block whose HTML is built from its ``latex`` field."""

    block_type: BlockTypes = BlockTypes.Equation
    # LaTeX source for the equation; None until it has been set.
    latex: str | None = None

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the LaTeX wrapped in a ``math`` div.

        Both arguments are ignored: the output depends only on
        ``self.latex``.
        """
        math_html = f"<div class='math'>{self.latex}</div>"
        return math_html
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/figure.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@

class Figure(Block):
    """Figure block, rendered as a textual image placeholder."""

    block_type: BlockTypes = BlockTypes.Figure

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the placeholder text ``Image <block_id>``.

        Both arguments are ignored; ``parent_structure`` defaults to
        ``None`` to match the base ``Block.assemble_html`` signature.
        """
        return f"Image {self.block_id}"
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/footnote.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@

class Footnote(Block):
    """Footnote block, rendered as an HTML paragraph."""

    block_type: BlockTypes = BlockTypes.Footnote

    def assemble_html(self, child_blocks, parent_structure=None):
        """Wrap the children's content-ref template in ``<p>`` tags.

        ``parent_structure`` defaults to ``None`` to match the base
        ``Block.assemble_html`` signature; it is only forwarded to the base
        implementation.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        # Footnotes are emitted on a single line.
        template = template.replace("\n", " ")
        return f"<p>{template}</p>"
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@

class InlineMath(Block):
    """Text block containing inline math, rendered as an HTML paragraph."""

    block_type: BlockTypes = BlockTypes.TextInlineMath

    def assemble_html(self, child_blocks, parent_structure=None):
        """Wrap the children's content-ref template in ``<p>`` tags.

        ``parent_structure`` defaults to ``None`` to match the base
        ``Block.assemble_html`` signature; it is only forwarded to the base
        implementation.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        # Collapse line breaks so the paragraph is a single line.
        template = template.replace("\n", " ")
        return f"<p>{template}</p>"
15 changes: 13 additions & 2 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
import re

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


def replace_bullets(text):
    """Return ``text`` with leading bullet glyphs replaced by ``-``.

    A glyph is replaced only when it sits at the start of the string or
    directly after a newline or space, and is itself followed by a space.
    The surrounding whitespace is preserved.
    """
    matcher = re.compile(r"(^|[\n ])[•●○■▪▫–—]( )")
    return matcher.sub(r"\1-\2", text)


class ListItem(Block):
block_type: BlockTypes = BlockTypes.ListItem

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
template = replace_bullets(template)
return f"<li>{template}</li>"
3 changes: 3 additions & 0 deletions marker/v2/schema/blocks/picture.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@

class Picture(Block):
    """Picture block, rendered as a textual image placeholder."""

    block_type: BlockTypes = BlockTypes.Picture

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the placeholder text ``Image <block_id>``.

        Both arguments are ignored; ``parent_structure`` defaults to
        ``None`` to match the base ``Block.assemble_html`` signature.
        """
        return f"Image {self.block_id}"
5 changes: 5 additions & 0 deletions marker/v2/schema/blocks/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@

class SectionHeader(Block):
    """Section header block, rendered as an ``<h2>`` heading."""

    block_type: BlockTypes = BlockTypes.SectionHeader

    def assemble_html(self, child_blocks, parent_structure=None):
        """Wrap the children's content-ref template in ``<h2>`` tags.

        ``parent_structure`` defaults to ``None`` to match the base
        ``Block.assemble_html`` signature; it is only forwarded to the base
        implementation.
        """
        template = super().assemble_html(child_blocks, parent_structure)
        # Headings are emitted on a single line.
        template = template.replace("\n", " ")
        return f"<h2>{template}</h2>"
4 changes: 4 additions & 0 deletions marker/v2/schema/blocks/table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List

from tabled.formats import html_format
from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
Expand All @@ -9,3 +10,6 @@
class Table(Block):
    """Table block whose HTML is produced from its ``cells``."""

    block_type: BlockTypes = BlockTypes.Table
    # Table cells; None until they have been set.
    cells: List[SpanTableCell] | None = None

    def assemble_html(self, child_blocks, parent_structure=None):
        """Return the result of ``html_format`` applied to ``self.cells``.

        Both arguments are ignored.
        """
        table_html = html_format(self.cells)
        return table_html
5 changes: 3 additions & 2 deletions marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
class Text(Block):
block_type: BlockTypes = BlockTypes.Text

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
10 changes: 5 additions & 5 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ class Document(BaseModel):
block_type: BlockTypes = BlockTypes.Document

def get_block(self, block_id: BlockId):
for page in self.pages:
block = page.get_block(block_id)
if block:
return block
page = [p for p in self.pages if p.page_id == block_id.page_id][0]
block = page.get_block(block_id)
if block:
return block
return None

def assemble_html(self, child_blocks):
Expand All @@ -36,7 +36,7 @@ def assemble_html(self, child_blocks):
def render(self):
child_content = []
for page in self.pages:
child_content.append(page.render(self))
child_content.append(page.render(self, None))

return DocumentOutput(
children=child_content,
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,6 @@
class ListGroup(Block):
block_type: BlockTypes = BlockTypes.ListGroup

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<ul>{template}</ul>"
2 changes: 1 addition & 1 deletion marker/v2/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ def add_full_block(self, block: Block) -> Block:

def get_block(self, block_id: BlockId) -> Block | None:
for block in self.children:
if block.id == block_id:
if block.block_id == block_id.block_id:
return block
1 change: 0 additions & 1 deletion marker/v2/schema/groups/table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block


class TableGroup(Block):
block_type: BlockTypes = BlockTypes.TableGroup
Loading

0 comments on commit 70c4734

Please sign in to comment.