Skip to content

Commit

Permalink
Add simple line and span renderer, add blocktype class
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 15, 2024
1 parent c9f478a commit 76ea3e5
Show file tree
Hide file tree
Showing 20 changed files with 129 additions and 23 deletions.
13 changes: 7 additions & 6 deletions marker/v2/builders/structure.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import List, Optional
from typing import Optional

from pydantic import BaseModel

from marker.v2.builders import BaseBuilder
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document
from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY, ListGroup
from marker.v2.schema.groups.page import PageGroup
Expand All @@ -22,15 +23,15 @@ def __call__(self, document: Document):
def group_caption_blocks(self, page: PageGroup):
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in ["Table", "Figure", "Picture"]:
if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
continue

block_structure = [block_id]
selected_polygons = [block.polygon]
for j, prev_block_id in enumerate(page.structure[:i][::-1]):
prev_block = page.get_block(prev_block_id)
if all([
prev_block.block_type in ["Caption", "Footnote"],
prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
]):
block_structure.insert(0, prev_block_id)
Expand All @@ -41,7 +42,7 @@ def group_caption_blocks(self, page: PageGroup):
for j, next_block_id in enumerate(page.structure[i + 1:]):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type in ["Caption", "Footnote"],
next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote],
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
]):
block_structure.append(next_block_id)
Expand All @@ -63,15 +64,15 @@ def group_caption_blocks(self, page: PageGroup):
def group_lists(self, page: PageGroup):
for i, block_id in enumerate(page.structure):
block = page.get_block(block_id)
if block.block_type not in ["ListItem"]:
if block.block_type not in [BlockTypes.ListItem]:
continue
block_structure = [block_id]
selected_polygons = [block.polygon]

for j, next_block_id in enumerate(page.structure[i + 1:]):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type == "ListItem",
next_block.block_type == BlockTypes.ListItem,
next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold
]):
block_structure.append(next_block_id)
Expand Down
5 changes: 4 additions & 1 deletion marker/v2/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from marker.v2.providers.pdf import PdfProvider
from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \
setup_detection_model
from marker.v2.renderers.line import LineRenderer
from marker.v2.renderers.span import SpanRenderer


class PdfConverter(BaseConverter):
Expand All @@ -39,7 +41,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
#table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
#table_processor(document)

rendered = document.render()
renderer_lst = [SpanRenderer(), LineRenderer()]
rendered = document.render(renderer_lst)
return rendered


Expand Down
3 changes: 2 additions & 1 deletion marker/v2/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@

from marker.settings import settings
from marker.v2.processors import BaseProcessor
from marker.v2.schema import BlockTypes
from marker.v2.schema.document import Document


class TableProcessor(BaseProcessor):
block_type = "Table"
block_type = BlockTypes.Table
detect_boxes = False
detector_batch_size = None
table_rec_batch_size = None
Expand Down
17 changes: 13 additions & 4 deletions marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import functools
from typing import Dict, List, Tuple
from typing import Dict, List, Tuple, Set
from typing import Dict, List, Optional

import pypdfium2 as pdfium
Expand Down Expand Up @@ -34,7 +34,7 @@ def __len__(self) -> int:
def __del__(self):
self.doc.close()

def font_flags_to_format(self, flags: int) -> List[str]:
def font_flags_to_format(self, flags: int) -> Set[str]:
flag_map = {
1: "FixedPitch",
2: "Serif",
Expand Down Expand Up @@ -69,7 +69,15 @@ def font_flags_to_format(self, flags: int) -> List[str]:
formats.add("bold")
if set_flags & {"FixedPitch", "Serif", "Script", "Nonsymbolic", "AllCap", "SmallCap", "UseExternAttr"}:
formats.add("plain")
return list(formats)
return formats

def font_names_to_format(self, font_name: str) -> Set[str]:
    """Infer text formats from substrings of a font's name.

    The match is case-insensitive: a name containing "bold" yields
    ``"bold"`` and a name containing "ital" yields ``"italic"``.

    Args:
        font_name: The font name to inspect.

    Returns:
        The set of matched format names; empty when neither substring
        is present.
    """
    lowered_name = font_name.lower()
    # (substring looked for in the font name, format it implies)
    name_markers = (("bold", "bold"), ("ital", "italic"))
    return {fmt for marker, fmt in name_markers if marker in lowered_name}

def setup(self):
self.doc = pdfium.PdfDocument(self.filepath)
Expand All @@ -90,6 +98,7 @@ def setup(self):
for span in line["spans"]:
if not span["text"].strip():
continue
font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"]))
spans.append(
Span(
polygon=PolygonBox.from_bbox(span["bbox"]),
Expand All @@ -99,7 +108,7 @@ def setup(self):
font_size=span["font"]["size"],
minimum_position=span["char_start_idx"],
maximum_position=span["char_end_idx"],
formats=self.font_flags_to_format(span["font"]["flags"]),
formats=list(font_formats),
page_id=page_id,
)
)
Expand Down
6 changes: 6 additions & 0 deletions marker/v2/renderers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from enum import Enum
from typing import Optional

from pydantic import BaseModel


class RenderFormat(str, Enum):
    """Names of the available render output formats.

    Members are also ``str`` instances, so each compares equal to its
    plain string value.
    """

    json = "json"
    markdown = "markdown"


class BaseRenderer:
block_type: str | None = None

Expand Down
41 changes: 41 additions & 0 deletions marker/v2/renderers/line.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import re
from typing import List, Optional

from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes
from marker.v2.schema.text import Span


def surround_text(s, char_to_insert):
    """Wrap the non-whitespace core of ``s`` in ``char_to_insert``.

    Leading and trailing whitespace stay outside the inserted markers, so
    ``surround_text("  word ", "**")`` gives ``"  **word** "``.

    Args:
        s: The text to wrap.
        char_to_insert: Marker placed directly before and after the
            stripped text (e.g. ``"*"`` or ``"**"``).

    Returns:
        ``s`` with the markers inserted around its stripped content. An
        empty or whitespace-only ``s`` is returned unchanged, since there
        is nothing to wrap.
    """
    stripped_string = s.strip()
    if not stripped_string:
        # Without this guard the whitespace would be counted as both the
        # leading and the trailing run and emitted twice around empty markers.
        return s

    # Slice the surrounding whitespace with the same strip semantics used for
    # the core, so leading + core + trailing always reassembles ``s`` exactly.
    leading_whitespace = s[:len(s) - len(s.lstrip())]
    trailing_whitespace = s[len(s.rstrip()):]
    modified_string = char_to_insert + stripped_string + char_to_insert
    return leading_whitespace + modified_string + trailing_whitespace


class LineRenderer(BaseRenderer):
    """Render a line by concatenating the rendered text of its spans.

    A span that is neither first nor last and whose rendered text is longer
    than three characters is wrapped in ``*`` (italic) or ``**`` (bold) when
    the span looked at after it does not carry the same flag.
    """

    block_type = BlockTypes.Line

    def __call__(self, document, block, children: Optional[List[Span]] = None):
        """Return the text for one line.

        Args:
            document: Part of the renderer call signature; not used here.
            block: The block being rendered; not used here.
            children: The line's spans, in order. Each is read for its
                ``rendered``, ``text``, ``italic`` and ``bold`` attributes.
                ``None`` is treated like an empty list.

        Returns:
            The concatenated span text; ``""`` when there are no children.
        """
        if not children:
            # The parameter defaults to None; iterating it would raise TypeError.
            return ""

        last_idx = len(children) - 1
        parts = []
        for i, child in enumerate(children):
            # Look ahead for the first following span with non-blank text.
            # If every following span is blank this ends on the last one
            # examined; it stays None only when nothing follows at all.
            next_span = None
            for next_idx in range(i + 1, len(children)):
                next_span = children[next_idx]
                if next_span.text.strip():
                    break

            span_text = child.rendered

            # Don't bold or italicize very short sequences
            # Avoid bolding first and last sequence so lines can be joined properly
            if len(span_text) > 3 and 0 < i < last_idx:
                # Italic takes precedence: a span flagged both ways gets "*" only.
                if child.italic and (not next_span or not next_span.italic):
                    span_text = surround_text(span_text, "*")
                elif child.bold and (not next_span or not next_span.bold):
                    span_text = surround_text(span_text, "**")
            parts.append(span_text)
        return "".join(parts)
9 changes: 9 additions & 0 deletions marker/v2/renderers/span.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes


class SpanRenderer(BaseRenderer):
    """Renderer for ``Span`` blocks: the output is the block's own text."""

    block_type = BlockTypes.Span

    def __call__(self, document, block, children=None):
        """Return ``block.text``; ``document`` and ``children`` are not used."""
        span_text = block.text
        return span_text
2 changes: 1 addition & 1 deletion marker/v2/renderers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ def renderer_for_block(block, renderer_list: list):
if renderer.block_type == block.block_type:
return renderer

return DefaultRenderer
return DefaultRenderer()
17 changes: 17 additions & 0 deletions marker/v2/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,18 @@
from marker.v2.schema.blocks import Block, LAYOUT_BLOCK_REGISTRY
from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY
from marker.v2.schema.text import TEXT_BLOCK_REGISTRY


class _BlockTypes:
def __init__(self):
pass

def add(self, registry: dict[str, Block]):
for k, v in registry.items():
setattr(self, k, k)


BlockTypes = _BlockTypes()
BlockTypes.add(GROUP_BLOCK_REGISTRY)
BlockTypes.add(TEXT_BLOCK_REGISTRY)
BlockTypes.add(LAYOUT_BLOCK_REGISTRY)
3 changes: 1 addition & 2 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,5 @@ def render(self, document, renderer_list: list):
block.render(document, renderer_list)
child_blocks.append(block)

renderer_cls = renderer_for_block(self, renderer_list)
renderer = renderer_cls()
renderer = renderer_for_block(self, renderer_list)
self.rendered = renderer(document, self, child_blocks)
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class InlineMath(Block):
block_type: str = "Text-inline-math"
block_type: str = "TextInlineMath"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class ListItem(Block):
block_type: str = "List-item"
block_type: str = "ListItem"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/pagefooter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class PageFooter(Block):
block_type: str = "Page-footer"
block_type: str = "PageFooter"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/pageheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class PageHeader(Block):
block_type: str = "Page-header"
block_type: str = "PageHeader"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class SectionHeader(Block):
block_type: str = "Section-header"
block_type: str = "SectionHeader"
2 changes: 1 addition & 1 deletion marker/v2/schema/blocks/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


class TableOfContents(Block):
block_type: str = "Table-of-contents"
block_type: str = "TableOfContents"
4 changes: 2 additions & 2 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
class Document(BaseModel):
filepath: str
pages: List[PageGroup]
block_type: str = "Document"

def get_block(self, block_id: BlockId):
for page in self.pages:
Expand All @@ -27,6 +28,5 @@ def render(self, renderer_lst: list | None = None):
for page in self.pages:
page.render(self, renderer_lst)

doc_renderer_cls = renderer_for_block(self, renderer_lst)
doc_renderer = doc_renderer_cls()
doc_renderer = renderer_for_block(self, renderer_lst)
return doc_renderer(self, self, self.pages)
9 changes: 9 additions & 0 deletions marker/v2/schema/text/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from marker.v2.schema.text.line import Line
from marker.v2.schema.text.span import Span


TEXT_BLOCK_REGISTRY = {
"Line": Line,
"Span": Span,
}

3 changes: 3 additions & 0 deletions marker/v2/schema/text/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@

# Block subclass whose ``block_type`` defaults to "Line".
class Line(Block):
    block_type: str = "Line"

    def is_continuation(self, other):
        """Placeholder with no logic yet; always returns ``None``."""
        return None
8 changes: 8 additions & 0 deletions marker/v2/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,11 @@ class Span(Block):
minimum_position: int
maximum_position: int
formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']]

@property
def bold(self):
    """Whether ``'bold'`` is among this span's ``formats``."""
    span_formats = self.formats
    return 'bold' in span_formats

@property
def italic(self):
    """Whether ``'italic'`` is among this span's ``formats``."""
    span_formats = self.formats
    return 'italic' in span_formats

0 comments on commit 76ea3e5

Please sign in to comment.