diff --git a/marker/v2/builders/structure.py b/marker/v2/builders/structure.py index 9ff6c25b..f73d7bec 100644 --- a/marker/v2/builders/structure.py +++ b/marker/v2/builders/structure.py @@ -1,8 +1,9 @@ -from typing import List, Optional +from typing import Optional from pydantic import BaseModel from marker.v2.builders import BaseBuilder +from marker.v2.schema import BlockTypes from marker.v2.schema.document import Document from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY, ListGroup from marker.v2.schema.groups.page import PageGroup @@ -22,7 +23,7 @@ def __call__(self, document: Document): def group_caption_blocks(self, page: PageGroup): for i, block_id in enumerate(page.structure): block = page.get_block(block_id) - if block.block_type not in ["Table", "Figure", "Picture"]: + if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]: continue block_structure = [block_id] @@ -30,7 +31,7 @@ def group_caption_blocks(self, page: PageGroup): for j, prev_block_id in enumerate(page.structure[:i][::-1]): prev_block = page.get_block(prev_block_id) if all([ - prev_block.block_type in ["Caption", "Footnote"], + prev_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote], prev_block.polygon.minimum_gap(block.polygon) < self.gap_threshold ]): block_structure.insert(0, prev_block_id) @@ -41,7 +42,7 @@ def group_caption_blocks(self, page: PageGroup): for j, next_block_id in enumerate(page.structure[i + 1:]): next_block = page.get_block(next_block_id) if all([ - next_block.block_type in ["Caption", "Footnote"], + next_block.block_type in [BlockTypes.Caption, BlockTypes.Footnote], next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold ]): block_structure.append(next_block_id) @@ -63,7 +64,7 @@ def group_caption_blocks(self, page: PageGroup): def group_lists(self, page: PageGroup): for i, block_id in enumerate(page.structure): block = page.get_block(block_id) - if block.block_type not in ["ListItem"]: + if block.block_type not 
in [BlockTypes.ListItem]: continue block_structure = [block_id] selected_polygons = [block.polygon] @@ -71,7 +72,7 @@ def group_lists(self, page: PageGroup): for j, next_block_id in enumerate(page.structure[i + 1:]): next_block = page.get_block(next_block_id) if all([ - next_block.block_type == "ListItem", + next_block.block_type == BlockTypes.ListItem, next_block.polygon.minimum_gap(block.polygon) < self.gap_threshold ]): block_structure.append(next_block_id) diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index e4774c07..27ea2a4f 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -13,6 +13,8 @@ from marker.v2.providers.pdf import PdfProvider from marker.v2.models import setup_layout_model, setup_texify_model, setup_recognition_model, setup_table_rec_model, \ setup_detection_model +from marker.v2.renderers.line import LineRenderer +from marker.v2.renderers.span import SpanRenderer class PdfConverter(BaseConverter): @@ -39,7 +41,8 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): #table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) #table_processor(document) - rendered = document.render() + renderer_lst = [SpanRenderer(), LineRenderer()] + rendered = document.render(renderer_lst) return rendered diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py index e2b1b08b..fa62df00 100644 --- a/marker/v2/processors/table.py +++ b/marker/v2/processors/table.py @@ -7,11 +7,12 @@ from marker.settings import settings from marker.v2.processors import BaseProcessor +from marker.v2.schema import BlockTypes from marker.v2.schema.document import Document class TableProcessor(BaseProcessor): - block_type = "Table" + block_type = BlockTypes.Table detect_boxes = False detector_batch_size = None table_rec_batch_size = None diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index 4c9b391a..782d6ea4 100644 --- 
a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -1,5 +1,5 @@ import functools -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Set from typing import Dict, List, Optional import pypdfium2 as pdfium @@ -34,7 +34,7 @@ def __len__(self) -> int: def __del__(self): self.doc.close() - def font_flags_to_format(self, flags: int) -> List[str]: + def font_flags_to_format(self, flags: int) -> Set[str]: flag_map = { 1: "FixedPitch", 2: "Serif", @@ -69,7 +69,15 @@ def font_flags_to_format(self, flags: int) -> List[str]: formats.add("bold") if set_flags & {"FixedPitch", "Serif", "Script", "Nonsymbolic", "AllCap", "SmallCap", "UseExternAttr"}: formats.add("plain") - return list(formats) + return formats + + def font_names_to_format(self, font_name: str) -> Set[str]: + formats = set() + if "bold" in font_name.lower(): + formats.add("bold") + if "ital" in font_name.lower(): + formats.add("italic") + return formats def setup(self): self.doc = pdfium.PdfDocument(self.filepath) @@ -90,6 +98,7 @@ def setup(self): for span in line["spans"]: if not span["text"].strip(): continue + font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"])) spans.append( Span( polygon=PolygonBox.from_bbox(span["bbox"]), @@ -99,7 +108,7 @@ def setup(self): font_size=span["font"]["size"], minimum_position=span["char_start_idx"], maximum_position=span["char_end_idx"], - formats=self.font_flags_to_format(span["font"]["flags"]), + formats=list(font_formats), page_id=page_id, ) ) diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 8806b78e..00616c9f 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -1,8 +1,14 @@ +from enum import Enum from typing import Optional from pydantic import BaseModel +class RenderFormat(str, Enum): + json = "json" + markdown = "markdown" + + class BaseRenderer: block_type: str | None = None diff --git 
# ---------------------------------------------------------------------------
# marker/v2/renderers/line.py (new file)
# ---------------------------------------------------------------------------
import re
from typing import List, Optional

from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes
from marker.v2.schema.text import Span

# Compiled once at import time; surround_text is called per span.
_LEADING_WS = re.compile(r"^(\s*)")
_TRAILING_WS = re.compile(r"(\s*)$")


def surround_text(s, char_to_insert):
    """Wrap the non-whitespace core of *s* in *char_to_insert*.

    Leading and trailing whitespace is kept outside the inserted markers, so
    ``surround_text(" word ", "**")`` returns ``" **word** "``.

    Args:
        s: Text to wrap.
        char_to_insert: Marker placed immediately before and after the
            stripped text (e.g. ``"*"`` or ``"**"``).

    Returns:
        The wrapped string. A string that is empty or whitespace-only is
        returned unchanged, because there is nothing to wrap and emitting
        back-to-back markers would only add noise to the output.
    """
    core = s.strip()
    if not core:
        return s
    leading = _LEADING_WS.match(s).group(1)
    trailing = _TRAILING_WS.search(s).group(1)
    return f"{leading}{char_to_insert}{core}{char_to_insert}{trailing}"


class LineRenderer(BaseRenderer):
    """Render a Line block by joining the rendered text of its child spans.

    Interior spans flagged italic or bold are wrapped in ``*`` / ``**`` when
    the next non-blank span does not carry the same flag.
    """

    block_type = BlockTypes.Line

    def __call__(self, document, block, children: Optional[List[Span]] = None):
        """Return the concatenated text for one line.

        Args:
            document: Document being rendered (not used here).
            block: The line block itself (not used here).
            children: Child spans of the line, already rendered so that each
                one exposes ``rendered``. ``None`` is treated as no children.

        Returns:
            The line text, with emphasis markers applied to eligible spans.
        """
        spans = children or []
        last_idx = len(spans) - 1
        parts = []

        for i, child in enumerate(spans):
            span_text = child.rendered

            # Don't bold or italicize very short sequences.
            # Skip the first and last span so lines can be joined properly.
            if len(span_text) > 3 and 0 < i < last_idx:
                # Next span with visible text; None when only blank spans
                # (or nothing) follow, which counts as "format ends here".
                next_span = next(
                    (later for later in spans[i + 1:] if later.text.strip()),
                    None,
                )
                if child.italic and (next_span is None or not next_span.italic):
                    span_text = surround_text(span_text, "*")
                elif child.bold and (next_span is None or not next_span.bold):
                    span_text = surround_text(span_text, "**")

            parts.append(span_text)

        return "".join(parts)


# ---------------------------------------------------------------------------
# marker/v2/renderers/span.py (new file)
# ---------------------------------------------------------------------------
from marker.v2.renderers import BaseRenderer
from marker.v2.schema import BlockTypes


class SpanRenderer(BaseRenderer):
    """Render a Span block as its raw text."""

    block_type = BlockTypes.Span

    def __call__(self, document, block, children=None):
        """Return ``block.text`` unchanged; *document* and *children* are unused."""
        return block.text
--git a/marker/v2/renderers/util.py b/marker/v2/renderers/util.py index 335cf452..e9a301a7 100644 --- a/marker/v2/renderers/util.py +++ b/marker/v2/renderers/util.py @@ -5,4 +5,4 @@ def renderer_for_block(block, renderer_list: list): if renderer.block_type == block.block_type: return renderer - return DefaultRenderer + return DefaultRenderer() diff --git a/marker/v2/schema/__init__.py b/marker/v2/schema/__init__.py index 8b137891..ffb8cb0b 100644 --- a/marker/v2/schema/__init__.py +++ b/marker/v2/schema/__init__.py @@ -1 +1,18 @@ +from marker.v2.schema.blocks import Block, LAYOUT_BLOCK_REGISTRY +from marker.v2.schema.groups import GROUP_BLOCK_REGISTRY +from marker.v2.schema.text import TEXT_BLOCK_REGISTRY + +class _BlockTypes: + def __init__(self): + pass + + def add(self, registry: dict[str, Block]): + for k, v in registry.items(): + setattr(self, k, k) + + +BlockTypes = _BlockTypes() +BlockTypes.add(GROUP_BLOCK_REGISTRY) +BlockTypes.add(TEXT_BLOCK_REGISTRY) +BlockTypes.add(LAYOUT_BLOCK_REGISTRY) diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index d7bed645..5331a7a0 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -91,6 +91,5 @@ def render(self, document, renderer_list: list): block.render(document, renderer_list) child_blocks.append(block) - renderer_cls = renderer_for_block(self, renderer_list) - renderer = renderer_cls() + renderer = renderer_for_block(self, renderer_list) self.rendered = renderer(document, self, child_blocks) diff --git a/marker/v2/schema/blocks/inlinemath.py b/marker/v2/schema/blocks/inlinemath.py index 4cc76380..2108046c 100644 --- a/marker/v2/schema/blocks/inlinemath.py +++ b/marker/v2/schema/blocks/inlinemath.py @@ -2,4 +2,4 @@ class InlineMath(Block): - block_type: str = "Text-inline-math" + block_type: str = "TextInlineMath" diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py index 0e3f67ec..3f13849f 100644 --- 
a/marker/v2/schema/blocks/listitem.py +++ b/marker/v2/schema/blocks/listitem.py @@ -2,4 +2,4 @@ class ListItem(Block): - block_type: str = "List-item" + block_type: str = "ListItem" diff --git a/marker/v2/schema/blocks/pagefooter.py b/marker/v2/schema/blocks/pagefooter.py index a676a987..329885c2 100644 --- a/marker/v2/schema/blocks/pagefooter.py +++ b/marker/v2/schema/blocks/pagefooter.py @@ -2,4 +2,4 @@ class PageFooter(Block): - block_type: str = "Page-footer" + block_type: str = "PageFooter" diff --git a/marker/v2/schema/blocks/pageheader.py b/marker/v2/schema/blocks/pageheader.py index e48dc217..32150733 100644 --- a/marker/v2/schema/blocks/pageheader.py +++ b/marker/v2/schema/blocks/pageheader.py @@ -2,4 +2,4 @@ class PageHeader(Block): - block_type: str = "Page-header" + block_type: str = "PageHeader" diff --git a/marker/v2/schema/blocks/sectionheader.py b/marker/v2/schema/blocks/sectionheader.py index 01f0b741..046ddbce 100644 --- a/marker/v2/schema/blocks/sectionheader.py +++ b/marker/v2/schema/blocks/sectionheader.py @@ -2,4 +2,4 @@ class SectionHeader(Block): - block_type: str = "Section-header" + block_type: str = "SectionHeader" diff --git a/marker/v2/schema/blocks/toc.py b/marker/v2/schema/blocks/toc.py index 11796336..8bfeee1d 100644 --- a/marker/v2/schema/blocks/toc.py +++ b/marker/v2/schema/blocks/toc.py @@ -2,4 +2,4 @@ class TableOfContents(Block): - block_type: str = "Table-of-contents" + block_type: str = "TableOfContents" diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index 0f3dbc99..ef03764f 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -12,6 +12,7 @@ class Document(BaseModel): filepath: str pages: List[PageGroup] + block_type: str = "Document" def get_block(self, block_id: BlockId): for page in self.pages: @@ -27,6 +28,5 @@ def render(self, renderer_lst: list | None = None): for page in self.pages: page.render(self, renderer_lst) - doc_renderer_cls = renderer_for_block(self, 
renderer_lst) - doc_renderer = doc_renderer_cls() + doc_renderer = renderer_for_block(self, renderer_lst) return doc_renderer(self, self, self.pages) diff --git a/marker/v2/schema/text/__init__.py b/marker/v2/schema/text/__init__.py new file mode 100644 index 00000000..1a01f22f --- /dev/null +++ b/marker/v2/schema/text/__init__.py @@ -0,0 +1,9 @@ +from marker.v2.schema.text.line import Line +from marker.v2.schema.text.span import Span + + +TEXT_BLOCK_REGISTRY = { + "Line": Line, + "Span": Span, +} + diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index a9234d7d..545930c0 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -6,3 +6,6 @@ class Line(Block): block_type: str = "Line" + + def is_continuation(self, other): + pass diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py index a0af73b2..cce09b4f 100644 --- a/marker/v2/schema/text/span.py +++ b/marker/v2/schema/text/span.py @@ -13,3 +13,11 @@ class Span(Block): minimum_position: int maximum_position: int formats: List[Literal['plain', 'math', 'chemical', 'bold', 'italic']] + + @property + def bold(self): + return 'bold' in self.formats + + @property + def italic(self): + return 'italic' in self.formats