Skip to content

Commit

Permalink
Fix markdown output
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 15, 2024
1 parent 2d6256c commit a748e23
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 27 deletions.
23 changes: 16 additions & 7 deletions marker/v2/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def __len__(self) -> int:
def __del__(self):
self.doc.close()

def font_flags_to_format(self, flags: int) -> Set[str]:
def font_flags_to_format(self, flags: int | None) -> Set[str]:
if flags is None:
return {"plain"}

flag_map = {
1: "FixedPitch",
2: "Serif",
Expand Down Expand Up @@ -72,8 +75,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]:
formats.add("plain")
return formats

def font_names_to_format(self, font_name: str) -> Set[str]:
def font_names_to_format(self, font_name: str | None) -> Set[str]:
formats = set()
if font_name is None:
return formats

if "bold" in font_name.lower():
formats.add("bold")
if "ital" in font_name.lower():
Expand All @@ -97,16 +103,19 @@ def setup(self):
for line in block["lines"]:
spans: List[Span] = []
for span in line["spans"]:
if not span["text"].strip():
if not span["text"]:
continue
font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"]))
font_name = span["font"]["name"] or "Unknown"
font_weight = span["font"]["weight"] or 0
font_size = span["font"]["size"] or 0
spans.append(
Span(
polygon=PolygonBox.from_bbox(span["bbox"]),
text=span["text"].strip(),
font=span["font"]["name"],
font_weight=span["font"]["weight"],
font_size=span["font"]["size"],
text=span["text"],
font=font_name,
font_weight=font_weight,
font_size=font_size,
minimum_position=span["char_start_idx"],
maximum_position=span["char_end_idx"],
formats=list(font_formats),
Expand Down
8 changes: 4 additions & 4 deletions marker/v2/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,21 @@ def raw_text(self, document) -> str:
text += "\n"
return text

def assemble_html(self, child_blocks):
def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
return template

def render(self, document):
def render(self, document, parent_structure):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
child_content.append(block.render(document))
child_content.append(block.render(document, self.structure))

return BlockOutput(
html=self.assemble_html(child_content),
html=self.assemble_html(child_content, parent_structure),
polygon=self.polygon,
id=self.id,
children=child_content
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/schema/blocks/listitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
class ListItem(Block):
block_type: str = "ListItem"

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<li>{template}</li>"
5 changes: 3 additions & 2 deletions marker/v2/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
class Text(Block):
block_type: str = "Text"

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = template.replace("\n", " ")
return f"<p>{template}</p>"
2 changes: 1 addition & 1 deletion marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def assemble_html(self, child_blocks):
def render(self):
child_content = []
for page in self.pages:
child_content.append(page.render(self))
child_content.append(page.render(self, None))

return DocumentOutput(
children=child_content,
Expand Down
4 changes: 2 additions & 2 deletions marker/v2/schema/groups/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
class ListGroup(Block):
block_type: str = "ListGroup"

def assemble_html(self, child_blocks):
template = super().assemble_html(child_blocks)
def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
return f"<ul>{template}</ul>"
44 changes: 40 additions & 4 deletions marker/v2/schema/text/line.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,60 @@
import re

import regex

from marker.v2.schema.blocks import Block, BlockOutput

HYPHENS = r'-—¬'


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag-like span deleted.

    A span counts as a tag when it is a ``<``, followed by one or more
    characters that are not ``>``, followed by ``>``. An empty ``<>`` is
    left in place because at least one inner character is required.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)


def replace_last(string, old, new):
    """Replace the final match of regex ``old`` in ``string`` with ``new``.

    ``old`` is interpreted as a regular expression pattern; ``new`` is
    spliced in verbatim (it is not treated as a replacement template).
    When ``old`` does not match anywhere, ``string`` is returned unchanged.
    """
    final = None
    # Walk every match, keeping only the last one seen.
    for final in re.finditer(old, string):
        pass

    if final is None:
        return string

    head = string[:final.start()]
    tail = string[final.end():]
    return head + new + tail

def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
    """Drop a line-ending hyphen when the next line continues the word.

    Args:
        line_text: Text of the current line; it is tested for ending in one
            of the ``HYPHENS`` characters, optionally followed by a single
            whitespace character.
        next_line_text: Text of the following line; it is tested for
            starting (after at most one whitespace character) with a
            lowercase letter or a digit.
        line_html: HTML for the current line. This is the value that is
            edited and returned.

    Returns:
        ``line_html`` with its last ``HYPHENS`` character removed when both
        tests pass, otherwise ``line_html`` unchanged.
    """
    # These are members of a character class, so they are listed without a
    # '|' separator: inside [...] a '|' is a literal character, and including
    # it would make a next line starting with '|' count as a continuation.
    lowercase_letters = r'\p{Ll}\d'

    hyphen_regex = regex.compile(rf'.*[{HYPHENS}]\s?$', regex.DOTALL)
    next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text)

    if hyphen_regex.match(line_text) and next_line_starts_lowercase:
        # Only the last hyphen is removed, and it is removed from the HTML
        # rather than from the plain text used for detection.
        return replace_last(line_html, rf'[{HYPHENS}]', "")
    return line_html


class Line(Block):
block_type: str = "Line"

def assemble_html(self, child_blocks):
def assemble_html(self, document, child_blocks, parent_structure):
template = ""
for c in child_blocks:
template += c.html

raw_text = remove_tags(template).strip()
structure_idx = parent_structure.index(self.id)
if structure_idx < len(parent_structure) - 1:
next_block_id = parent_structure[structure_idx + 1]
next_line = document.get_block(next_block_id)
next_line_raw_text = next_line.raw_text(document)
template = strip_trailing_hyphens(raw_text, next_line_raw_text, template)
return template

def render(self, document):
def render(self, document, parent_structure):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
child_content.append(block.render(document))
child_content.append(block.render(document, parent_structure))

return BlockOutput(
html=self.assemble_html(child_content),
html=self.assemble_html(document, child_content, parent_structure),
polygon=self.polygon,
id=self.id,
children=[]
Expand Down
21 changes: 16 additions & 5 deletions marker/v2/schema/text/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,21 @@ def bold(self):
def italic(self):
return 'italic' in self.formats

def assemble_html(self, child_blocks):
if len(self.text) > 3:
def assemble_html(self, child_blocks, parent_structure):
text = self.text
text = text.replace("-\n", "") # Remove hyphenated line breaks

replaced_newline = False
while len(text) > 0 and text[-1] in ["\n", "\r"]:
text = text[:-1]
replaced_newline = True

if replaced_newline:
text += " "

if len(text) > 3:
if self.italic:
return f"<i>{self.text}</i>"
return f"<i>{text}</i>"
elif self.bold:
return f"<b>{self.text}</b>"
return self.text
return f"<b>{text}</b>"
return text

0 comments on commit a748e23

Please sign in to comment.