diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index 47004cb..25db8b1 100644 --- a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -35,7 +35,10 @@ def __len__(self) -> int: def __del__(self): self.doc.close() - def font_flags_to_format(self, flags: int) -> Set[str]: + def font_flags_to_format(self, flags: int | None) -> Set[str]: + if flags is None: + return {"plain"} + flag_map = { 1: "FixedPitch", 2: "Serif", @@ -72,8 +75,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]: formats.add("plain") return formats - def font_names_to_format(self, font_name: str) -> Set[str]: + def font_names_to_format(self, font_name: str | None) -> Set[str]: formats = set() + if font_name is None: + return formats + if "bold" in font_name.lower(): formats.add("bold") if "ital" in font_name.lower(): @@ -97,16 +103,19 @@ def setup(self): for line in block["lines"]: spans: List[Span] = [] for span in line["spans"]: - if not span["text"].strip(): + if not span["text"]: continue font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"])) + font_name = span["font"]["name"] or "Unknown" + font_weight = span["font"]["weight"] or 0 + font_size = span["font"]["size"] or 0 spans.append( Span( polygon=PolygonBox.from_bbox(span["bbox"]), - text=span["text"].strip(), - font=span["font"]["name"], - font_weight=span["font"]["weight"], - font_size=span["font"]["size"], + text=span["text"], + font=font_name, + font_weight=font_weight, + font_size=font_size, minimum_position=span["char_start_idx"], maximum_position=span["char_end_idx"], formats=list(font_formats), diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index 4218300..4ba0231 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -100,21 +100,21 @@ def raw_text(self, document) -> str: text += "\n" return text - def assemble_html(self, child_blocks): + def assemble_html(self, child_blocks, parent_structure=None): template = "" for c in child_blocks: template += f"" return template - def render(self, document): + def render(self, document, parent_structure): child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document)) + child_content.append(block.render(document, self.structure)) return BlockOutput( - html=self.assemble_html(child_content), + html=self.assemble_html(child_content, parent_structure), polygon=self.polygon, id=self.id, children=child_content diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py index d4c511d..f3b9c36 100644 --- a/marker/v2/schema/blocks/listitem.py +++ b/marker/v2/schema/blocks/listitem.py @@ -4,6 +4,6 @@ class ListItem(Block): block_type: str = "ListItem" - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) return f"
  • {template}
  • " diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py index e95d729..5e0266b 100644 --- a/marker/v2/schema/blocks/text.py +++ b/marker/v2/schema/blocks/text.py @@ -4,6 +4,7 @@ class Text(Block): block_type: str = "Text" - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") return f"

    {template}

    " diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index c8c837c..9589051 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -35,7 +35,7 @@ def assemble_html(self, child_blocks): def render(self): child_content = [] for page in self.pages: - child_content.append(page.render(self)) + child_content.append(page.render(self, None)) return DocumentOutput( children=child_content, diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index 5220975..223bc70 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -4,6 +4,6 @@ class ListGroup(Block): block_type: str = "ListGroup" - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) return f"" \ No newline at end of file diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index c61756d..dc04987 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -1,24 +1,60 @@ +import re + +import regex + from marker.v2.schema.blocks import Block, BlockOutput +HYPHENS = r'-—¬' + + +def remove_tags(text): + return re.sub(r'<[^>]+>', '', text) + + +def replace_last(string, old, new): + matches = list(re.finditer(old, string)) + if not matches: + return string + last_match = matches[-1] + return string[:last_match.start()] + new + string[last_match.end():] + +def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str: + lowercase_letters = r'\p{Ll}|\d' + + hyphen_regex = regex.compile(rf'.*[{HYPHENS}]\s?$', regex.DOTALL) + next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text) + + if hyphen_regex.match(line_text) and next_line_starts_lowercase: + return replace_last(line_html, rf'[{HYPHENS}]', "") + return line_html + class Line(Block): block_type: str = "Line" - def assemble_html(self, child_blocks): + def assemble_html(self, document, child_blocks, parent_structure): template = "" for c in child_blocks: template += c.html + + raw_text = remove_tags(template).strip() + structure_idx = parent_structure.index(self.id) + if structure_idx < len(parent_structure) - 1: + next_block_id = parent_structure[structure_idx + 1] + next_line = document.get_block(next_block_id) + next_line_raw_text = next_line.raw_text(document) + template = strip_trailing_hyphens(raw_text, next_line_raw_text, template) return template - def render(self, document): + def render(self, document, parent_structure): child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document)) + child_content.append(block.render(document, parent_structure)) return BlockOutput( - html=self.assemble_html(child_content), + html=self.assemble_html(document, child_content, parent_structure), polygon=self.polygon, id=self.id, children=[] diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py index 4fbb40d..a70ba93 100644 --- a/marker/v2/schema/text/span.py +++ b/marker/v2/schema/text/span.py @@ -22,10 +22,21 @@ def bold(self): def italic(self): return 'italic' in self.formats - def assemble_html(self, child_blocks): - if len(self.text) > 3: + def assemble_html(self, child_blocks, parent_structure): + text = self.text + text = text.replace("-\n", "") # Remove hyphenated line breaks + + replaced_newline = False + while len(text) > 0 and text[-1] in ["\n", "\r"]: + text = text[:-1] + replaced_newline = True + + if replaced_newline: + text += " " + + if len(text) > 3: if self.italic: - return f"{self.text}" + return f"{text}" elif self.bold: - return f"{self.text}" - return self.text + return f"{text}" + return text