diff --git a/.gitignore b/.gitignore index 933a116..0c6bc44 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ wandb report.json benchmark_data debug_data +temp.md # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py index 0248aeb..680e418 100644 --- a/marker/v2/converters/pdf.py +++ b/marker/v2/converters/pdf.py @@ -37,13 +37,11 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): equation_processor = EquationProcessor(self.texify_model) equation_processor(document) - # TODO: re-enable once we add OCR method - # table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) - # table_processor(document) + table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model) + table_processor(document) renderer = MarkdownRenderer() - document_output = document.render() - return renderer(document_output) + return renderer(document) if __name__ == "__main__": @@ -57,4 +55,5 @@ def __call__(self, filepath: str, page_range: List[int] | None = None): converter = PdfConverter() rendered = converter(temp_pdf.name) - print(rendered) + with open("temp.md", "w+") as f: + f.write(rendered) diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py index 5dffc23..6c330aa 100644 --- a/marker/v2/processors/table.py +++ b/marker/v2/processors/table.py @@ -33,6 +33,7 @@ def __call__(self, document: Document): for block in page.children: if block.block_type != self.block_type: continue + image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size) image = page.highres_image.crop(image_poly.bbox).convert("RGB") @@ -42,9 +43,9 @@ def __call__(self, document: Document): text_lines = get_page_text_lines( filepath, [page.page_id], - page.highres_image.size, + [page.highres_image.size], flatten_pdf=True - ) + )[0] table_data.append({ "block_id": block.id, @@ -54,10 +55,7 @@ def __call__(self, document: Document): "img_size": page.highres_image.size }) - lst_format = zip(*( - [t[key] for t in table_data] - for key in ["table_image", "table_bbox", "img_size", "text_lines"] - )) + lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]] cells, needs_ocr = get_cells( *lst_format, diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py index 6628106..3c9de38 100644 --- a/marker/v2/providers/pdf.py +++ b/marker/v2/providers/pdf.py @@ -163,7 +163,10 @@ def merge_lines( return page_lines, page_spans - def font_flags_to_format(self, flags: int) -> Set[str]: + def font_flags_to_format(self, flags: int | None) -> Set[str]: + if flags is None: + return {"plain"} + flag_map = { 1: "FixedPitch", 2: "Serif", @@ -200,8 +203,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]: formats.add("plain") return formats - def font_names_to_format(self, font_name: str) -> Set[str]: + def font_names_to_format(self, font_name: str | None) -> Set[str]: formats = set() + if font_name is None: + return formats + if "bold" in font_name.lower(): formats.add("bold") if "ital" in font_name.lower(): @@ -226,16 +232,19 @@ def pdftext_extraction(self) -> Tuple[PageLines, PageSpans]: for line in block["lines"]: spans: List[Span] = [] for span in line["spans"]: - if not span["text"].strip(): + if not span["text"]: continue font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"])) + font_name = span["font"]["name"] or "Unknown" + font_weight = span["font"]["weight"] or 0 + font_size = span["font"]["size"] or 0 spans.append( Span( polygon=PolygonBox.from_bbox(span["bbox"]), text=span["text"], - font=span["font"]["name"], - font_weight=span["font"]["weight"], - font_size=span["font"]["size"], + font=font_name, + font_weight=font_weight, + font_size=font_size, minimum_position=span["char_start_idx"], maximum_position=span["char_end_idx"], formats=list(font_formats), diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py index 5dfb934..7f6297c 100644 --- a/marker/v2/renderers/__init__.py +++ b/marker/v2/renderers/__init__.py @@ -1,4 +1,3 @@ -from enum import Enum from typing import Optional from pydantic import BaseModel @@ -6,11 +5,6 @@ from marker.v2.schema import BlockTypes -class RenderFormat(str, Enum): - json = "json" - markdown = "markdown" - - class BaseRenderer: block_type: BlockTypes | None = None @@ -19,6 +13,6 @@ def __init__(self, config: Optional[BaseModel | dict] = None): for k in config.model_fields: setattr(self, k, config[k]) - def __call__(self, document_output): + def __call__(self, document): # Children are in reading order raise NotImplementedError diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py new file mode 100644 index 0000000..c2f3743 --- /dev/null +++ b/marker/v2/renderers/html.py @@ -0,0 +1,33 @@ +from bs4 import BeautifulSoup +from marker.v2.renderers import BaseRenderer +from marker.v2.schema import BlockTypes + + +class HTMLRenderer(BaseRenderer): + remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter] + image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure] + + def extract_html(self, document, document_output): + soup = BeautifulSoup(document_output.html, 'html.parser') + + content_refs = soup.find_all('content-ref') + ref_block_type = None + for ref in content_refs: + src = ref.get('src') + for item in document_output.children: + if item.id == src: + content = self.extract_html(document, item) + ref_block_type = item.id.block_type + break + + if ref_block_type in self.remove_blocks: + ref.replace_with('') + else: + ref.replace_with(BeautifulSoup(f"
{content}
", 'html.parser')) + + return str(soup) + + def __call__(self, document): + document_output = document.render() + full_html = self.extract_html(document, document_output) + return full_html diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py index 950e863..8610e56 100644 --- a/marker/v2/renderers/markdown.py +++ b/marker/v2/renderers/markdown.py @@ -1,26 +1,17 @@ -from bs4 import BeautifulSoup from markdownify import markdownify -from marker.v2.renderers import BaseRenderer - - -class MarkdownRenderer(BaseRenderer): - def extract_html(self, document_output): - soup = BeautifulSoup(document_output.html, 'html.parser') - - content_refs = soup.find_all('content-ref') - for ref in content_refs: - src = ref.get('src') - for item in document_output.children: - if item.id == src: - content = self.extract_html(item) - break - - ref.replace_with(BeautifulSoup(content, 'html.parser')) - - return str(soup) - - def __call__(self, document_output): - full_html = self.extract_html(document_output) - return markdownify(full_html) +from marker.v2.renderers.html import HTMLRenderer + + +class MarkdownRenderer(HTMLRenderer): + def __call__(self, document): + document_output = document.render() + full_html = self.extract_html(document, document_output) + return markdownify( + full_html, + heading_style="ATX", + bullets="-", + escape_misc=False, + escape_underscores=False + ) diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py index dbb8ee2..acbba98 100644 --- a/marker/v2/schema/blocks/base.py +++ b/marker/v2/schema/blocks/base.py @@ -103,21 +103,21 @@ def raw_text(self, document: Document) -> str: text += "\n" return text - def assemble_html(self, child_blocks): + def assemble_html(self, child_blocks, parent_structure=None): template = "" for c in child_blocks: template += f"" return template - def render(self, document): + def render(self, document, parent_structure): child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document)) + child_content.append(block.render(document, self.structure)) return BlockOutput( - html=self.assemble_html(child_content), + html=self.assemble_html(child_content, parent_structure), polygon=self.polygon, id=self.id, children=child_content diff --git a/marker/v2/schema/blocks/caption.py b/marker/v2/schema/blocks/caption.py index ab3fd9f..a6fb68c 100644 --- a/marker/v2/schema/blocks/caption.py +++ b/marker/v2/schema/blocks/caption.py @@ -4,3 +4,8 @@ class Caption(Block): block_type: BlockTypes = BlockTypes.Caption + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

{template}

" diff --git a/marker/v2/schema/blocks/code.py b/marker/v2/schema/blocks/code.py index 89100c9..ca8e6e1 100644 --- a/marker/v2/schema/blocks/code.py +++ b/marker/v2/schema/blocks/code.py @@ -4,3 +4,7 @@ class Code(Block): block_type: BlockTypes = BlockTypes.Code + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + return f"
{template}
" diff --git a/marker/v2/schema/blocks/equation.py b/marker/v2/schema/blocks/equation.py index 184013b..f3c577e 100644 --- a/marker/v2/schema/blocks/equation.py +++ b/marker/v2/schema/blocks/equation.py @@ -5,3 +5,6 @@ class Equation(Block): block_type: BlockTypes = BlockTypes.Equation latex: str | None = None + + def assemble_html(self, child_blocks, parent_structure=None): + return f"
{self.latex}
" diff --git a/marker/v2/schema/blocks/figure.py b/marker/v2/schema/blocks/figure.py index e90e15b..acd4f7b 100644 --- a/marker/v2/schema/blocks/figure.py +++ b/marker/v2/schema/blocks/figure.py @@ -4,3 +4,6 @@ class Figure(Block): block_type: BlockTypes = BlockTypes.Figure + + def assemble_html(self, child_blocks, parent_structure): + return f"Image {self.block_id}" diff --git a/marker/v2/schema/blocks/footnote.py b/marker/v2/schema/blocks/footnote.py index 1f99283..f775d54 100644 --- a/marker/v2/schema/blocks/footnote.py +++ b/marker/v2/schema/blocks/footnote.py @@ -4,3 +4,8 @@ class Footnote(Block): block_type: BlockTypes = BlockTypes.Footnote + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

{template}

" diff --git a/marker/v2/schema/blocks/inlinemath.py b/marker/v2/schema/blocks/inlinemath.py index f74fe74..c0d564e 100644 --- a/marker/v2/schema/blocks/inlinemath.py +++ b/marker/v2/schema/blocks/inlinemath.py @@ -4,3 +4,8 @@ class InlineMath(Block): block_type: BlockTypes = BlockTypes.TextInlineMath + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

{template}

" diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py index 9927e1d..932254b 100644 --- a/marker/v2/schema/blocks/listitem.py +++ b/marker/v2/schema/blocks/listitem.py @@ -1,10 +1,21 @@ +import re + from marker.v2.schema import BlockTypes from marker.v2.schema.blocks import Block +def replace_bullets(text): + # Replace bullet characters with a - + bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )" + replaced_string = re.sub(bullet_pattern, r"\1-\2", text) + return replaced_string + + class ListItem(Block): block_type: BlockTypes = BlockTypes.ListItem - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + template = replace_bullets(template) return f"
  • {template}
  • " diff --git a/marker/v2/schema/blocks/picture.py b/marker/v2/schema/blocks/picture.py index edb2328..b4e2e17 100644 --- a/marker/v2/schema/blocks/picture.py +++ b/marker/v2/schema/blocks/picture.py @@ -4,3 +4,6 @@ class Picture(Block): block_type: BlockTypes = BlockTypes.Picture + + def assemble_html(self, child_blocks, parent_structure): + return f"Image {self.block_id}" diff --git a/marker/v2/schema/blocks/sectionheader.py b/marker/v2/schema/blocks/sectionheader.py index 7a5c85c..a367fc7 100644 --- a/marker/v2/schema/blocks/sectionheader.py +++ b/marker/v2/schema/blocks/sectionheader.py @@ -4,3 +4,8 @@ class SectionHeader(Block): block_type: BlockTypes = BlockTypes.SectionHeader + + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") + return f"

    {template}

    " diff --git a/marker/v2/schema/blocks/table.py b/marker/v2/schema/blocks/table.py index ea7bab2..810ccd4 100644 --- a/marker/v2/schema/blocks/table.py +++ b/marker/v2/schema/blocks/table.py @@ -1,5 +1,6 @@ from typing import List +from tabled.formats import html_format from tabled.schema import SpanTableCell from marker.v2.schema import BlockTypes @@ -9,3 +10,6 @@ class Table(Block): block_type: BlockTypes = BlockTypes.Table cells: List[SpanTableCell] | None = None + + def assemble_html(self, child_blocks, parent_structure=None): + return html_format(self.cells) diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py index bcbe410..aaa9a3e 100644 --- a/marker/v2/schema/blocks/text.py +++ b/marker/v2/schema/blocks/text.py @@ -5,6 +5,7 @@ class Text(Block): block_type: BlockTypes = BlockTypes.Text - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) + template = template.replace("\n", " ") return f"

    {template}

    " diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index 8aed380..7e96313 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -21,10 +21,10 @@ class Document(BaseModel): block_type: BlockTypes = BlockTypes.Document def get_block(self, block_id: BlockId): - for page in self.pages: - block = page.get_block(block_id) - if block: - return block + page = [p for p in self.pages if p.page_id == block_id.page_id][0] + block = page.get_block(block_id) + if block: + return block return None def assemble_html(self, child_blocks): @@ -36,7 +36,7 @@ def assemble_html(self, child_blocks): def render(self): child_content = [] for page in self.pages: - child_content.append(page.render(self)) + child_content.append(page.render(self, None)) return DocumentOutput( children=child_content, diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index 3e45cab..0baa293 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -5,6 +5,6 @@ class ListGroup(Block): block_type: BlockTypes = BlockTypes.ListGroup - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) return f"" diff --git a/marker/v2/schema/groups/page.py b/marker/v2/schema/groups/page.py index ac23955..ddc1aeb 100644 --- a/marker/v2/schema/groups/page.py +++ b/marker/v2/schema/groups/page.py @@ -43,5 +43,5 @@ def add_full_block(self, block: Block) -> Block: def get_block(self, block_id: BlockId) -> Block | None: for block in self.children: - if block.id == block_id: + if block.block_id == block_id.block_id: return block diff --git a/marker/v2/schema/groups/table.py b/marker/v2/schema/groups/table.py index a5732e1..b1b1f2d 100644 --- a/marker/v2/schema/groups/table.py +++ b/marker/v2/schema/groups/table.py @@ -1,6 +1,5 @@ from marker.v2.schema import BlockTypes from marker.v2.schema.blocks import Block - class TableGroup(Block): block_type: BlockTypes = BlockTypes.TableGroup diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py index cca4ca1..2ffb12e 100644 --- a/marker/v2/schema/text/line.py +++ b/marker/v2/schema/text/line.py @@ -1,28 +1,64 @@ +import re from typing import Literal, Optional +import regex + from marker.v2.schema import BlockTypes from marker.v2.schema.blocks import Block, BlockOutput +HYPHENS = r'-—¬' + + +def remove_tags(text): + return re.sub(r'<[^>]+>', '', text) + + +def replace_last(string, old, new): + matches = list(re.finditer(old, string)) + if not matches: + return string + last_match = matches[-1] + return string[:last_match.start()] + new + string[last_match.end():] + + +def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str: + lowercase_letters = r'\p{Ll}|\d' + + hyphen_regex = regex.compile(rf'.*[{HYPHENS}]\s?$', regex.DOTALL) + next_line_starts_lowercase = regex.match(rf"^\s?[{lowercase_letters}]", next_line_text) + + if hyphen_regex.match(line_text) and next_line_starts_lowercase: + return replace_last(line_html, rf'[{HYPHENS}]', "") + return line_html + class Line(Block): block_type: BlockTypes = BlockTypes.Line origin: Optional[Literal["pdftext", "surya"]] = None - def assemble_html(self, child_blocks): + def assemble_html(self, document, child_blocks, parent_structure): template = "" for c in child_blocks: template += c.html + + raw_text = remove_tags(template).strip() + structure_idx = parent_structure.index(self.id) + if structure_idx < len(parent_structure) - 1: + next_block_id = parent_structure[structure_idx + 1] + next_line = document.get_block(next_block_id) + next_line_raw_text = next_line.raw_text(document) + template = strip_trailing_hyphens(raw_text, next_line_raw_text, template) return template - def render(self, document): + def render(self, document, parent_structure): child_content = [] if self.structure is not None and len(self.structure) > 0: for block_id in self.structure: block = document.get_block(block_id) - child_content.append(block.render(document)) + child_content.append(block.render(document, parent_structure)) return BlockOutput( - html=self.assemble_html(child_content), + html=self.assemble_html(document, child_content, parent_structure), polygon=self.polygon, id=self.id, children=[] diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py index e30cec4..e9d74e9 100644 --- a/marker/v2/schema/text/span.py +++ b/marker/v2/schema/text/span.py @@ -23,10 +23,26 @@ def bold(self): def italic(self): return 'italic' in self.formats - def assemble_html(self, child_blocks): - if len(self.text) > 3: + def assemble_html(self, child_blocks, parent_structure): + text = self.text + text = text.replace("-\n", "") # Remove hyphenated line breaks + + # Remove trailing newlines + replaced_newline = False + while len(text) > 0 and text[-1] in ["\n", "\r"]: + text = text[:-1] + replaced_newline = True + + # Remove leading newlines + while len(text) > 0 and text[0] in ["\n", "\r"]: + text = text[1:] + + if replaced_newline: + text += " " + + if len(text) > 3: if self.italic: - return f"{self.text}" + return f"{text}" elif self.bold: - return f"{self.text}" - return self.text + return f"{text}" + return text