diff --git a/.gitignore b/.gitignore
index 933a116..0c6bc44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ wandb
report.json
benchmark_data
debug_data
+temp.md
# Byte-compiled / optimized / DLL files
__pycache__/
diff --git a/marker/v2/converters/pdf.py b/marker/v2/converters/pdf.py
index 0248aeb..680e418 100644
--- a/marker/v2/converters/pdf.py
+++ b/marker/v2/converters/pdf.py
@@ -37,13 +37,11 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
equation_processor = EquationProcessor(self.texify_model)
equation_processor(document)
- # TODO: re-enable once we add OCR method
- # table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
- # table_processor(document)
+ table_processor = TableProcessor(self.detection_model, self.recognition_model, self.table_rec_model)
+ table_processor(document)
renderer = MarkdownRenderer()
- document_output = document.render()
- return renderer(document_output)
+ return renderer(document)
if __name__ == "__main__":
@@ -57,4 +55,5 @@ def __call__(self, filepath: str, page_range: List[int] | None = None):
converter = PdfConverter()
rendered = converter(temp_pdf.name)
- print(rendered)
+ with open("temp.md", "w+") as f:
+ f.write(rendered)
diff --git a/marker/v2/processors/table.py b/marker/v2/processors/table.py
index 5dffc23..6c330aa 100644
--- a/marker/v2/processors/table.py
+++ b/marker/v2/processors/table.py
@@ -33,6 +33,7 @@ def __call__(self, document: Document):
for block in page.children:
if block.block_type != self.block_type:
continue
+
image_poly = block.polygon.rescale((page.polygon.width, page.polygon.height), page.highres_image.size)
image = page.highres_image.crop(image_poly.bbox).convert("RGB")
@@ -42,9 +43,9 @@ def __call__(self, document: Document):
text_lines = get_page_text_lines(
filepath,
[page.page_id],
- page.highres_image.size,
+ [page.highres_image.size],
flatten_pdf=True
- )
+ )[0]
table_data.append({
"block_id": block.id,
@@ -54,10 +55,7 @@ def __call__(self, document: Document):
"img_size": page.highres_image.size
})
- lst_format = zip(*(
- [t[key] for t in table_data]
- for key in ["table_image", "table_bbox", "img_size", "text_lines"]
- ))
+ lst_format = [[t[key] for t in table_data] for key in ["table_image", "table_bbox", "img_size", "text_lines"]]
cells, needs_ocr = get_cells(
*lst_format,
diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py
index 6628106..3c9de38 100644
--- a/marker/v2/providers/pdf.py
+++ b/marker/v2/providers/pdf.py
@@ -163,7 +163,10 @@ def merge_lines(
return page_lines, page_spans
- def font_flags_to_format(self, flags: int) -> Set[str]:
+ def font_flags_to_format(self, flags: int | None) -> Set[str]:
+ if flags is None:
+ return {"plain"}
+
flag_map = {
1: "FixedPitch",
2: "Serif",
@@ -200,8 +203,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]:
formats.add("plain")
return formats
- def font_names_to_format(self, font_name: str) -> Set[str]:
+ def font_names_to_format(self, font_name: str | None) -> Set[str]:
formats = set()
+ if font_name is None:
+ return formats
+
if "bold" in font_name.lower():
formats.add("bold")
if "ital" in font_name.lower():
@@ -226,16 +232,19 @@ def pdftext_extraction(self) -> Tuple[PageLines, PageSpans]:
for line in block["lines"]:
spans: List[Span] = []
for span in line["spans"]:
- if not span["text"].strip():
+ if not span["text"]:
continue
font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"]))
+ font_name = span["font"]["name"] or "Unknown"
+ font_weight = span["font"]["weight"] or 0
+ font_size = span["font"]["size"] or 0
spans.append(
Span(
polygon=PolygonBox.from_bbox(span["bbox"]),
text=span["text"],
- font=span["font"]["name"],
- font_weight=span["font"]["weight"],
- font_size=span["font"]["size"],
+ font=font_name,
+ font_weight=font_weight,
+ font_size=font_size,
minimum_position=span["char_start_idx"],
maximum_position=span["char_end_idx"],
formats=list(font_formats),
diff --git a/marker/v2/renderers/__init__.py b/marker/v2/renderers/__init__.py
index 5dfb934..7f6297c 100644
--- a/marker/v2/renderers/__init__.py
+++ b/marker/v2/renderers/__init__.py
@@ -1,4 +1,3 @@
-from enum import Enum
from typing import Optional
from pydantic import BaseModel
@@ -6,11 +5,6 @@
from marker.v2.schema import BlockTypes
-class RenderFormat(str, Enum):
- json = "json"
- markdown = "markdown"
-
-
class BaseRenderer:
block_type: BlockTypes | None = None
@@ -19,6 +13,6 @@ def __init__(self, config: Optional[BaseModel | dict] = None):
for k in config.model_fields:
setattr(self, k, config[k])
- def __call__(self, document_output):
+ def __call__(self, document):
# Children are in reading order
raise NotImplementedError
diff --git a/marker/v2/renderers/html.py b/marker/v2/renderers/html.py
new file mode 100644
index 0000000..c2f3743
--- /dev/null
+++ b/marker/v2/renderers/html.py
@@ -0,0 +1,33 @@
+from bs4 import BeautifulSoup
+from marker.v2.renderers import BaseRenderer
+from marker.v2.schema import BlockTypes
+
+
+class HTMLRenderer(BaseRenderer):
+    """Renderer that expands ``content-ref`` placeholders into nested HTML."""
+
+    # References to blocks of these types are replaced with an empty string.
+    remove_blocks: list = [BlockTypes.PageHeader, BlockTypes.PageFooter]
+    # Not read anywhere inside this class.
+    image_blocks: list = [BlockTypes.Picture, BlockTypes.Figure]
+
+    def extract_html(self, document, document_output):
+        """Recursively inline every ``content-ref`` tag of ``document_output.html``.
+
+        Each ``content-ref`` is matched to the child of ``document_output``
+        whose ``id`` equals the tag's ``src`` attribute. A matched child whose
+        block type is listed in ``remove_blocks`` is dropped; any other matched
+        child is rendered by a recursive call and substituted for the tag.
+        A tag with no matching child is left in place.
+
+        Args:
+            document: Passed through unchanged to the recursive calls.
+            document_output: Object exposing ``html`` and ``children``.
+
+        Returns:
+            The resulting markup as a string.
+        """
+        soup = BeautifulSoup(document_output.html, 'html.parser')
+
+        for ref in soup.find_all('content-ref'):
+            src = ref.get('src')
+            # Resolve each reference on its own: state carried over from a
+            # previous iteration would make an unmatched reference reuse the
+            # previous block's content (or fail on the very first reference).
+            target = next((item for item in document_output.children if item.id == src), None)
+            if target is None:
+                continue
+
+            if target.id.block_type in self.remove_blocks:
+                ref.replace_with('')
+                continue
+
+            content = self.extract_html(document, target)
+            # NOTE(review): the literal below is split across three lines in
+            # this patch text; confirm the intended wrapper around {content}
+            # against the repository before applying.
+            ref.replace_with(BeautifulSoup(f"
{content}
", 'html.parser'))
+
+        return str(soup)
+
+    def __call__(self, document):
+        """Render ``document`` and return its HTML with all references inlined."""
+        document_output = document.render()
+        full_html = self.extract_html(document, document_output)
+        return full_html
diff --git a/marker/v2/renderers/markdown.py b/marker/v2/renderers/markdown.py
index 950e863..8610e56 100644
--- a/marker/v2/renderers/markdown.py
+++ b/marker/v2/renderers/markdown.py
@@ -1,26 +1,17 @@
-from bs4 import BeautifulSoup
from markdownify import markdownify
-from marker.v2.renderers import BaseRenderer
-
-
-class MarkdownRenderer(BaseRenderer):
- def extract_html(self, document_output):
- soup = BeautifulSoup(document_output.html, 'html.parser')
-
- content_refs = soup.find_all('content-ref')
- for ref in content_refs:
- src = ref.get('src')
- for item in document_output.children:
- if item.id == src:
- content = self.extract_html(item)
- break
-
- ref.replace_with(BeautifulSoup(content, 'html.parser'))
-
- return str(soup)
-
- def __call__(self, document_output):
- full_html = self.extract_html(document_output)
- return markdownify(full_html)
+from marker.v2.renderers.html import HTMLRenderer
+
+
+class MarkdownRenderer(HTMLRenderer):
+    """HTML renderer whose output is passed through ``markdownify``."""
+
+    def __call__(self, document):
+        """Render ``document``, inline its references and convert the HTML."""
+        markdownify_options = {
+            "heading_style": "ATX",
+            "bullets": "-",
+            "escape_misc": False,
+            "escape_underscores": False,
+        }
+        html = self.extract_html(document, document.render())
+        return markdownify(html, **markdownify_options)
diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
index dbb8ee2..acbba98 100644
--- a/marker/v2/schema/blocks/base.py
+++ b/marker/v2/schema/blocks/base.py
@@ -103,21 +103,21 @@ def raw_text(self, document: Document) -> str:
text += "\n"
return text
- def assemble_html(self, child_blocks):
+ def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f""
return template
- def render(self, document):
+ def render(self, document, parent_structure):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
- child_content.append(block.render(document))
+ child_content.append(block.render(document, self.structure))
return BlockOutput(
- html=self.assemble_html(child_content),
+ html=self.assemble_html(child_content, parent_structure),
polygon=self.polygon,
id=self.id,
children=child_content
diff --git a/marker/v2/schema/blocks/caption.py b/marker/v2/schema/blocks/caption.py
index ab3fd9f..a6fb68c 100644
--- a/marker/v2/schema/blocks/caption.py
+++ b/marker/v2/schema/blocks/caption.py
@@ -4,3 +4,8 @@
class Caption(Block):
block_type: BlockTypes = BlockTypes.Caption
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/code.py b/marker/v2/schema/blocks/code.py
index 89100c9..ca8e6e1 100644
--- a/marker/v2/schema/blocks/code.py
+++ b/marker/v2/schema/blocks/code.py
@@ -4,3 +4,7 @@
class Code(Block):
block_type: BlockTypes = BlockTypes.Code
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/equation.py b/marker/v2/schema/blocks/equation.py
index 184013b..f3c577e 100644
--- a/marker/v2/schema/blocks/equation.py
+++ b/marker/v2/schema/blocks/equation.py
@@ -5,3 +5,6 @@
class Equation(Block):
block_type: BlockTypes = BlockTypes.Equation
latex: str | None = None
+
+ def assemble_html(self, child_blocks, parent_structure=None):
+ return f"{self.latex}
"
diff --git a/marker/v2/schema/blocks/figure.py b/marker/v2/schema/blocks/figure.py
index e90e15b..acd4f7b 100644
--- a/marker/v2/schema/blocks/figure.py
+++ b/marker/v2/schema/blocks/figure.py
@@ -4,3 +4,6 @@
class Figure(Block):
block_type: BlockTypes = BlockTypes.Figure
+
+ def assemble_html(self, child_blocks, parent_structure):
+ return f"Image {self.block_id}"
diff --git a/marker/v2/schema/blocks/footnote.py b/marker/v2/schema/blocks/footnote.py
index 1f99283..f775d54 100644
--- a/marker/v2/schema/blocks/footnote.py
+++ b/marker/v2/schema/blocks/footnote.py
@@ -4,3 +4,8 @@
class Footnote(Block):
block_type: BlockTypes = BlockTypes.Footnote
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/inlinemath.py b/marker/v2/schema/blocks/inlinemath.py
index f74fe74..c0d564e 100644
--- a/marker/v2/schema/blocks/inlinemath.py
+++ b/marker/v2/schema/blocks/inlinemath.py
@@ -4,3 +4,8 @@
class InlineMath(Block):
block_type: BlockTypes = BlockTypes.TextInlineMath
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/listitem.py b/marker/v2/schema/blocks/listitem.py
index 9927e1d..932254b 100644
--- a/marker/v2/schema/blocks/listitem.py
+++ b/marker/v2/schema/blocks/listitem.py
@@ -1,10 +1,21 @@
+import re
+
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
+def replace_bullets(text):
+    """Return ``text`` with bullet glyphs swapped for a plain ``-``.
+
+    A glyph is replaced only when it sits at the start of the string or
+    directly after a newline or space, and is itself followed by a space.
+    """
+    return re.sub(r"(^|[\n ])[•●○■▪▫–—]( )", r"\1-\2", text)
+
+
class ListItem(Block):
block_type: BlockTypes = BlockTypes.ListItem
- def assemble_html(self, child_blocks):
- template = super().assemble_html(child_blocks)
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ template = replace_bullets(template)
return f"{template}"
diff --git a/marker/v2/schema/blocks/picture.py b/marker/v2/schema/blocks/picture.py
index edb2328..b4e2e17 100644
--- a/marker/v2/schema/blocks/picture.py
+++ b/marker/v2/schema/blocks/picture.py
@@ -4,3 +4,6 @@
class Picture(Block):
block_type: BlockTypes = BlockTypes.Picture
+
+ def assemble_html(self, child_blocks, parent_structure):
+ return f"Image {self.block_id}"
diff --git a/marker/v2/schema/blocks/sectionheader.py b/marker/v2/schema/blocks/sectionheader.py
index 7a5c85c..a367fc7 100644
--- a/marker/v2/schema/blocks/sectionheader.py
+++ b/marker/v2/schema/blocks/sectionheader.py
@@ -4,3 +4,8 @@
class SectionHeader(Block):
block_type: BlockTypes = BlockTypes.SectionHeader
+
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
+ return f"{template}
"
diff --git a/marker/v2/schema/blocks/table.py b/marker/v2/schema/blocks/table.py
index ea7bab2..810ccd4 100644
--- a/marker/v2/schema/blocks/table.py
+++ b/marker/v2/schema/blocks/table.py
@@ -1,5 +1,6 @@
from typing import List
+from tabled.formats import html_format
from tabled.schema import SpanTableCell
from marker.v2.schema import BlockTypes
@@ -9,3 +10,6 @@
class Table(Block):
block_type: BlockTypes = BlockTypes.Table
cells: List[SpanTableCell] | None = None
+
+ def assemble_html(self, child_blocks, parent_structure=None):
+ return html_format(self.cells)
diff --git a/marker/v2/schema/blocks/text.py b/marker/v2/schema/blocks/text.py
index bcbe410..aaa9a3e 100644
--- a/marker/v2/schema/blocks/text.py
+++ b/marker/v2/schema/blocks/text.py
@@ -5,6 +5,7 @@
class Text(Block):
block_type: BlockTypes = BlockTypes.Text
- def assemble_html(self, child_blocks):
- template = super().assemble_html(child_blocks)
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
+ template = template.replace("\n", " ")
return f"{template}
"
diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py
index 8aed380..7e96313 100644
--- a/marker/v2/schema/document.py
+++ b/marker/v2/schema/document.py
@@ -21,10 +21,10 @@ class Document(BaseModel):
block_type: BlockTypes = BlockTypes.Document
def get_block(self, block_id: BlockId):
-for page in self.pages:
-block = page.get_block(block_id)
-if block:
-return block
+        """Return the block identified by ``block_id``, or ``None`` when absent.
+
+        The page is selected by ``block_id.page_id`` and the lookup is then
+        delegated to that page's ``get_block``.
+        """
+        # next() with a default instead of [...][0]: an unknown page_id must
+        # end in None like any other miss rather than raise IndexError.
+        page = next((p for p in self.pages if p.page_id == block_id.page_id), None)
+        if page is None:
+            return None
+        block = page.get_block(block_id)
+        if block:
+            return block
return None
def assemble_html(self, child_blocks):
@@ -36,7 +36,7 @@ def assemble_html(self, child_blocks):
def render(self):
child_content = []
for page in self.pages:
- child_content.append(page.render(self))
+ child_content.append(page.render(self, None))
return DocumentOutput(
children=child_content,
diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py
index 3e45cab..0baa293 100644
--- a/marker/v2/schema/groups/list.py
+++ b/marker/v2/schema/groups/list.py
@@ -5,6 +5,6 @@
class ListGroup(Block):
block_type: BlockTypes = BlockTypes.ListGroup
- def assemble_html(self, child_blocks):
- template = super().assemble_html(child_blocks)
+ def assemble_html(self, child_blocks, parent_structure):
+ template = super().assemble_html(child_blocks, parent_structure)
return f""
diff --git a/marker/v2/schema/groups/page.py b/marker/v2/schema/groups/page.py
index ac23955..ddc1aeb 100644
--- a/marker/v2/schema/groups/page.py
+++ b/marker/v2/schema/groups/page.py
@@ -43,5 +43,5 @@ def add_full_block(self, block: Block) -> Block:
def get_block(self, block_id: BlockId) -> Block | None:
for block in self.children:
- if block.id == block_id:
+ if block.block_id == block_id.block_id:
return block
diff --git a/marker/v2/schema/groups/table.py b/marker/v2/schema/groups/table.py
index a5732e1..b1b1f2d 100644
--- a/marker/v2/schema/groups/table.py
+++ b/marker/v2/schema/groups/table.py
@@ -1,6 +1,5 @@
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block
-
class TableGroup(Block):
block_type: BlockTypes = BlockTypes.TableGroup
diff --git a/marker/v2/schema/text/line.py b/marker/v2/schema/text/line.py
index cca4ca1..2ffb12e 100644
--- a/marker/v2/schema/text/line.py
+++ b/marker/v2/schema/text/line.py
@@ -1,28 +1,64 @@
+import re
from typing import Literal, Optional
+import regex
+
from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import Block, BlockOutput
+HYPHENS = r'-—¬'
+
+
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` run removed."""
+    tag_pattern = r'<[^>]+>'
+    return re.sub(tag_pattern, '', text)
+
+
+def replace_last(string, old, new):
+    """Replace the final match of the regex ``old`` in ``string`` with ``new``.
+
+    ``new`` is inserted as-is, without backreference expansion. When ``old``
+    does not match at all, ``string`` is returned unchanged.
+    """
+    final_match = None
+    # Walk the matches and keep only the last one.
+    for final_match in re.finditer(old, string):
+        pass
+    if final_match is None:
+        return string
+    head = string[:final_match.start()]
+    tail = string[final_match.end():]
+    return head + new + tail
+
+
+def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
+    """Drop a line-ending hyphen when the word continues on the next line.
+
+    Args:
+        line_text: Plain text of the current line; decides whether it ends in
+            a hyphen (optionally followed by one whitespace character).
+        next_line_text: Text of the following line; must start with a
+            lowercase letter or a digit (optionally preceded by one whitespace
+            character) for the hyphen to be removed.
+        line_html: HTML of the current line; this is the string that is edited.
+
+    Returns:
+        ``line_html`` with its last hyphen character removed when both
+        conditions hold, otherwise ``line_html`` unchanged.
+    """
+    # These are members of a character class, so they must not be joined with
+    # "|": inside [...] a "|" is a literal pipe, and a following line that
+    # starts with "|" would wrongly count as a continuation.
+    continuation_chars = r'\p{Ll}\d'
+
+    hyphen_regex = regex.compile(rf'.*[{HYPHENS}]\s?$', regex.DOTALL)
+    next_line_starts_lowercase = regex.match(rf"^\s?[{continuation_chars}]", next_line_text)
+
+    if hyphen_regex.match(line_text) and next_line_starts_lowercase:
+        return replace_last(line_html, rf'[{HYPHENS}]', "")
+    return line_html
+
class Line(Block):
block_type: BlockTypes = BlockTypes.Line
origin: Optional[Literal["pdftext", "surya"]] = None
- def assemble_html(self, child_blocks):
+ def assemble_html(self, document, child_blocks, parent_structure):
template = ""
for c in child_blocks:
template += c.html
+
+ raw_text = remove_tags(template).strip()
+ structure_idx = parent_structure.index(self.id)
+ if structure_idx < len(parent_structure) - 1:
+ next_block_id = parent_structure[structure_idx + 1]
+ next_line = document.get_block(next_block_id)
+ next_line_raw_text = next_line.raw_text(document)
+ template = strip_trailing_hyphens(raw_text, next_line_raw_text, template)
return template
- def render(self, document):
+ def render(self, document, parent_structure):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
- child_content.append(block.render(document))
+ child_content.append(block.render(document, parent_structure))
return BlockOutput(
- html=self.assemble_html(child_content),
+ html=self.assemble_html(document, child_content, parent_structure),
polygon=self.polygon,
id=self.id,
children=[]
diff --git a/marker/v2/schema/text/span.py b/marker/v2/schema/text/span.py
index e30cec4..e9d74e9 100644
--- a/marker/v2/schema/text/span.py
+++ b/marker/v2/schema/text/span.py
@@ -23,10 +23,26 @@ def bold(self):
def italic(self):
return 'italic' in self.formats
- def assemble_html(self, child_blocks):
- if len(self.text) > 3:
+ def assemble_html(self, child_blocks, parent_structure):
+ text = self.text
+ text = text.replace("-\n", "") # Remove hyphenated line breaks
+
+ # Remove trailing newlines
+ replaced_newline = False
+ while len(text) > 0 and text[-1] in ["\n", "\r"]:
+ text = text[:-1]
+ replaced_newline = True
+
+ # Remove leading newlines
+ while len(text) > 0 and text[0] in ["\n", "\r"]:
+ text = text[1:]
+
+ if replaced_newline:
+ text += " "
+
+ if len(text) > 3:
if self.italic:
- return f"{self.text}"
+ return f"{text}"
elif self.bold:
- return f"{self.text}"
- return self.text
+ return f"{text}"
+ return text