diff --git a/marker/v2/providers/pdf.py b/marker/v2/providers/pdf.py
index 47004cb..25db8b1 100644
--- a/marker/v2/providers/pdf.py
+++ b/marker/v2/providers/pdf.py
@@ -35,7 +35,10 @@ def __len__(self) -> int:
def __del__(self):
self.doc.close()
- def font_flags_to_format(self, flags: int) -> Set[str]:
+ def font_flags_to_format(self, flags: int | None) -> Set[str]:
+ if flags is None:
+ return {"plain"}
+
flag_map = {
1: "FixedPitch",
2: "Serif",
@@ -72,8 +75,11 @@ def font_flags_to_format(self, flags: int) -> Set[str]:
formats.add("plain")
return formats
- def font_names_to_format(self, font_name: str) -> Set[str]:
+ def font_names_to_format(self, font_name: str | None) -> Set[str]:
formats = set()
+ if font_name is None:
+ return formats
+
if "bold" in font_name.lower():
formats.add("bold")
if "ital" in font_name.lower():
@@ -97,16 +103,19 @@ def setup(self):
for line in block["lines"]:
spans: List[Span] = []
for span in line["spans"]:
- if not span["text"].strip():
+ if not span["text"]:
continue
font_formats = self.font_flags_to_format(span["font"]["flags"]).union(self.font_names_to_format(span["font"]["name"]))
+ font_name = span["font"]["name"] or "Unknown"
+ font_weight = span["font"]["weight"] or 0
+ font_size = span["font"]["size"] or 0
spans.append(
Span(
polygon=PolygonBox.from_bbox(span["bbox"]),
- text=span["text"].strip(),
- font=span["font"]["name"],
- font_weight=span["font"]["weight"],
- font_size=span["font"]["size"],
+ text=span["text"],
+ font=font_name,
+ font_weight=font_weight,
+ font_size=font_size,
minimum_position=span["char_start_idx"],
maximum_position=span["char_end_idx"],
formats=list(font_formats),
diff --git a/marker/v2/schema/blocks/base.py b/marker/v2/schema/blocks/base.py
index 4218300..4ba0231 100644
--- a/marker/v2/schema/blocks/base.py
+++ b/marker/v2/schema/blocks/base.py
@@ -100,21 +100,21 @@ def raw_text(self, document) -> str:
text += "\n"
return text
- def assemble_html(self, child_blocks):
+ def assemble_html(self, child_blocks, parent_structure=None):
template = ""
for c in child_blocks:
template += f"
{template}
" diff --git a/marker/v2/schema/document.py b/marker/v2/schema/document.py index c8c837c..9589051 100644 --- a/marker/v2/schema/document.py +++ b/marker/v2/schema/document.py @@ -35,7 +35,7 @@ def assemble_html(self, child_blocks): def render(self): child_content = [] for page in self.pages: - child_content.append(page.render(self)) + child_content.append(page.render(self, None)) return DocumentOutput( children=child_content, diff --git a/marker/v2/schema/groups/list.py b/marker/v2/schema/groups/list.py index 5220975..223bc70 100644 --- a/marker/v2/schema/groups/list.py +++ b/marker/v2/schema/groups/list.py @@ -4,6 +4,6 @@ class ListGroup(Block): block_type: str = "ListGroup" - def assemble_html(self, child_blocks): - template = super().assemble_html(child_blocks) + def assemble_html(self, child_blocks, parent_structure): + template = super().assemble_html(child_blocks, parent_structure) return f"