diff --git a/marker/v2/builders/layout.py b/marker/v2/builders/layout.py index 25c2249..dd66f41 100644 --- a/marker/v2/builders/layout.py +++ b/marker/v2/builders/layout.py @@ -12,11 +12,12 @@ from marker.v2.schema.groups.page import PageGroup from marker.v2.schema.polygon import PolygonBox from marker.v2.schema.registry import get_block_class -from marker.v2.schema.text.line import Line class LayoutBuilder(BaseBuilder): batch_size = None + layout_coverage_min_lines = 1 + layout_coverage_threshold = .5 def __init__(self, layout_model, config=None): self.layout_model = layout_model @@ -57,7 +58,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines): for document_page in document_pages: - provider_lines = provider_page_lines[document_page.page_id] + provider_lines = provider_page_lines.get(document_page.page_id, []) if not self.check_layout_coverage(document_page, provider_lines): document_page.text_extraction_method = "surya" continue @@ -67,16 +68,22 @@ def check_layout_coverage( self, document_page: PageGroup, provider_lines: List[ProviderOutput], - coverage_threshold=0.5 ): - layout_area = 0 - provider_area = 0 + covered_blocks = 0 + total_blocks = 0 for layout_block_id in document_page.structure: layout_block = document_page.get_block(layout_block_id) if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table]: continue - layout_area += layout_block.polygon.area + + total_blocks += 1 + intersecting_lines = 0 for provider_line in provider_lines: - provider_area += layout_block.polygon.intersection_area(provider_line.line.polygon) - coverage_ratio = provider_area / layout_area if layout_area > 0 else 0 - return coverage_ratio >= coverage_threshold + if layout_block.polygon.intersection_area(provider_line.line.polygon) > 0: + intersecting_lines += 1 + + if intersecting_lines > self.layout_coverage_min_lines: + covered_blocks += 1 + + coverage_ratio = covered_blocks / max(total_blocks, 1) + return coverage_ratio >= self.layout_coverage_threshold diff --git a/marker/v2/schema/blocks/code.py b/marker/v2/schema/blocks/code.py index ca8e6e1..ad79665 100644 --- a/marker/v2/schema/blocks/code.py +++ b/marker/v2/schema/blocks/code.py @@ -6,5 +6,7 @@ class Code(Block): block_type: BlockTypes = BlockTypes.Code def assemble_html(self, child_blocks, parent_structure): - template = super().assemble_html(child_blocks, parent_structure) + template = "" + for c in child_blocks: + template += f"\n" return f"
{template}
"