Skip to content

Commit

Permalink
Fix ocr heuristics, somewhat fix code
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Nov 19, 2024
1 parent b8301ee commit 7e449a1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 10 deletions.
25 changes: 16 additions & 9 deletions marker/v2/builders/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
from marker.v2.schema.groups.page import PageGroup
from marker.v2.schema.polygon import PolygonBox
from marker.v2.schema.registry import get_block_class
from marker.v2.schema.text.line import Line


class LayoutBuilder(BaseBuilder):
batch_size = None
layout_coverage_min_lines = 1
layout_coverage_threshold = .5

def __init__(self, layout_model, config=None):
self.layout_model = layout_model
Expand Down Expand Up @@ -57,7 +58,7 @@ def add_blocks_to_pages(self, pages: List[PageGroup], layout_results: List[Layou

def merge_blocks(self, document_pages: List[PageGroup], provider_page_lines: ProviderPageLines):
for document_page in document_pages:
provider_lines = provider_page_lines[document_page.page_id]
provider_lines = provider_page_lines.get(document_page.page_id, [])
if not self.check_layout_coverage(document_page, provider_lines):
document_page.text_extraction_method = "surya"
continue
Expand All @@ -67,16 +68,22 @@ def check_layout_coverage(
self,
document_page: PageGroup,
provider_lines: List[ProviderOutput],
coverage_threshold=0.5
):
layout_area = 0
provider_area = 0
covered_blocks = 0
total_blocks = 0
for layout_block_id in document_page.structure:
layout_block = document_page.get_block(layout_block_id)
if layout_block.block_type in [BlockTypes.Figure, BlockTypes.Picture, BlockTypes.Table]:
continue
layout_area += layout_block.polygon.area

total_blocks += 1
intersecting_lines = 0
for provider_line in provider_lines:
provider_area += layout_block.polygon.intersection_area(provider_line.line.polygon)
coverage_ratio = provider_area / layout_area if layout_area > 0 else 0
return coverage_ratio >= coverage_threshold
if layout_block.polygon.intersection_area(provider_line.line.polygon) > 0:
intersecting_lines += 1

if intersecting_lines > self.layout_coverage_min_lines:
covered_blocks += 1

coverage_ratio = covered_blocks / max(total_blocks, 1)
return coverage_ratio >= self.layout_coverage_threshold
4 changes: 3 additions & 1 deletion marker/v2/schema/blocks/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ class Code(Block):
block_type: BlockTypes = BlockTypes.Code

def assemble_html(self, child_blocks, parent_structure):
template = super().assemble_html(child_blocks, parent_structure)
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>\n"
return f"<pre>{template}</pre>"

0 comments on commit 7e449a1

Please sign in to comment.