From 1dd344076c80e3ef409209f105252fa1c4ab282a Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Wed, 20 Nov 2024 16:01:43 +0000 Subject: [PATCH] clean up --- marker/v2/processors/text.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/marker/v2/processors/text.py b/marker/v2/processors/text.py index 77ec5422..5c9e0df5 100644 --- a/marker/v2/processors/text.py +++ b/marker/v2/processors/text.py @@ -42,10 +42,20 @@ def __call__(self, document: Document): next_block_in_first_quadrant = False if column_break: - # set new block_lines from the next block + if next_block.block_type not in self.block_types: + continue + if next_block.structure is None: # This is odd though, why do we have text blocks with no structure? + continue + + # we check for next_block indentation new_block_lines = [page.get_block(block_id) for block_id in next_block.structure] - elif page_break and document.pages.index(page) < len(document.pages) - 1: - # if we found a page break and we're not on the last page + min_x = math.floor(min([l.polygon.x_start for l in new_block_lines])) + next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x + elif page_break: + # we don't care if it's a page break and it's the last page + if not document.pages.index(page) < len(document.pages) - 1: + continue + next_page = document.get_next_page(page) if next_page is None: continue @@ -57,7 +67,14 @@ def __call__(self, document: Document): # we have our text_block, now we set the new block_lines next_page_block = next_page.get_block(next_page_block_id) + if next_page_block.structure is None: + break # This is odd though, why do we have text blocks with no structure? + + # check if the new block is indented new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure] + min_x = math.floor(min([l.polygon.x_start for l in new_block_lines])) + next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x + next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \ (next_page_block.polygon.y_start < next_page.polygon.height // 2) break @@ -66,9 +83,6 @@ def __call__(self, document: Document): else: continue - min_x = math.floor(min([l.polygon.x_start for l in new_block_lines])) - next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x - lines: List[Line] = [page.get_block(block_id) for block_id in block.structure] max_x = math.floor(max([l.polygon.x_end for l in lines])) last_line_is_full_width = lines[-1].polygon.x_end >= max_x