From 86c5234662e33cab7c91e6bebb3f09bbf1b6a771 Mon Sep 17 00:00:00 2001 From: Moses Paul R Date: Wed, 20 Nov 2024 15:29:33 +0000 Subject: [PATCH] clean up logic and add heuristic to check if the next text block is in the first quadrant if it's a page break --- marker/v2/processors/text.py | 49 ++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/marker/v2/processors/text.py b/marker/v2/processors/text.py index 7b87257..77ec542 100644 --- a/marker/v2/processors/text.py +++ b/marker/v2/processors/text.py @@ -25,34 +25,57 @@ def __call__(self, document: Document): if not len(block.structure) >= 2: # Skip single lines continue - column_or_page_break = False + column_break, page_break = False, False next_block = page.get_next_block(block) if next_block is not None: # we check for a column break - column_or_page_break = ( + column_break = ( next_block.polygon.y_start < block.polygon.y_start and next_block.polygon.x_start > block.polygon.x_start ) else: # It's a page break since we don't have a next block in the page - column_or_page_break = True + page_break = True - if not column_or_page_break: + if not (column_break or page_break): continue next_block_starts_indented = True - next_block_doc = document.get_next_block(block) - if next_block_doc: - if next_block_doc.block_type not in self.block_types: + next_block_in_first_quadrant = False + + if column_break: + # set new block_lines from the next block + new_block_lines = [page.get_block(block_id) for block_id in next_block.structure] + elif page_break and document.pages.index(page) < len(document.pages) - 1: + # if we found a page break and we're not on the last page + next_page = document.get_next_page(page) + if next_page is None: continue - new_page = document.get_page(next_block_doc.page_id) # the next block can come from the next page - new_block_lines = [new_page.get_block(block_id) for block_id in next_block_doc.structure] - min_x = math.floor(min([l.polygon.x_start for l in new_block_lines])) - next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x + for next_page_block_id in next_page.structure: + if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]: + continue # skip headers and footers + if next_page_block_id.block_type not in self.block_types: + break # we found a non-text block, so we can stop looking + + # we have our text_block, now we set the new block_lines + next_page_block = next_page.get_block(next_page_block_id) + new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure] + next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \ + (next_page_block.polygon.y_start < next_page.polygon.height // 2) + break + else: + continue # we didn't break anywhere so we continue + else: + continue + + min_x = math.floor(min([l.polygon.x_start for l in new_block_lines])) + next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x lines: List[Line] = [page.get_block(block_id) for block_id in block.structure] max_x = math.floor(max([l.polygon.x_end for l in lines])) - last_line_is_full_width = lines[-1].polygon.x_end >= max_x + last_line_is_hyphentated = regex.compile(r'.*[\p{Ll}|\d][-—¬]\s?$', regex.DOTALL).match(lines[-1].raw_text(document).strip()) - if (last_line_is_full_width or last_line_is_hyphentated) and not next_block_starts_indented: + if (last_line_is_full_width or last_line_is_hyphentated) and \ + not next_block_starts_indented and \ + ((next_block_in_first_quadrant and page_break) or column_break): block.has_continuation = True