From 86c5234662e33cab7c91e6bebb3f09bbf1b6a771 Mon Sep 17 00:00:00 2001
From: Moses Paul R <iammosespaulr@gmail.com>
Date: Wed, 20 Nov 2024 15:29:33 +0000
Subject: [PATCH] clean up logic and add heuristic: on a page break, check
 that the next text block is in the first quadrant

---
 marker/v2/processors/text.py | 49 ++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/marker/v2/processors/text.py b/marker/v2/processors/text.py
index 7b87257..77ec542 100644
--- a/marker/v2/processors/text.py
+++ b/marker/v2/processors/text.py
@@ -25,34 +25,57 @@ def __call__(self, document: Document):
                     if not len(block.structure) >= 2:  # Skip single lines
                         continue
 
-                    column_or_page_break = False
+                    column_break, page_break = False, False
                     next_block = page.get_next_block(block)
                     if next_block is not None:  # we check for a column break
-                        column_or_page_break = (
+                        column_break = (
                             next_block.polygon.y_start < block.polygon.y_start and
                             next_block.polygon.x_start > block.polygon.x_start
                         )
                     else:  # It's a page break since we don't have a next block in the page
-                        column_or_page_break = True
+                        page_break = True
 
-                    if not column_or_page_break:
+                    if not (column_break or page_break):
                         continue
 
                     next_block_starts_indented = True
-                    next_block_doc = document.get_next_block(block)
-                    if next_block_doc:
-                        if next_block_doc.block_type not in self.block_types:
+                    next_block_in_first_quadrant = False
+
+                    if column_break:
+                        # set new block_lines from the next block
+                        new_block_lines = [page.get_block(block_id) for block_id in next_block.structure]
+                    elif page_break and document.pages.index(page) < len(document.pages) - 1:
+                        # if we found a page break and we're not on the last page
+                        next_page = document.get_next_page(page)
+                        if next_page is None:
                             continue
-                        new_page = document.get_page(next_block_doc.page_id)  # the next block can come from the next page
-                        new_block_lines = [new_page.get_block(block_id) for block_id in next_block_doc.structure]
-                        min_x = math.floor(min([l.polygon.x_start for l in new_block_lines]))
-                        next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x
+                        for next_page_block_id in next_page.structure:
+                            if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]:
+                                continue  # skip headers and footers
+                            if next_page_block_id.block_type not in self.block_types:
+                                break  # we found a non-text block, so we can stop looking
+
+                            # we have our text_block, now we set the new block_lines
+                            next_page_block = next_page.get_block(next_page_block_id)
+                            new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure]
+                            next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
+                                (next_page_block.polygon.y_start < next_page.polygon.height // 2)
+                            break
+                        else:
+                            continue  # we didn't break anywhere so we continue
+                    else:
+                        continue
+
+                    min_x = math.floor(min([l.polygon.x_start for l in new_block_lines]))
+                    next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x
 
                     lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
                     max_x = math.floor(max([l.polygon.x_end for l in lines]))
-
                     last_line_is_full_width = lines[-1].polygon.x_end >= max_x
+
                     last_line_is_hyphentated = regex.compile(r'.*[\p{Ll}|\d][-—¬]\s?$', regex.DOTALL).match(lines[-1].raw_text(document).strip())
 
-                    if (last_line_is_full_width or last_line_is_hyphentated) and not next_block_starts_indented:
+                    if (last_line_is_full_width or last_line_is_hyphentated) and \
+                        not next_block_starts_indented and \
+                            ((next_block_in_first_quadrant and page_break) or column_break):
                         block.has_continuation = True