Skip to content

Commit

Permalink
clean up logic and add heuristic to check if the next text block is in the first quadrant if it's a page break
Browse files Browse the repository at this point in the history
  • Loading branch information
iammosespaulr committed Nov 20, 2024
1 parent dd4db58 commit 86c5234
Showing 1 changed file with 36 additions and 13 deletions.
49 changes: 36 additions & 13 deletions marker/v2/processors/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,34 +25,57 @@ def __call__(self, document: Document):
if not len(block.structure) >= 2: # Skip single lines
continue

column_or_page_break = False
column_break, page_break = False, False
next_block = page.get_next_block(block)
if next_block is not None: # we check for a column break
column_or_page_break = (
column_break = (
next_block.polygon.y_start < block.polygon.y_start and
next_block.polygon.x_start > block.polygon.x_start
)
else: # It's a page break since we don't have a next block in the page
column_or_page_break = True
page_break = True

if not column_or_page_break:
if not (column_break or page_break):
continue

next_block_starts_indented = True
next_block_doc = document.get_next_block(block)
if next_block_doc:
if next_block_doc.block_type not in self.block_types:
next_block_in_first_quadrant = False

if column_break:
# set new block_lines from the next block
new_block_lines = [page.get_block(block_id) for block_id in next_block.structure]
elif page_break and document.pages.index(page) < len(document.pages) - 1:
# if we found a page break and we're not on the last page
next_page = document.get_next_page(page)
if next_page is None:
continue
new_page = document.get_page(next_block_doc.page_id) # the next block can come from the next page
new_block_lines = [new_page.get_block(block_id) for block_id in next_block_doc.structure]
min_x = math.floor(min([l.polygon.x_start for l in new_block_lines]))
next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x
for next_page_block_id in next_page.structure:
if next_page_block_id.block_type in [BlockTypes.PageHeader, BlockTypes.PageFooter]:
continue # skip headers and footers
if next_page_block_id.block_type not in self.block_types:
break # we found a non-text block, so we can stop looking

# we have our text_block, now we set the new block_lines
next_page_block = next_page.get_block(next_page_block_id)
new_block_lines = [next_page.get_block(block_id) for block_id in next_page_block.structure]
next_block_in_first_quadrant = (next_page_block.polygon.x_start < next_page.polygon.width // 2) and \
(next_page_block.polygon.y_start < next_page.polygon.height // 2)
break
else:
continue # we didn't break anywhere so we continue
else:
continue

min_x = math.floor(min([l.polygon.x_start for l in new_block_lines]))
next_block_starts_indented = new_block_lines[0].polygon.x_start > min_x

lines: List[Line] = [page.get_block(block_id) for block_id in block.structure]
max_x = math.floor(max([l.polygon.x_end for l in lines]))

last_line_is_full_width = lines[-1].polygon.x_end >= max_x

last_line_is_hyphentated = regex.compile(r'.*[\p{Ll}|\d][-—¬]\s?$', regex.DOTALL).match(lines[-1].raw_text(document).strip())

if (last_line_is_full_width or last_line_is_hyphentated) and not next_block_starts_indented:
if (last_line_is_full_width or last_line_is_hyphentated) and \
not next_block_starts_indented and \
((next_block_in_first_quadrant and page_break) or column_break):
block.has_continuation = True

0 comments on commit 86c5234

Please sign in to comment.