Skip to content

Commit

Permalink
Merge pull request #191 from freelawproject/doctor-update-recap-extract
Browse files Browse the repository at this point in the history
Doctor Fix
  • Loading branch information
flooie authored May 31, 2024
2 parents 5f30530 + 9774695 commit 4009f00
Showing 1 changed file with 9 additions and 7 deletions.
16 changes: 9 additions & 7 deletions doctor/lib/text_extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import re
from statistics import mean

import pdfplumber
from pdfplumber.ctm import CTM
Expand Down Expand Up @@ -38,6 +37,8 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
Use pdfplumber to extract the text of the page, excluding skewed text
(e.g. a stamp of approval) and removing blue text.
Here, strip margin refers only to the top and bottom margins.
:param page: PdfPlumber page
:param strip_margin: a flag to crop out the margin of a document and skewed content
:return: Text from the pdf plumber page
Expand All @@ -47,24 +48,24 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
_, _, width, height = page.bbox
pixels_per_inch = width / 8.5
bbox = (
pixels_per_inch * 1, # 1 inch from left edge
0,
pixels_per_inch * 1, # 1 inch down from top
pixels_per_inch
* 7.5, # 7.5 inches from left edge (1 inch from right)
width, #
pixels_per_inch * 10, # 10 inches from top (1 inch from bottom)
)
doc_text = (
page_text = (
page.crop(bbox)
.filter(is_skewed)
.extract_text(
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
)
)
else:
doc_text = page.extract_text(
page_text = page.extract_text(
layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
)
return doc_text
page_text = remove_excess_whitespace(page_text)
return page_text


def has_images(page: pdfplumber.pdf.Page) -> bool:
Expand Down Expand Up @@ -126,6 +127,7 @@ def adjust_caption_lines(page_text: str) -> str:
row = row.replace(f" {separator}", f"{addition}{separator}")
page.append(row)
return "\n".join(page)
return page_text


def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool:
Expand Down

0 comments on commit 4009f00

Please sign in to comment.