Skip to content

Commit

Permalink
feat(ocr_utils): Caption Adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie committed May 15, 2024
1 parent 6dd78f1 commit 6c0fef0
Showing 1 changed file with 57 additions and 1 deletion.
58 changes: 57 additions & 1 deletion doctor/lib/ocr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,18 @@ def validate_ocr_text(row: pd.Series, img: Image) -> pd.Series:
row["text"] = new_words
else:
row["text"] = "□" * len(row["text"])
elif row["conf"] < 10:
# if the confidence is under 10 and its just three characters - box it
row["text"] = "□"
elif (
row["conf"] < 20
and len(row["text"]) == 1
and not row["text"].isalnum()
):
# Artifacts from scans often appear as lone symbols
# if conf is low and they are all alone drop them
row["text"] = " "

return row["text"] + " "


Expand Down Expand Up @@ -189,7 +201,9 @@ def insert_indentation(row: pd.Series, state: dict) -> dict:
return state


def format_text_by_block(block: pd.DataFrame, img: Image) -> str:
def format_text_by_block(
block: pd.DataFrame, img: Image
) -> str:
"""Process blocks of text
Insert whitespace and validate the OCR results
Expand Down Expand Up @@ -250,10 +264,52 @@ def process_page_with_ocr(page: pdfplumber.PDF.pages) -> str:
page_text = ""
for block in ordered_page_blocks:
page_text += format_text_by_block(block, image)

if page.page_number == 1:
page_text = adjust_caption_lines(page_text)
page_text = re.sub(r"^\s+\n|$", "", page_text, 1, flags=re.MULTILINE)
return page_text


def adjust_caption_lines(page_text: str) -> str:
    """Align the column separator used in a court caption

    Caption blocks place a separator character between the party names and
    the case information. OCR output leaves that separator at slightly
    different columns on each line; this pads the lines with spaces so the
    separator sits at the same column on every caption line.

    § is used in texas courts
    : is used in NY courts
    ) is used in many courts

    Separators are tried in the order ")", "§", ":". The first one found on
    at least three lines, at a column between 30 and 70 inclusive, is
    aligned and the result is returned immediately; later separators are
    not processed.

    :param page_text: The text of the first page
    :return: The page text with the caption separators aligned, or the
        unchanged text if no separator qualifies
    """
    for separator in (r"\)", "§", ":"):
        # Capture the separator itself so its column comes from the text the
        # pattern actually matched (spaces before, one space after) rather
        # than from the last occurrence of the character on the line, which
        # may belong to unrelated text such as "12(b)" or "No.:".
        pattern = re.compile(rf".* +({separator}) .*\n")

        # (absolute offset of the separator, column within its own line)
        central = []
        for match in pattern.finditer(page_text):
            column = match.start(1) - match.start()
            # Only separators near the middle of the line count as caption
            # dividers.
            if 30 <= column <= 70:
                central.append((match.start(1), column))

        if len(central) < 3:
            continue  # Skip this separator if less than 3 matches found

        # Every separator is pushed right to the right-most column found
        target = max(column for _, column in central)

        # Rebuild the text in one pass, inserting the padding immediately
        # before each separator.
        pieces = []
        cursor = 0
        for position, column in central:
            pieces.append(page_text[cursor:position])
            pieces.append(" " * (target - column))
            cursor = position
        pieces.append(page_text[cursor:])
        return "".join(pieces)
    return page_text


def page_needs_ocr(page: pdfplumber.pdf.Page, page_text: str) -> bool:
"""Does the page need OCR
Expand Down

0 comments on commit 6c0fef0

Please sign in to comment.