def adjust_caption_lines(
    page_text: str,
    *,
    min_column: int = 30,
    max_column: int = 70,
    min_lines: int = 3,
) -> str:
    """Align the column separators used in a case caption

    Captions on the first page are commonly split into a left and a right
    half by a vertical run of separator characters:

    ) is used in many courts
    § is used in texas courts
    : is used in NY courts

    The separators are tried in that order.  The first separator that
    appears, surrounded by spaces, roughly in the middle of at least
    ``min_lines`` lines wins: those lines are padded with spaces in front of
    the separator so that it sits in the same column on each of them.

    :param page_text: The text of the first page
    :param min_column: Smallest column at which a separator is still
        considered to be in the middle of the line
    :param max_column: Largest column at which a separator is still
        considered to be in the middle of the line
    :param min_lines: Number of matching lines required before any
        alignment is attempted
    :return: The page text, with the caption separators aligned
    """
    for separator in (r"\)", "§", ":"):
        # The greedy leading ``.*`` makes the group capture the LAST
        # space-delimited separator on the line.  Measuring the column from
        # the captured group (rather than searching the line for the last
        # occurrence of the character) keeps a stray ")" or ":" in the
        # right-hand text, e.g. "Case No. 1 (X)", from being mistaken for
        # the separator.
        pattern = re.compile(rf".* +({separator}) .*\n")
        central = []
        for match in pattern.finditer(page_text):
            position = match.start(1)
            column = position - match.start()
            if min_column <= column <= max_column:
                central.append((position, column))

        if len(central) < min_lines:
            continue  # Too few lines use this separator - try the next one

        # Every separator is pushed right to the right-most column found
        target = max(column for _, column in central)

        # Rebuild the text in one pass; all positions refer to the original
        # text, so no running offset has to be maintained.
        pieces = []
        cursor = 0
        for position, column in central:
            pieces.append(page_text[cursor:position])
            pieces.append(" " * (target - column))
            cursor = position
        pieces.append(page_text[cursor:])
        return "".join(pieces)
    return page_text