diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e60ac21..647cabe 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/DEVELOPING.md b/DEVELOPING.md index 2eebdd9..b81ee32 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -19,7 +19,7 @@ If you want to see debug logs, set `DEBUG` to `True` in `settings.py`. Once the above compose file is running, you can use the `mock_web_app` container to run the tests against the `doctor` container: - docker exec -it mock_web_app_doctor python3 -m unittest doctor.tests + docker exec -it mock_web_app python3 -m unittest doctor.tests ## Building Images diff --git a/README.md b/README.md index 7b8538e..c5da289 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,25 @@ Valid requests will receive a JSON response with the following keys: - `extracted_by_ocr`: Whether OCR was needed and used during processing. - `page_count`: The number of pages, if it applies. +### Endpoint: /extract/recap/text/ + +Given a RECAP pdf, extract out the text using PDF Plumber, OCR or a combination of the two + +Parameters: + + - `strip_margin`: Whether doctor should crop the edges of the recap document during processing. With PDF plumber it will ignore traditional 1 inch margin. With an OCR it lowers the threshold for hiding OCR gibberish. To enable it, set strip_margin to `True`: + +```bash +curl 'http://localhost:5050/extract/recap/text/?strip_margin=True' \ + -X 'POST' \ + -F "file=@doctor/recap_extract/gov.uscourts.cacd.652774.40.0.pdf" +``` + +Valid requests will receive a JSON response with the following keys: + + - `content`: The utf-8 encoded text of the file + - `extracted_by_ocr`: Whether OCR was needed and used during processing. 
"""Extract text from RECAP PDFs.

Uses pdfplumber for layout-aware text extraction and falls back to
tesseract OCR when a page looks like it needs it, then post-processes
the output to reduce artifacts common in scanned legal documents.
"""

import re
from statistics import mean

import pdfplumber
from pdfplumber.ctm import CTM
import pytesseract
from pytesseract import Output
import pandas as pd
from PIL import Image


def is_skewed(obj: dict) -> bool:
    """Check whether a pdfplumber word/char dict is skewed.

    CTM stands for current transformation matrix; pdfplumber exposes it so
    the angle of text can be computed.  Skewed text traditionally comes from
    circular stamps, or from the perpendicular margin text seen in Ninth
    Circuit documents — both garble the extracted content.

    :param obj: dictionary from pdfplumber for each word
    :return: True if the text should be kept (i.e. it is not skewed)
    """
    if (matrix := obj.get("matrix")) is None:
        # No transformation matrix available: keep the word.
        return True
    my_char_ctm = CTM(*matrix)
    if my_char_ctm.skew_x != 0:
        return False
    return True


def get_page_text(page: pdfplumber.page.Page, strip_margin: bool) -> str:
    """Extract text from a page with pdfplumber.

    Optionally crops out a traditional one-inch margin (computed from an
    assumed 8.5" page width) and drops skewed words such as stamps.

    :param page: pdfplumber page
    :param strip_margin: crop the page margin and remove skewed content
    :return: text extracted from the page
    """
    if strip_margin:
        # Crop to the region inside a 1" margin on US-letter-width paper.
        # NOTE(review): the bottom bound assumes an 11" page height — on
        # legal-size (14") paper this crops more than one inch; confirm.
        _, _, width, height = page.bbox
        pixels_per_inch = width / 8.5
        bbox = (
            pixels_per_inch * 1,  # 1 inch from left edge
            pixels_per_inch * 1,  # 1 inch down from top
            pixels_per_inch
            * 7.5,  # 7.5 inches from left edge (1 inch from right)
            pixels_per_inch * 10,  # 10 inches from top (1 inch from bottom)
        )
        doc_text = (
            page.crop(bbox)
            .filter(is_skewed)
            .extract_text(
                layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
            )
        )
    else:
        doc_text = page.extract_text(
            layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
        )
    return doc_text


def has_images(page: pdfplumber.page.Page) -> bool:
    """Does the page have images large enough to contain text?

    :param page: pdfplumber page
    :return: True if the page contains images above the size threshold
    """
    # Tiny images (<= 10px in either dimension) cannot hold legible text.
    return any(
        image["width"] > 10 and image["height"] > 10 for image in page.images
    )


def has_text_annotations(page: pdfplumber.page.Page) -> bool:
    """Does the page have annotations which could contain text?

    :param page: pdfplumber page
    :return: True if the page has FreeText or Widget annotations
    """
    if page.annots:
        anno_types = [
            str(annot.get("data").get("Subtype")) for annot in page.annots
        ]
        if "/'FreeText'" in anno_types or "/'Widget'" in anno_types:
            return True
    return False


def adjust_caption_lines(page_text: str) -> str:
    """Realign the ) or : or § characters used to frame a case caption.

    § is used in Texas courts, : in NY courts, ) in many other courts.
    The first separator that appears at least three times in the central
    band of the page (columns 30–70) is vertically aligned to its
    right-most occurrence.

    :param page_text: the text of the first page
    :return: the page text, realigned if a caption was detected
    """
    for separator in [r")", "§", ":"]:
        pattern = rf"(.* +{re.escape(separator)} .*\n)"
        matches = list(re.finditer(pattern, page_text))
        central_matches = [
            match.group().rindex(separator)
            for match in matches
            if 30 <= match.group().rindex(separator) <= 70
        ]
        if len(central_matches) < 3:
            continue  # Skip this separator if fewer than 3 matches found
        # Pad every occurrence out to the right-most separator column.
        longest = max(central_matches)
        page = []
        for row in page_text.splitlines():
            index = row.find(f" {separator}")
            addition = (longest - index) * " "
            row = row.replace(f" {separator}", f"{addition}{separator}")
            page.append(row)
        return "\n".join(page)
    # FIX: previously fell through with an implicit `return None` when no
    # separator qualified, which made cleanup_content() emit "None\n".
    return page_text


def page_needs_ocr(page: pdfplumber.page.Page, page_text: str) -> bool:
    """Decide whether the page needs OCR.

    Triggers on empty extractions, unmapped CID glyphs, text annotations,
    embedded images, or an unusually high curve count (scanned artwork).

    :param page: pdfplumber page
    :param page_text: content already extracted from the page
    :return: True if the page should be OCRed
    """
    return (
        page_text.strip() == ""
        or "(cid:" in page_text
        or has_text_annotations(page)
        or has_images(page)
        or len(page.curves) > 10
    )


def convert_pdf_page_to_image(
    page: pdfplumber.page.Page, strip_margin: bool
) -> Image.Image:
    """Render the page to an image, cropping the margin if requested.

    A looser half-inch margin is used here (vs. 1" in get_page_text)
    because OCR gibberish-filtering in get_word() handles the rest.

    :param page: the pdf page
    :param strip_margin: whether to crop the margin
    :return: the (possibly cropped) page image
    """
    img = page.to_image(resolution=300)
    _, _, w, h = page.bbox
    width = w * img.scale

    if strip_margin:
        pixels_per_inch = width / 8.5
        bbox = (
            pixels_per_inch * 0.5,  # .5" from left edge
            pixels_per_inch * 0.5,  # .5" down from top
            pixels_per_inch * 8,  # 8" from left edge (.5" from right)
            pixels_per_inch * 10.5,  # 10.5" from top (.5" from bottom)
        )
        image = img.original.crop(bbox)
    else:
        image = img.original
    return image


def ocr_image_to_data(image: Image.Image) -> list[pd.DataFrame]:
    """Perform OCR on a page image and return per-block word data.

    :param image: PIL image of the pdf page
    :return: a list of DataFrames, one per tesseract text block, ordered
        top to bottom, each holding the OCR word data for that block
    """
    # Tesseract configuration:
    # - `-c preserve_interword_spaces=1`: keep spaces between words as they
    #   appear in the image.
    # - `-c tessedit_do_invert=0`: do not invert the image colors.
    # - `--psm 6`: page segmentation mode 6 — assume a single uniform block
    #   of text.
    # - `-l eng`: use the English language model.
    #
    # Reference:
    # https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc
    data_dict = pytesseract.image_to_data(
        # FIX: was `preserve_interword_spaces=1x1`, which is not a valid
        # value for this boolean variable and contradicted the comment.
        image,
        config="-c preserve_interword_spaces=1 -c tessedit_do_invert=0 --psm 6 -l eng",
        output_type=Output.DICT,
    )
    df = pd.DataFrame(data_dict)
    # conf == -1 marks structural rows (page/block/line), not words.
    filtered_data = df[(df.conf != -1)]
    block_ids = (
        filtered_data.groupby("block_num")
        .first()
        .sort_values("top")
        .index.tolist()
    )
    return [
        filtered_data[filtered_data["block_num"] == block]
        for block in block_ids
    ]


def extract_with_ocr(page: pdfplumber.page.Page, strip_margin: bool) -> str:
    """Extract the page content using OCR.

    :param page: pdfplumber page
    :param strip_margin: whether to trim the margins
    :return: the extracted content for the page
    """
    image = convert_pdf_page_to_image(page, strip_margin)
    data = ocr_image_to_data(image)
    content = ""
    prev = {}
    for words in data:
        for index, word in words.iterrows():
            # Reconstruct layout: whitespace first, then the word itself.
            content = insert_whitespace(content, word, prev)
            content += get_word(word, image.size[0], strip_margin)
            prev = word
    content = cleanup_content(content, page.page_number)
    return content


def insert_whitespace(content: str, word: dict, prev: dict) -> str:
    """Append the whitespace implied by the gap between two OCR words.

    :param content: the text extracted so far
    :param word: the current OCR word record
    :param prev: the previously extracted word record (empty dict at start)
    :return: the content with whitespace appended
    """
    is_new_line = prev.get("line_num", 0) != word["line_num"]
    is_new_par = prev.get("par_num", 0) != word["par_num"]
    prev_end = prev.get("left", 1) + prev.get("width", 1)

    # Vertical whitespace: blank line for large gaps, newline otherwise.
    if is_new_line or is_new_par:
        vertical_gap = word["top"] - (
            prev.get("top", 0) + prev.get("height", 0)
        )
        content += "\n\n" if vertical_gap > 100 else "\n"
        prev_end = 0

    # Horizontal whitespace: ~25px per character at 300 dpi.
    content += " " * int(((word["left"] - prev_end) / 25))
    return content


def get_word(word_dict: dict, width: float, strip_margin: bool) -> str:
    """Filter one OCR word and return it with a trailing space.

    Uses tesseract's confidence value plus the word's position to decide
    whether it is real text, a margin artifact (returned as spaces), or
    unreadable-but-real content (returned as □ placeholders).

    :param word_dict: the word record from tesseract
    :param width: the pixel width of the page image
    :param strip_margin: whether margin stripping is enabled
    :return: the (possibly masked) word followed by a space
    """
    pixels_per_inch = width / 8.5
    if strip_margin:
        left_margin = 1 * pixels_per_inch
        right_margin = 7.5 * pixels_per_inch
    else:
        left_margin = 0.5 * pixels_per_inch
        right_margin = 8.0 * pixels_per_inch

    word = word_dict["text"]
    conf = word_dict["conf"]

    no_confidence = 0
    very_low_confidence = 5
    low_confidence = 40
    short_word_len = 3
    long_word_len = 20

    if (
        word_dict["left"] + word_dict["width"] < left_margin
        and conf < low_confidence
    ):
        # Low-confidence word that ends entirely inside the left margin:
        # almost certainly an artifact, so blank it out.
        word = " " * len(word)
    elif (conf == no_confidence and len(word) <= short_word_len) or word_dict[
        "left"
    ] == 0:
        # Zero confidence on a short word, or a word starting at the very
        # left edge of the paper: likely an artifact, blank it out.
        word = " " * len(word)
    elif conf < very_low_confidence and (
        len(word) <= short_word_len or len(word) > long_word_len
    ):
        # Very low confidence on a very short or very long word: probably
        # real content we cannot read (stamps, caption lines), so emit
        # placeholder boxes of the same length.
        word = "□" * len(word)
    elif conf < low_confidence and word_dict["left"] > right_margin:
        # Low-confidence word starting in the right margin: likely bad OCR,
        # emit placeholder boxes.
        word = "□" * len(word)

    return f"{word} "


def cleanup_content(content: str, page_number: int) -> str:
    """Reduce legal-document line clutter in extracted text.

    1. Removes floating pipes ('|') left at line ends.
    2. Removes single-character artifacts trailing >=10 spaces at line ends.
    3. Shifts the whole page left by the common leading whitespace.
    4. On page 1, realigns caption separator columns.

    :param content: the extracted page content
    :param page_number: the page number (caption alignment only on page 1)
    :return: the cleaned-up text, terminated with a newline
    """
    # Remove floating pipes at the right edge.
    content = re.sub(r"\s{4,}\| $", "", content, flags=re.MULTILINE)
    # Remove single-character artifacts at the right edge.
    content = re.sub(r"\s{10,}[a-zA-Z0-9|] $", "", content, flags=re.MULTILINE)
    # Shift text left where possible and trim leading/trailing blank lines.
    content = remove_excess_whitespace(content)
    if page_number == 1:
        content = adjust_caption_lines(content)
    return f"{content}\n"


def remove_excess_whitespace(document: str) -> str:
    """Remove excess whitespace introduced by OCR.

    Strips blank lines at the start/end of the document and shifts all
    lines left by their common leading indentation.

    :param document: text of the document
    :return: document with excess whitespace removed
    """
    m = re.findall(r"(^ +)", document, re.MULTILINE)
    if m:
        # min() of equal-character strings picks the shortest indent run.
        shift_left = len(min(m))
        pattern = f"(^ {{{shift_left}}})"
        document = re.sub(pattern, "", document, flags=re.MULTILINE)
    document = re.sub(r"^ +$", "", document, flags=re.MULTILINE)
    return document.strip("\n")
DoctorUnicodeDecodeError, force_bytes, @@ -621,3 +627,26 @@ def get_document_number_from_pdf(path: str) -> str: return "" document_number = [dn for dn in document_number_matches[0] if dn] return clean_document_number(document_number[0]) + + +def extract_recap_pdf( + filepath: str, + strip_margin: bool = False, +) -> tuple[str, bool]: + """Extract from RECAP PDF + + :param filepath: The path to the PDF + :param strip_margin: Whether to remove 1 inch margin from text extraction + :return: A tuple containing the text and a boolean indicating ocr usage + """ + content = "" + extracted_by_ocr = False + with pdfplumber.open(filepath) as pdf: + for page in pdf.pages: + page_text = get_page_text(page, strip_margin=strip_margin) + if page_needs_ocr(page, page_text): + extracted_by_ocr = True + page_text = extract_with_ocr(page, strip_margin=strip_margin) + content += f"\n{page_text}" + content = remove_excess_whitespace(content) + return content, extracted_by_ocr diff --git a/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf new file mode 100644 index 0000000..0741378 Binary files /dev/null and b/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf differ diff --git a/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf new file mode 100644 index 0000000..2b0184a Binary files /dev/null and b/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf differ diff --git a/doctor/tests.py b/doctor/tests.py index 2ad282c..f285ec0 100644 --- a/doctor/tests.py +++ b/doctor/tests.py @@ -3,6 +3,7 @@ import re import glob import unittest +from unittest.mock import patch from pathlib import Path from tempfile import NamedTemporaryFile from zipfile import ZipFile @@ -10,6 +11,13 @@ import eyed3 import requests +from doctor.lib.text_extraction import ( + insert_whitespace, + get_word, + 
remove_excess_whitespace, + cleanup_content, + adjust_caption_lines, +) from doctor.lib.utils import make_file, make_buffer asset_path = f"{Path.cwd()}/doctor/test_assets" @@ -24,6 +32,64 @@ def test_heartbeat(self): ) +class RECAPExtractionTests(unittest.TestCase): + def test_recap_extraction(self): + """Can we extract from the new recap text endpoint""" + files = make_file( + filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" + ) + params = {"strip_margin": False} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertTrue( + response.json()["extracted_by_ocr"], msg="Not extracted correctly" + ) + self.assertEqual( + "aséakOS- 08-0220 A25BA BAD GDoonene 2627 Filed OL/2B/DE0IP adgeahefi2of 2", + first_line, + msg="Wrong Text", + ) + + def test_recap_extraction_with_strip_margin(self): + """Can we extract from the new recap text endpoint with strip margin?""" + files = make_file( + filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" + ) + params = {"strip_margin": True} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertEqual( + "1 || DONALD W. CARLSON [Bar No. 
79258]", + first_line, + msg="Wrong Text", + ) + + def test_strip_margin_without_ocr(self): + """Can we extract from the new recap text endpoint with strip margin?""" + files = make_file( + filename="recap_extract/gov.uscourts.cacd.652774.40.0.pdf" + ) + params = {"strip_margin": True} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertEqual("1", first_line, msg="Wrong Text") + + class ExtractionTests(unittest.TestCase): def test_pdf_to_text(self): """""" @@ -232,7 +298,6 @@ def test_mime_type(self): files=files, params=params, ).json() - print(response) self.assertEqual( response["mimetype"], "application/pdf", @@ -489,5 +554,267 @@ def test_pdf_400_mime(self): self.assertEqual(response.status_code, 400, msg="Wrong validation") +class TestRecapWhitespaceInsertions(unittest.TestCase): + """Test our whitespace insertion code""" + + def test_insert_whitespace_new_line(self): + content = "foo" + word = { + "line_num": 2, + "par_num": 1, + "left": 50, + "top": 200, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n ") + + def test_insert_whitespace_new_paragraph(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 2, + "left": 50, + "top": 200, + "width": 10, + "height": 20, + } + prev = { + "line_num": 2, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n ") + + def test_insert_whitespace_vertical_gap(self): + content = "foo" + word = { + "line_num": 2, + "par_num": 1, + "left": 50, + "top": 300, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 
1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n\n ") + + def test_insert_whitespace_horizontal_gap(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 1, + "left": 200, + "top": 100, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo ") + + def test_insert_whitespace_no_gap(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 1, + "left": 50, + "top": 100, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 40, + "top": 100, + "width": 10, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo") + + +class TestOCRConfidenceTests(unittest.TestCase): + """Test our OCR confidence checking functions.""" + + def test_confidence_zero(self): + word_dict = {"text": "foo", "conf": 0, "left": 10, "width": 30} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + def test_confidence_low_and_in_margin(self): + word_dict = {"text": "foo", "conf": 30, "left": 5, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + def test_confidence_below_threshold_short_word(self): + word_dict = {"text": "foo", "conf": 3, "left": 200, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, "□□□ ") + + def test_confidence_below_threshold_long_word(self): + word_dict = { + "text": "foobarbazfoobarbazfoobar", + "conf": 3, + "left": 200, + "width": 200, + } + result = get_word(word_dict, 612, True) + self.assertEqual(result, "□□□□□□□□□□□□□□□□□□□□□□□□ ") + + def test_confidence_below_threshold_in_right_margin(self): + word_dict = {"text": "foo", "conf": 30, "left": 580, "width": 10} + result = get_word(word_dict, 612, True) + 
self.assertEqual(result, "□□□ ") + + def test_valid_word_high_confidence(self): + word_dict = {"text": "foo", "conf": 90, "left": 50, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, "foo ") + + def test_word_on_left_edge(self): + word_dict = {"text": "foo", "conf": 50, "left": 0, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + +class TestWhiteSpaceRemoval(unittest.TestCase): + + def test_left_shift(self): + """Can we properly shift our text left?""" + document = """ + foo + bar + foo + bar""" + expected_result = """ foo +bar +foo +bar""" + result = remove_excess_whitespace(document) + self.assertEqual(result, expected_result) + + def test_left_shift_when_artifact_exists(self): + """Shift left once""" + document = """ + foo + bar + | foo + bar""" + expected_result = """ foo + bar +| foo + bar""" + result = remove_excess_whitespace(document) + self.assertEqual(result, expected_result) + + +class TestCleanupContent(unittest.TestCase): + + def setUp(self): + # Patch the functions before each test method + patcher1 = patch( + "doctor.lib.text_extraction.adjust_caption_lines", + side_effect=lambda x: x, + ) + patcher2 = patch( + "doctor.lib.text_extraction.remove_excess_whitespace", + side_effect=lambda x: x, + ) + self.mock_adjust = patcher1.start() + self.mock_remove_whitespace = patcher2.start() + self.addCleanup(patcher1.stop) + self.addCleanup(patcher2.stop) + + def test_remove_floating_pipes(self): + """Can we remove a pipe""" + content = "This is a test line | \nAnother line" + expected_result = "This is a test line\nAnother line\n" + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_remove_floating_artifacts_right_side(self): + """Can we remove an artifact on the far right""" + content = "This is a test line e \nAnother line" + expected_result = "This is a test line\nAnother line\n" + result = cleanup_content(content, 2) + 
self.assertEqual(result, expected_result) + + def test_remove_floating_pipes_and_artifacts(self): + """Test to remove just the period""" + content = "This is a test line | and the content continues\nThis is another test line e \nFinal line" + expected_result = "This is a test line | and the content continues\nThis is another test line\nFinal line\n" + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_no_floating_pipes_or_artifacts(self): + """Test that no floating pipes are an issue""" + content = ( + "This is a test line JW-6\nAnother line\n" + ) + expected_result = ( + "This is a test line JW-6\nAnother line\n\n" + ) + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_adjust_caption(self): + """Test if we can align the caption correctly""" + content = """ 10 + LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS + ) + oe ) PROPOSED} ORDER AWARDING + 12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT + ) ATTORNEY FEES AND EXPENSES + 13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d) + NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28 + 14 || Commissioner of Social Security, ) U.S.C. § 1920 + 15 Defendant ) + 16 ) """ + + expected_result = """ 10 + LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS + ) + oe ) PROPOSED} ORDER AWARDING + 12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT + ) ATTORNEY FEES AND EXPENSES + 13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d) + NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28 + 14 || Commissioner of Social Security, ) U.S.C. 
§ 1920 + 15 Defendant ) + 16 ) """ + content = adjust_caption_lines(content) + self.assertEqual(expected_result, content) + + if __name__ == "__main__": unittest.main() diff --git a/doctor/urls.py b/doctor/urls.py index 54e0788..e151bc6 100644 --- a/doctor/urls.py +++ b/doctor/urls.py @@ -10,6 +10,11 @@ views.extract_doc_content, name="convert-doc-to-text", ), + path( + "extract/recap/text/", + views.extract_recap_document, + name="extract-recap-document", + ), path("convert/image/pdf/", views.image_to_pdf, name="image-to-pdf"), path("convert/images/pdf/", views.images_to_pdf, name="images-to-pdf"), path("convert/pdf/thumbnail/", views.make_png_thumbnail, name="thumbnail"), diff --git a/doctor/views.py b/doctor/views.py index cc2cbfa..16b1de6 100644 --- a/doctor/views.py +++ b/doctor/views.py @@ -48,6 +48,7 @@ rasterize_pdf, set_mp3_meta_data, strip_metadata_from_bytes, + extract_recap_pdf, ) @@ -76,6 +77,35 @@ def image_to_pdf(request) -> HttpResponse: return HttpResponse(cleaned_pdf_bytes) +def extract_recap_document(request) -> JsonResponse: + """Extract Recap Documents + + :param request: The request object + :return: JsonResponse + """ + form = DocumentForm(request.GET, request.FILES) + if not form.is_valid(): + return JsonResponse( + { + "err": "Failed validation", + }, + status=BAD_REQUEST, + ) + filepath = form.cleaned_data["fp"] + strip_margin = form.cleaned_data["strip_margin"] + content, extracted_by_ocr = extract_recap_pdf( + filepath=filepath, + strip_margin=strip_margin, + ) + cleanup_form(form) + return JsonResponse( + { + "content": content, + "extracted_by_ocr": extracted_by_ocr, + } + ) + + def extract_doc_content(request) -> Union[JsonResponse, HttpResponse]: """Extract txt from different document types. 
diff --git a/requirements.txt b/requirements.txt index 953bd68..ca94094 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ python-magic idna==2.10 img2pdf lxml>=4.5.2 +lxml_html_clean numpy>=1.19.1 opencv-python>=4.2.0.32 pandas>=1.1.1