diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e60ac21..647cabe 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/DEVELOPING.md b/DEVELOPING.md index 2eebdd9..b81ee32 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -19,7 +19,7 @@ If you want to see debug logs, set `DEBUG` to `True` in `settings.py`. Once the above compose file is running, you can use the `mock_web_app` container to run the tests against the `doctor` container: - docker exec -it mock_web_app_doctor python3 -m unittest doctor.tests + docker exec -it mock_web_app python3 -m unittest doctor.tests ## Building Images diff --git a/README.md b/README.md index 7b8538e..c5da289 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,25 @@ Valid requests will receive a JSON response with the following keys: - `extracted_by_ocr`: Whether OCR was needed and used during processing. - `page_count`: The number of pages, if it applies. +### Endpoint: /extract/recap/text/ + +Given a RECAP pdf, extract out the text using PDF Plumber, OCR or a combination of the two + +Parameters: + + - `strip_margin`: Whether doctor should crop the edges of the recap document during processing. With PDF plumber it will ignore traditional 1 inch margin. With an OCR it lowers the threshold for hiding OCR gibberish. To enable it, set strip_margin to `True`: + +```bash +curl 'http://localhost:5050/extract/recap/text/?strip_margin=True' \ + -X 'POST' \ + -F "file=@doctor/recap_extract/gov.uscourts.cacd.652774.40.0.pdf" +``` + +Valid requests will receive a JSON response with the following keys: + + - `content`: The utf-8 encoded text of the file + - `extracted_by_ocr`: Whether OCR was needed and used during processing. 
"""Extract text from RECAP PDFs.

Uses pdfplumber for layout-aware text extraction and falls back to
tesseract OCR when a page looks like it needs it, then post-processes
the output to reduce artifacts common in scanned legal documents.
"""

import re
from statistics import mean

import pdfplumber
from pdfplumber.ctm import CTM
import pytesseract
from pytesseract import Output
import pandas as pd
from PIL import Image


def is_skewed(obj: dict) -> bool:
    """Check whether a pdfplumber word/char dict is skewed.

    CTM stands for current transformation matrix; pdfplumber exposes it so
    the angle of text can be computed.  Skewed text traditionally comes from
    circular stamps, or from the perpendicular margin text seen in Ninth
    Circuit documents — both garble the extracted content.

    :param obj: dictionary from pdfplumber for each word
    :return: True if the text should be kept (i.e. it is not skewed)
    """
    if (matrix := obj.get("matrix")) is None:
        # No transformation matrix available: keep the word.
        return True
    my_char_ctm = CTM(*matrix)
    if my_char_ctm.skew_x != 0:
        return False
    return True


def get_page_text(page: pdfplumber.page.Page, strip_margin: bool) -> str:
    """Extract text from a page with pdfplumber.

    Optionally crops out a traditional one-inch margin (computed from an
    assumed 8.5" page width) and drops skewed words such as stamps.

    :param page: pdfplumber page
    :param strip_margin: crop the page margin and remove skewed content
    :return: text extracted from the page
    """
    if strip_margin:
        # Crop to the region inside a 1" margin on US-letter-width paper.
        # NOTE(review): the bottom bound assumes an 11" page height — on
        # legal-size (14") paper this crops more than one inch; confirm.
        _, _, width, height = page.bbox
        pixels_per_inch = width / 8.5
        bbox = (
            pixels_per_inch * 1,  # 1 inch from left edge
            pixels_per_inch * 1,  # 1 inch down from top
            pixels_per_inch
            * 7.5,  # 7.5 inches from left edge (1 inch from right)
            pixels_per_inch * 10,  # 10 inches from top (1 inch from bottom)
        )
        doc_text = (
            page.crop(bbox)
            .filter(is_skewed)
            .extract_text(
                layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
            )
        )
    else:
        doc_text = page.extract_text(
            layout=True, keep_blank_chars=True, y_tolerance=5, y_density=25
        )
    return doc_text


def has_images(page: pdfplumber.page.Page) -> bool:
    """Does the page have images large enough to contain text?

    :param page: pdfplumber page
    :return: True if the page contains images above the size threshold
    """
    # Tiny images (<= 10px in either dimension) cannot hold legible text.
    return any(
        image["width"] > 10 and image["height"] > 10 for image in page.images
    )


def has_text_annotations(page: pdfplumber.page.Page) -> bool:
    """Does the page have annotations which could contain text?

    :param page: pdfplumber page
    :return: True if the page has FreeText or Widget annotations
    """
    if page.annots:
        anno_types = [
            str(annot.get("data").get("Subtype")) for annot in page.annots
        ]
        if "/'FreeText'" in anno_types or "/'Widget'" in anno_types:
            return True
    return False


def adjust_caption_lines(page_text: str) -> str:
    """Realign the ) or : or § characters used to frame a case caption.

    § is used in Texas courts, : in NY courts, ) in many other courts.
    The first separator that appears at least three times in the central
    band of the page (columns 30–70) is vertically aligned to its
    right-most occurrence.

    :param page_text: the text of the first page
    :return: the page text, realigned if a caption was detected
    """
    for separator in [r")", "§", ":"]:
        pattern = rf"(.* +{re.escape(separator)} .*\n)"
        matches = list(re.finditer(pattern, page_text))
        central_matches = [
            match.group().rindex(separator)
            for match in matches
            if 30 <= match.group().rindex(separator) <= 70
        ]
        if len(central_matches) < 3:
            continue  # Skip this separator if fewer than 3 matches found
        # Pad every occurrence out to the right-most separator column.
        longest = max(central_matches)
        page = []
        for row in page_text.splitlines():
            index = row.find(f" {separator}")
            addition = (longest - index) * " "
            row = row.replace(f" {separator}", f"{addition}{separator}")
            page.append(row)
        return "\n".join(page)
    # FIX: previously fell through with an implicit `return None` when no
    # separator qualified, which made cleanup_content() emit "None\n".
    return page_text


def page_needs_ocr(page: pdfplumber.page.Page, page_text: str) -> bool:
    """Decide whether the page needs OCR.

    Triggers on empty extractions, unmapped CID glyphs, text annotations,
    embedded images, or an unusually high curve count (scanned artwork).

    :param page: pdfplumber page
    :param page_text: content already extracted from the page
    :return: True if the page should be OCRed
    """
    return (
        page_text.strip() == ""
        or "(cid:" in page_text
        or has_text_annotations(page)
        or has_images(page)
        or len(page.curves) > 10
    )


def convert_pdf_page_to_image(
    page: pdfplumber.page.Page, strip_margin: bool
) -> Image.Image:
    """Render the page to an image, cropping the margin if requested.

    A looser half-inch margin is used here (vs. 1" in get_page_text)
    because OCR gibberish-filtering in get_word() handles the rest.

    :param page: the pdf page
    :param strip_margin: whether to crop the margin
    :return: the (possibly cropped) page image
    """
    img = page.to_image(resolution=300)
    _, _, w, h = page.bbox
    width = w * img.scale

    if strip_margin:
        pixels_per_inch = width / 8.5
        bbox = (
            pixels_per_inch * 0.5,  # .5" from left edge
            pixels_per_inch * 0.5,  # .5" down from top
            pixels_per_inch * 8,  # 8" from left edge (.5" from right)
            pixels_per_inch * 10.5,  # 10.5" from top (.5" from bottom)
        )
        image = img.original.crop(bbox)
    else:
        image = img.original
    return image


def ocr_image_to_data(image: Image.Image) -> list[pd.DataFrame]:
    """Perform OCR on a page image and return per-block word data.

    :param image: PIL image of the pdf page
    :return: a list of DataFrames, one per tesseract text block, ordered
        top to bottom, each holding the OCR word data for that block
    """
    # Tesseract configuration:
    # - `-c preserve_interword_spaces=1`: keep spaces between words as they
    #   appear in the image.
    # - `-c tessedit_do_invert=0`: do not invert the image colors.
    # - `--psm 6`: page segmentation mode 6 — assume a single uniform block
    #   of text.
    # - `-l eng`: use the English language model.
    #
    # Reference:
    # https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc
    data_dict = pytesseract.image_to_data(
        # FIX: was `preserve_interword_spaces=1x1`, which is not a valid
        # value for this boolean variable and contradicted the comment.
        image,
        config="-c preserve_interword_spaces=1 -c tessedit_do_invert=0 --psm 6 -l eng",
        output_type=Output.DICT,
    )
    df = pd.DataFrame(data_dict)
    # conf == -1 marks structural rows (page/block/line), not words.
    filtered_data = df[(df.conf != -1)]
    block_ids = (
        filtered_data.groupby("block_num")
        .first()
        .sort_values("top")
        .index.tolist()
    )
    return [
        filtered_data[filtered_data["block_num"] == block]
        for block in block_ids
    ]


def extract_with_ocr(page: pdfplumber.page.Page, strip_margin: bool) -> str:
    """Extract the page content using OCR.

    :param page: pdfplumber page
    :param strip_margin: whether to trim the margins
    :return: the extracted content for the page
    """
    image = convert_pdf_page_to_image(page, strip_margin)
    data = ocr_image_to_data(image)
    content = ""
    prev = {}
    for words in data:
        for index, word in words.iterrows():
            # Reconstruct layout: whitespace first, then the word itself.
            content = insert_whitespace(content, word, prev)
            content += get_word(word, image.size[0], strip_margin)
            prev = word
    content = cleanup_content(content, page.page_number)
    return content


def insert_whitespace(content: str, word: dict, prev: dict) -> str:
    """Append the whitespace implied by the gap between two OCR words.

    :param content: the text extracted so far
    :param word: the current OCR word record
    :param prev: the previously extracted word record (empty dict at start)
    :return: the content with whitespace appended
    """
    is_new_line = prev.get("line_num", 0) != word["line_num"]
    is_new_par = prev.get("par_num", 0) != word["par_num"]
    prev_end = prev.get("left", 1) + prev.get("width", 1)

    # Vertical whitespace: blank line for large gaps, newline otherwise.
    if is_new_line or is_new_par:
        vertical_gap = word["top"] - (
            prev.get("top", 0) + prev.get("height", 0)
        )
        content += "\n\n" if vertical_gap > 100 else "\n"
        prev_end = 0

    # Horizontal whitespace: ~25px per character at 300 dpi.
    content += " " * int(((word["left"] - prev_end) / 25))
    return content


def get_word(word_dict: dict, width: float, strip_margin: bool) -> str:
    """Filter one OCR word and return it with a trailing space.

    Uses tesseract's confidence value plus the word's position to decide
    whether it is real text, a margin artifact (returned as spaces), or
    unreadable-but-real content (returned as □ placeholders).

    :param word_dict: the word record from tesseract
    :param width: the pixel width of the page image
    :param strip_margin: whether margin stripping is enabled
    :return: the (possibly masked) word followed by a space
    """
    pixels_per_inch = width / 8.5
    if strip_margin:
        left_margin = 1 * pixels_per_inch
        right_margin = 7.5 * pixels_per_inch
    else:
        left_margin = 0.5 * pixels_per_inch
        right_margin = 8.0 * pixels_per_inch

    word = word_dict["text"]
    conf = word_dict["conf"]

    no_confidence = 0
    very_low_confidence = 5
    low_confidence = 40
    short_word_len = 3
    long_word_len = 20

    if (
        word_dict["left"] + word_dict["width"] < left_margin
        and conf < low_confidence
    ):
        # Low-confidence word that ends entirely inside the left margin:
        # almost certainly an artifact, so blank it out.
        word = " " * len(word)
    elif (conf == no_confidence and len(word) <= short_word_len) or word_dict[
        "left"
    ] == 0:
        # Zero confidence on a short word, or a word starting at the very
        # left edge of the paper: likely an artifact, blank it out.
        word = " " * len(word)
    elif conf < very_low_confidence and (
        len(word) <= short_word_len or len(word) > long_word_len
    ):
        # Very low confidence on a very short or very long word: probably
        # real content we cannot read (stamps, caption lines), so emit
        # placeholder boxes of the same length.
        word = "□" * len(word)
    elif conf < low_confidence and word_dict["left"] > right_margin:
        # Low-confidence word starting in the right margin: likely bad OCR,
        # emit placeholder boxes.
        word = "□" * len(word)

    return f"{word} "


def cleanup_content(content: str, page_number: int) -> str:
    """Reduce legal-document line clutter in extracted text.

    1. Removes floating pipes ('|') left at line ends.
    2. Removes single-character artifacts trailing >=10 spaces at line ends.
    3. Shifts the whole page left by the common leading whitespace.
    4. On page 1, realigns caption separator columns.

    :param content: the extracted page content
    :param page_number: the page number (caption alignment only on page 1)
    :return: the cleaned-up text, terminated with a newline
    """
    # Remove floating pipes at the right edge.
    content = re.sub(r"\s{4,}\| $", "", content, flags=re.MULTILINE)
    # Remove single-character artifacts at the right edge.
    content = re.sub(r"\s{10,}[a-zA-Z0-9|] $", "", content, flags=re.MULTILINE)
    # Shift text left where possible and trim leading/trailing blank lines.
    content = remove_excess_whitespace(content)
    if page_number == 1:
        content = adjust_caption_lines(content)
    return f"{content}\n"


def remove_excess_whitespace(document: str) -> str:
    """Remove excess whitespace introduced by OCR.

    Strips blank lines at the start/end of the document and shifts all
    lines left by their common leading indentation.

    :param document: text of the document
    :return: document with excess whitespace removed
    """
    m = re.findall(r"(^ +)", document, re.MULTILINE)
    if m:
        # min() of equal-character strings picks the shortest indent run.
        shift_left = len(min(m))
        pattern = f"(^ {{{shift_left}}})"
        document = re.sub(pattern, "", document, flags=re.MULTILINE)
    document = re.sub(r"^ +$", "", document, flags=re.MULTILINE)
    return document.strip("\n")
DoctorUnicodeDecodeError, force_bytes, @@ -621,3 +627,26 @@ def get_document_number_from_pdf(path: str) -> str: return "" document_number = [dn for dn in document_number_matches[0] if dn] return clean_document_number(document_number[0]) + + +def extract_recap_pdf( + filepath: str, + strip_margin: bool = False, +) -> tuple[str, bool]: + """Extract from RECAP PDF + + :param filepath: The path to the PDF + :param strip_margin: Whether to remove 1 inch margin from text extraction + :return: A tuple containing the text and a boolean indicating ocr usage + """ + content = "" + extracted_by_ocr = False + with pdfplumber.open(filepath) as pdf: + for page in pdf.pages: + page_text = get_page_text(page, strip_margin=strip_margin) + if page_needs_ocr(page, page_text): + extracted_by_ocr = True + page_text = extract_with_ocr(page, strip_margin=strip_margin) + content += f"\n{page_text}" + content = remove_excess_whitespace(content) + return content, extracted_by_ocr diff --git a/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf new file mode 100644 index 0000000..0741378 Binary files /dev/null and b/doctor/test_assets/recap_extract/gov.uscourts.cacd.652774.40.0.pdf differ diff --git a/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf b/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf new file mode 100644 index 0000000..2b0184a Binary files /dev/null and b/doctor/test_assets/recap_extract/gov.uscourts.cand.203070.27.0.pdf differ diff --git a/doctor/tests.py b/doctor/tests.py index 2ad282c..f285ec0 100644 --- a/doctor/tests.py +++ b/doctor/tests.py @@ -3,6 +3,7 @@ import re import glob import unittest +from unittest.mock import patch from pathlib import Path from tempfile import NamedTemporaryFile from zipfile import ZipFile @@ -10,6 +11,13 @@ import eyed3 import requests +from doctor.lib.text_extraction import ( + insert_whitespace, + get_word, + 
remove_excess_whitespace, + cleanup_content, + adjust_caption_lines, +) from doctor.lib.utils import make_file, make_buffer asset_path = f"{Path.cwd()}/doctor/test_assets" @@ -24,6 +32,64 @@ def test_heartbeat(self): ) +class RECAPExtractionTests(unittest.TestCase): + def test_recap_extraction(self): + """Can we extract from the new recap text endpoint""" + files = make_file( + filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" + ) + params = {"strip_margin": False} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertTrue( + response.json()["extracted_by_ocr"], msg="Not extracted correctly" + ) + self.assertEqual( + "aséakOS- 08-0220 A25BA BAD GDoonene 2627 Filed OL/2B/DE0IP adgeahefi2of 2", + first_line, + msg="Wrong Text", + ) + + def test_recap_extraction_with_strip_margin(self): + """Can we extract from the new recap text endpoint with strip margin?""" + files = make_file( + filename="recap_extract/gov.uscourts.cand.203070.27.0.pdf" + ) + params = {"strip_margin": True} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertEqual( + "1 || DONALD W. CARLSON [Bar No. 
79258]", + first_line, + msg="Wrong Text", + ) + + def test_strip_margin_without_ocr(self): + """Can we extract from the new recap text endpoint with strip margin?""" + files = make_file( + filename="recap_extract/gov.uscourts.cacd.652774.40.0.pdf" + ) + params = {"strip_margin": True} + response = requests.post( + "http://doctor:5050/extract/recap/text/", + files=files, + params=params, + ) + first_line = response.json()["content"].splitlines()[0].strip() + self.assertEqual(200, response.status_code, msg="Wrong status code") + self.assertEqual("1", first_line, msg="Wrong Text") + + class ExtractionTests(unittest.TestCase): def test_pdf_to_text(self): """""" @@ -232,7 +298,6 @@ def test_mime_type(self): files=files, params=params, ).json() - print(response) self.assertEqual( response["mimetype"], "application/pdf", @@ -489,5 +554,267 @@ def test_pdf_400_mime(self): self.assertEqual(response.status_code, 400, msg="Wrong validation") +class TestRecapWhitespaceInsertions(unittest.TestCase): + """Test our whitespace insertion code""" + + def test_insert_whitespace_new_line(self): + content = "foo" + word = { + "line_num": 2, + "par_num": 1, + "left": 50, + "top": 200, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n ") + + def test_insert_whitespace_new_paragraph(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 2, + "left": 50, + "top": 200, + "width": 10, + "height": 20, + } + prev = { + "line_num": 2, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n ") + + def test_insert_whitespace_vertical_gap(self): + content = "foo" + word = { + "line_num": 2, + "par_num": 1, + "left": 50, + "top": 300, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 
1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo\n\n ") + + def test_insert_whitespace_horizontal_gap(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 1, + "left": 200, + "top": 100, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 10, + "top": 100, + "width": 30, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo ") + + def test_insert_whitespace_no_gap(self): + content = "foo" + word = { + "line_num": 1, + "par_num": 1, + "left": 50, + "top": 100, + "width": 10, + "height": 20, + } + prev = { + "line_num": 1, + "par_num": 1, + "left": 40, + "top": 100, + "width": 10, + "height": 20, + } + result = insert_whitespace(content, word, prev) + self.assertEqual(result, "foo") + + +class TestOCRConfidenceTests(unittest.TestCase): + """Test our OCR confidence checking functions.""" + + def test_confidence_zero(self): + word_dict = {"text": "foo", "conf": 0, "left": 10, "width": 30} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + def test_confidence_low_and_in_margin(self): + word_dict = {"text": "foo", "conf": 30, "left": 5, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + def test_confidence_below_threshold_short_word(self): + word_dict = {"text": "foo", "conf": 3, "left": 200, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, "□□□ ") + + def test_confidence_below_threshold_long_word(self): + word_dict = { + "text": "foobarbazfoobarbazfoobar", + "conf": 3, + "left": 200, + "width": 200, + } + result = get_word(word_dict, 612, True) + self.assertEqual(result, "□□□□□□□□□□□□□□□□□□□□□□□□ ") + + def test_confidence_below_threshold_in_right_margin(self): + word_dict = {"text": "foo", "conf": 30, "left": 580, "width": 10} + result = get_word(word_dict, 612, True) + 
self.assertEqual(result, "□□□ ") + + def test_valid_word_high_confidence(self): + word_dict = {"text": "foo", "conf": 90, "left": 50, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, "foo ") + + def test_word_on_left_edge(self): + word_dict = {"text": "foo", "conf": 50, "left": 0, "width": 20} + result = get_word(word_dict, 612, True) + self.assertEqual(result, " ") + + +class TestWhiteSpaceRemoval(unittest.TestCase): + + def test_left_shift(self): + """Can we properly shift our text left?""" + document = """ + foo + bar + foo + bar""" + expected_result = """ foo +bar +foo +bar""" + result = remove_excess_whitespace(document) + self.assertEqual(result, expected_result) + + def test_left_shift_when_artifact_exists(self): + """Shift left once""" + document = """ + foo + bar + | foo + bar""" + expected_result = """ foo + bar +| foo + bar""" + result = remove_excess_whitespace(document) + self.assertEqual(result, expected_result) + + +class TestCleanupContent(unittest.TestCase): + + def setUp(self): + # Patch the functions before each test method + patcher1 = patch( + "doctor.lib.text_extraction.adjust_caption_lines", + side_effect=lambda x: x, + ) + patcher2 = patch( + "doctor.lib.text_extraction.remove_excess_whitespace", + side_effect=lambda x: x, + ) + self.mock_adjust = patcher1.start() + self.mock_remove_whitespace = patcher2.start() + self.addCleanup(patcher1.stop) + self.addCleanup(patcher2.stop) + + def test_remove_floating_pipes(self): + """Can we remove a pipe""" + content = "This is a test line | \nAnother line" + expected_result = "This is a test line\nAnother line\n" + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_remove_floating_artifacts_right_side(self): + """Can we remove an artifact on the far right""" + content = "This is a test line e \nAnother line" + expected_result = "This is a test line\nAnother line\n" + result = cleanup_content(content, 2) + 
self.assertEqual(result, expected_result) + + def test_remove_floating_pipes_and_artifacts(self): + """Test to remove just the period""" + content = "This is a test line | and the content continues\nThis is another test line e \nFinal line" + expected_result = "This is a test line | and the content continues\nThis is another test line\nFinal line\n" + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_no_floating_pipes_or_artifacts(self): + """Test that no floating pipes are an issue""" + content = ( + "This is a test line JW-6\nAnother line\n" + ) + expected_result = ( + "This is a test line JW-6\nAnother line\n\n" + ) + result = cleanup_content(content, 2) + self.assertEqual(result, expected_result) + + def test_adjust_caption(self): + """Test if we can align the caption correctly""" + content = """ 10 + LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS + ) + oe ) PROPOSED} ORDER AWARDING + 12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT + ) ATTORNEY FEES AND EXPENSES + 13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d) + NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28 + 14 || Commissioner of Social Security, ) U.S.C. § 1920 + 15 Defendant ) + 16 ) """ + + expected_result = """ 10 + LESLIE MASSEY, ) Case No.: 2:16-cv-05001 GJS + ) + oe ) PROPOSED} ORDER AWARDING + 12 Plaintiff, ) EQUAL ACCESS TO JUSTICE ACT + ) ATTORNEY FEES AND EXPENSES + 13 VS. ) PURSUANT TO 28 U.S.C. § 2412(d) + NANCY A. BERRYHILL, Acting ) AND COSTS PURSUANT TO 28 + 14 || Commissioner of Social Security, ) U.S.C. 
§ 1920 + 15 Defendant ) + 16 ) """ + content = adjust_caption_lines(content) + self.assertEqual(expected_result, content) + + if __name__ == "__main__": unittest.main() diff --git a/doctor/urls.py b/doctor/urls.py index 54e0788..e151bc6 100644 --- a/doctor/urls.py +++ b/doctor/urls.py @@ -10,6 +10,11 @@ views.extract_doc_content, name="convert-doc-to-text", ), + path( + "extract/recap/text/", + views.extract_recap_document, + name="extract-recap-document", + ), path("convert/image/pdf/", views.image_to_pdf, name="image-to-pdf"), path("convert/images/pdf/", views.images_to_pdf, name="images-to-pdf"), path("convert/pdf/thumbnail/", views.make_png_thumbnail, name="thumbnail"), diff --git a/doctor/views.py b/doctor/views.py index cc2cbfa..16b1de6 100644 --- a/doctor/views.py +++ b/doctor/views.py @@ -48,6 +48,7 @@ rasterize_pdf, set_mp3_meta_data, strip_metadata_from_bytes, + extract_recap_pdf, ) @@ -76,6 +77,35 @@ def image_to_pdf(request) -> HttpResponse: return HttpResponse(cleaned_pdf_bytes) +def extract_recap_document(request) -> JsonResponse: + """Extract Recap Documents + + :param request: The request object + :return: JsonResponse + """ + form = DocumentForm(request.GET, request.FILES) + if not form.is_valid(): + return JsonResponse( + { + "err": "Failed validation", + }, + status=BAD_REQUEST, + ) + filepath = form.cleaned_data["fp"] + strip_margin = form.cleaned_data["strip_margin"] + content, extracted_by_ocr = extract_recap_pdf( + filepath=filepath, + strip_margin=strip_margin, + ) + cleanup_form(form) + return JsonResponse( + { + "content": content, + "extracted_by_ocr": extracted_by_ocr, + } + ) + + def extract_doc_content(request) -> Union[JsonResponse, HttpResponse]: """Extract txt from different document types. 
diff --git a/requirements.txt b/requirements.txt index 953bd68..ca94094 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ python-magic idna==2.10 img2pdf lxml>=4.5.2 +lxml_html_clean numpy>=1.19.1 opencv-python>=4.2.0.32 pandas>=1.1.1