From 4f298c7a0931883e0dd9f77ea3a03dd7fd43be3d Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 31 Jul 2024 09:12:43 -0700
Subject: [PATCH 1/5] feat: add functionality to extract all inner objects

---
 .../pdf_image/pdfminer_processing.py          | 42 +++++++++----------
 .../partition/pdf_image/pdfminer_utils.py     | 27 +++++++++++-
 2 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 83e4bab67f..6758ef5918 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -5,7 +5,7 @@
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
-    get_images_from_pdf_element,
+    extract_text_and_image_objects,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -53,30 +53,28 @@ def process_data_with_pdfminer(
 
         layout: List["TextRegion"] = []
         for obj in page_layout:
-            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
+            inner_objects = extract_text_and_image_objects(obj)
+            for inner_obj in inner_objects:
+                x1, y1, x2, y2 = rect_to_bbox(inner_obj.bbox, height)
 
-            if hasattr(obj, "get_text"):
-                _text = obj.get_text()
-                element_class = EmbeddedTextRegion  # type: ignore
-            else:
-                embedded_images = get_images_from_pdf_element(obj)
-                if len(embedded_images) > 0:
-                    _text = None
-                    element_class = ImageTextRegion  # type: ignore
+                if hasattr(inner_obj, "get_text"):
+                    _text = inner_obj.get_text()
+                    element_class = EmbeddedTextRegion  # type: ignore
                 else:
-                    continue
-
-            text_region = element_class.from_coords(
-                x1 * coef,
-                y1 * coef,
-                x2 * coef,
-                y2 * coef,
-                text=_text,
-                source=Source.PDFMINER,
-            )
+                    _text = None
+                    element_class = ImageTextRegion
+
+                text_region = element_class.from_coords(
+                    x1 * coef,
+                    y1 * coef,
+                    x2 * coef,
+                    y2 * coef,
+                    text=_text,
+                    source=Source.PDFMINER,
+                )
 
-            if text_region.bbox is not None and text_region.bbox.area > 0:
-                layout.append(text_region)
+                if text_region.bbox is not None and text_region.bbox.area > 0:
+                    layout.append(text_region)
 
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index c35a4dedd3..f1ad39a447 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -2,7 +2,7 @@
 from typing import Any, BinaryIO, List, Tuple
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTContainer, LTImage
+from pdfminer.layout import LAParams, LTComponent, LTContainer, LTImage
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PSSyntaxError
@@ -20,6 +20,31 @@ def init_pdfminer():
     return device, interpreter
 
 
+def extract_text_and_image_objects(parent_object) -> List[LTComponent]:
+    """
+    Recursively extract text and image objects from a given parent object in a PDF document.
+
+    This function navigates through the PDF's layout tree and collects objects that contain text
+    (objects with a 'get_text' method) or are images (instances of LTImage).
+
+    Args:
+        parent_object: The root object from which to start the extraction. This could be an
+                       instance of LTContainer, LTText, LTImage, or any other pdfminer layout object.
+
+    Returns:
+        A list of LTComponent objects which are either text-containing objects or images.
+    """
+    objects = []
+
+    if hasattr(parent_object, "get_text") or isinstance(parent_object, LTImage):
+        objects.append(parent_object)
+    elif isinstance(parent_object, LTContainer):
+        for child in parent_object:
+            objects.extend(extract_text_and_image_objects(child))
+
+    return objects
+
+
 def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
     """
     Recursively extracts LTImage objects from a PDF layout element.

From fa53e53a0d82294365222e2a5a676ac795a49d3d Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 31 Jul 2024 09:13:42 -0700
Subject: [PATCH 2/5] refactor: remove unused `get_images_from_pdf_element`

---
 .../partition/pdf_image/pdfminer_utils.py     | 35 -------------------
 1 file changed, 35 deletions(-)

diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index f1ad39a447..71eaea3a56 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -45,41 +45,6 @@ def extract_text_and_image_objects(parent_object) -> List[LTComponent]:
     return objects
 
 
-def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
-    """
-    Recursively extracts LTImage objects from a PDF layout element.
-
-    This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
-    extracts all LTImage objects contained within it.
-
-    Parameters:
-    - layout_object (Any): The PDF layout element to extract images from.
-
-    Returns:
-    - List[LTImage]: A list of LTImage objects extracted from the layout object.
-
-    Note:
-    - This function recursively traverses through the layout_object to find and accumulate all
-     LTImage objects.
-    - If the input layout_object is an LTImage, it will be included in the returned list.
-    - If the input layout_object is an LTContainer, the function will recursively search its
-     children for LTImage objects.
-    - If the input layout_object is neither LTImage nor LTContainer, an empty list will be
-     returned.
-    """
-
-    # recursively locate Image objects in layout_object
-    if isinstance(layout_object, LTImage):
-        return [layout_object]
-    if isinstance(layout_object, LTContainer):
-        img_list: List[LTImage] = []
-        for child in layout_object:
-            img_list = img_list + get_images_from_pdf_element(child)
-        return img_list
-    else:
-        return []
-
-
 def rect_to_bbox(
     rect: Tuple[float, float, float, float],
     height: float,

From 9cd925464a776c0286a4f62fee8d26ce92470d11 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 31 Jul 2024 16:12:19 -0700
Subject: [PATCH 3/5] feat: update `ImageTextRegion` extraction

---
 .../pdf_image/pdfminer_processing.py          | 50 +++++++++++--------
 .../partition/pdf_image/pdfminer_utils.py     | 24 +++------
 2 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 6758ef5918..4759d4b610 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -5,7 +5,7 @@
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
-    extract_text_and_image_objects,
+    extract_image_objects,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -51,30 +51,26 @@ def process_data_with_pdfminer(
     for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height
 
-        layout: List["TextRegion"] = []
+        layout: list["TextRegion"] = []
         for obj in page_layout:
-            inner_objects = extract_text_and_image_objects(obj)
-            for inner_obj in inner_objects:
-                x1, y1, x2, y2 = rect_to_bbox(inner_obj.bbox, height)
-
-                if hasattr(inner_obj, "get_text"):
-                    _text = inner_obj.get_text()
-                    element_class = EmbeddedTextRegion  # type: ignore
-                else:
-                    _text = None
-                    element_class = ImageTextRegion
-
-                text_region = element_class.from_coords(
-                    x1 * coef,
-                    y1 * coef,
-                    x2 * coef,
-                    y2 * coef,
-                    text=_text,
-                    source=Source.PDFMINER,
-                )
+            x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
 
+            if hasattr(obj, "get_text"):
+                _text = obj.get_text()
+                text_region = _create_text_region(
+                    x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
+                )
                 if text_region.bbox is not None and text_region.bbox.area > 0:
                     layout.append(text_region)
+            else:
+                inner_image_objects = extract_image_objects(obj)
+                for img_obj in inner_image_objects:
+                    new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
+                    text_region = _create_text_region(
+                        new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
+                    )
+                    if text_region.bbox is not None and text_region.bbox.area > 0:
+                        layout.append(text_region)
 
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
@@ -88,6 +84,18 @@ def process_data_with_pdfminer(
     return layouts
 
 
+def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
+    """Creates a text region of the specified class with scaled coordinates."""
+    return region_class.from_coords(
+        x1 * coef,
+        y1 * coef,
+        x2 * coef,
+        y2 * coef,
+        text=text,
+        source=source,
+    )
+
+
 @requires_dependencies("unstructured_inference")
 def merge_inferred_with_extracted_layout(
     inferred_document_layout: "DocumentLayout",
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
index 71eaea3a56..fce84de0bd 100644
--- a/unstructured/partition/pdf_image/pdfminer_utils.py
+++ b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -1,8 +1,8 @@
 import tempfile
-from typing import Any, BinaryIO, List, Tuple
+from typing import BinaryIO, List, Tuple
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTComponent, LTContainer, LTImage
+from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PSSyntaxError
@@ -20,27 +20,15 @@ def init_pdfminer():
     return device, interpreter
 
 
-def extract_text_and_image_objects(parent_object) -> List[LTComponent]:
-    """
-    Recursively extract text and image objects from a given parent object in a PDF document.
-
-    This function navigates through the PDF's layout tree and collects objects that contain text
-    (objects with a 'get_text' method) or are images (instances of LTImage).
-
-    Args:
-        parent_object: The root object from which to start the extraction. This could be an
-                       instance of LTContainer, LTText, LTImage, or any other pdfminer layout object.
-
-    Returns:
-        A list of LTComponent objects which are either text-containing objects or images.
-    """
+def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
+    """Recursively extracts image objects from a given parent object in a PDF document."""
     objects = []
 
-    if hasattr(parent_object, "get_text") or isinstance(parent_object, LTImage):
+    if isinstance(parent_object, LTImage):
         objects.append(parent_object)
     elif isinstance(parent_object, LTContainer):
         for child in parent_object:
-            objects.extend(extract_text_and_image_objects(child))
+            objects.extend(extract_image_objects(child))
 
     return objects
 

From be9141d66de8cf5a45e00ac6c4bee6a740c225fe Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 31 Jul 2024 16:14:48 -0700
Subject: [PATCH 4/5] chore: bump version

---
 CHANGELOG.md                | 2 +-
 unstructured/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0c33bfc91a..512a1bca43 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.1-dev7
+## 0.15.1-dev8
 
 ### Enhancements
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index d05dee38d4..31c20c9568 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.1-dev7"  # pragma: no cover
+__version__ = "0.15.1-dev8"  # pragma: no cover

From 109e1c1ce7f5c234a95144fb39b99a8822c1fff3 Mon Sep 17 00:00:00 2001
From: Christine Straub <christinemstraub@gmail.com>
Date: Wed, 31 Jul 2024 17:46:27 -0700
Subject: [PATCH 5/5] chore: update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 512a1bca43..e2b8877b92 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Enhancements
 
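+* **Improve `pdfminer` embedded image extraction to exclude text elements and produce more accurate bounding boxes.** Each embedded image now gets its own region built from that image's bounding box instead of the bounding box of the enclosing layout object. This results in cleaner, more precise element extraction in `pdf` partitioning.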
+
 ### Features
 
 * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo.