Unstructured-IO · christinestraub · Aug 1, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.15.1-dev7
+## 0.15.1-dev8
 
 ### Enhancements
 
+* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning.
+
 ### Features
 
 * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo.

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.1-dev7"  # pragma: no cover
+__version__ = "0.15.1-dev8"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -5,7 +5,7 @@
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
-    get_images_from_pdf_element,
+    extract_image_objects,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -51,32 +51,26 @@ def process_data_with_pdfminer(
     for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height
 
-        layout: List["TextRegion"] = []
+        layout: list["TextRegion"] = []
         for obj in page_layout:
             x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
 
             if hasattr(obj, "get_text"):
                 _text = obj.get_text()
-                element_class = EmbeddedTextRegion  # type: ignore
+                text_region = _create_text_region(
+                    x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
+                )
+                if text_region.bbox is not None and text_region.bbox.area > 0:
+                    layout.append(text_region)
             else:
-                embedded_images = get_images_from_pdf_element(obj)
-                if len(embedded_images) > 0:
-                    _text = None
-                    element_class = ImageTextRegion  # type: ignore
-                else:
-                    continue
-
-            text_region = element_class.from_coords(
-                x1 * coef,
-                y1 * coef,
-                x2 * coef,
-                y2 * coef,
-                text=_text,
-                source=Source.PDFMINER,
-            )
-
-            if text_region.bbox is not None and text_region.bbox.area > 0:
-                layout.append(text_region)
+                inner_image_objects = extract_image_objects(obj)
+                for img_obj in inner_image_objects:
+                    new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
+                    text_region = _create_text_region(
+                        new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
+                    )
+                    if text_region.bbox is not None and text_region.bbox.area > 0:
+                        layout.append(text_region)
 
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
@@ -90,6 +84,18 @@ def process_data_with_pdfminer(
     return layouts
 
 
+def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
+    """Creates a text region of the specified class with scaled coordinates."""
+    return region_class.from_coords(
+        x1 * coef,
+        y1 * coef,
+        x2 * coef,
+        y2 * coef,
+        text=text,
+        source=source,
+    )
+
+
 @requires_dependencies("unstructured_inference")
 def merge_inferred_with_extracted_layout(
     inferred_document_layout: "DocumentLayout",

diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -1,8 +1,8 @@
 import tempfile
-from typing import Any, BinaryIO, List, Tuple
+from typing import BinaryIO, List, Tuple
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTContainer, LTImage
+from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PSSyntaxError
@@ -20,39 +20,17 @@ def init_pdfminer():
     return device, interpreter
 
 
-def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
-    """
-    Recursively extracts LTImage objects from a PDF layout element.
-
-    This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
-    extracts all LTImage objects contained within it.
-
-    Parameters:
-    - layout_object (Any): The PDF layout element to extract images from.
+def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
+    """Recursively extracts image objects from a given parent object in a PDF document."""
+    objects = []
 
-    Returns:
-    - List[LTImage]: A list of LTImage objects extracted from the layout object.
-
-    Note:
-    - This function recursively traverses through the layout_object to find and accumulate all
-     LTImage objects.
-    - If the input layout_object is an LTImage, it will be included in the returned list.
-    - If the input layout_object is an LTContainer, the function will recursively search its
-     children for LTImage objects.
-    - If the input layout_object is neither LTImage nor LTContainer, an empty list will be
-     returned.
-    """
+    if isinstance(parent_object, LTImage):
+        objects.append(parent_object)
+    elif isinstance(parent_object, LTContainer):
+        for child in parent_object:
+            objects.extend(extract_image_objects(child))
 
-    # recursively locate Image objects in layout_object
-    if isinstance(layout_object, LTImage):
-        return [layout_object]
-    if isinstance(layout_object, LTContainer):
-        img_list: List[LTImage] = []
-        for child in layout_object:
-            img_list = img_list + get_images_from_pdf_element(child)
-        return img_list
-    else:
-        return []
+    return objects
 
 
 def rect_to_bbox(
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.15.1-dev7"  # pragma: no cover
		+__version__ = "0.15.1-dev8"  # pragma: no cover