Improve pdfminer embedded image extraction in pdf partitioning (#3456)

### Summary

This PR addresses an issue in the `pdfminer`-based embedded image extraction process. Previously, some extracted "images" were incorrect: they included embedded text elements, which resulted in oversized bounding boxes. This update refines the extraction process to focus on actual images, producing smaller, more accurate bounding boxes.

### Testing

PDF: [test_pdfminer_text_extraction.pdf](https://github.com/user-attachments/files/16448213/test_pdfminer_text_extraction.pdf)

```
elements = partition_pdf(
    filename="test_pdfminer_text_extraction",
    strategy=strategy,
    languages=["chi_sim"],
    analysis=True,
)
```

**Results**

- this `PR`

  ![page1_layout_pdfminer](https://github.com/user-attachments/assets/098e0a1f-fdad-4627-a881-cbafd71ce5a0)
  ![page1_layout_final](https://github.com/user-attachments/assets/6dc89180-36ac-424a-99de-63810ebf8958)

- `main` branch

  ![page1_layout_pdfminer](https://github.com/user-attachments/assets/8228995a-2ef1-4b76-9758-b8015c224e6d)
  ![page1_layout_final](https://github.com/user-attachments/assets/68d43d7b-7270-4f58-8360-dc76bd0df78f)
Unstructured-IO · Aug 1, 2024 · 242a66b · 242a66b
1 parent 8fd216c
commit 242a66b
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.15.1-dev7
+## 0.15.1-dev8
 
 ### Enhancements
 
+* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning.
+
 ### Features
 
 * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo.

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.1-dev7"  # pragma: no cover
+__version__ = "0.15.1-dev8"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -5,7 +5,7 @@
 from unstructured.documents.elements import ElementType
 from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
-    get_images_from_pdf_element,
+    extract_image_objects,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
@@ -51,32 +51,26 @@ def process_data_with_pdfminer(
     for page, page_layout in open_pdfminer_pages_generator(file):
         height = page_layout.height
 
-        layout: List["TextRegion"] = []
+        layout: list["TextRegion"] = []
         for obj in page_layout:
             x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)
 
             if hasattr(obj, "get_text"):
                 _text = obj.get_text()
-                element_class = EmbeddedTextRegion  # type: ignore
+                text_region = _create_text_region(
+                    x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
+                )
+                if text_region.bbox is not None and text_region.bbox.area > 0:
+                    layout.append(text_region)
             else:
-                embedded_images = get_images_from_pdf_element(obj)
-                if len(embedded_images) > 0:
-                    _text = None
-                    element_class = ImageTextRegion  # type: ignore
-                else:
-                    continue
-
-            text_region = element_class.from_coords(
-                x1 * coef,
-                y1 * coef,
-                x2 * coef,
-                y2 * coef,
-                text=_text,
-                source=Source.PDFMINER,
-            )
-
-            if text_region.bbox is not None and text_region.bbox.area > 0:
-                layout.append(text_region)
+                inner_image_objects = extract_image_objects(obj)
+                for img_obj in inner_image_objects:
+                    new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
+                    text_region = _create_text_region(
+                        new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
+                    )
+                    if text_region.bbox is not None and text_region.bbox.area > 0:
+                        layout.append(text_region)
 
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
@@ -90,6 +84,18 @@ def process_data_with_pdfminer(
     return layouts
 
 
+def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
+    """Creates a text region of the specified class with scaled coordinates."""
+    return region_class.from_coords(
+        x1 * coef,
+        y1 * coef,
+        x2 * coef,
+        y2 * coef,
+        text=text,
+        source=source,
+    )
+
+
 @requires_dependencies("unstructured_inference")
 def merge_inferred_with_extracted_layout(
     inferred_document_layout: "DocumentLayout",

diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -1,8 +1,8 @@
 import tempfile
-from typing import Any, BinaryIO, List, Tuple
+from typing import BinaryIO, List, Tuple
 
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTContainer, LTImage
+from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdfparser import PSSyntaxError
@@ -20,39 +20,17 @@ def init_pdfminer():
     return device, interpreter
 
 
-def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
-    """
-    Recursively extracts LTImage objects from a PDF layout element.
-
-    This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
-    extracts all LTImage objects contained within it.
-
-    Parameters:
-    - layout_object (Any): The PDF layout element to extract images from.
+def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
+    """Recursively extracts image objects from a given parent object in a PDF document."""
+    objects = []
 
-    Returns:
-    - List[LTImage]: A list of LTImage objects extracted from the layout object.
-
-    Note:
-    - This function recursively traverses through the layout_object to find and accumulate all
-     LTImage objects.
-    - If the input layout_object is an LTImage, it will be included in the returned list.
-    - If the input layout_object is an LTContainer, the function will recursively search its
-     children for LTImage objects.
-    - If the input layout_object is neither LTImage nor LTContainer, an empty list will be
-     returned.
-    """
+    if isinstance(parent_object, LTImage):
+        objects.append(parent_object)
+    elif isinstance(parent_object, LTContainer):
+        for child in parent_object:
+            objects.extend(extract_image_objects(child))
 
-    # recursively locate Image objects in layout_object
-    if isinstance(layout_object, LTImage):
-        return [layout_object]
-    if isinstance(layout_object, LTContainer):
-        img_list: List[LTImage] = []
-        for child in layout_object:
-            img_list = img_list + get_images_from_pdf_element(child)
-        return img_list
-    else:
-        return []
+    return objects
 
 
 def rect_to_bbox(
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.15.1-dev7" # pragma: no cover
		__version__ = "0.15.1-dev8" # pragma: no cover