Skip to content

Commit

Permalink
Improve pdfminer embedded image extraction in pdf partitioning (#3456)
Browse files Browse the repository at this point in the history
### Summary
This PR addresses an issue in how embedded images are extracted with the
`pdfminer` library during PDF partitioning. Previously, some extracted
"images" were incorrect: they included embedded text elements, which
resulted in oversized bounding boxes. This update refines the extraction
process to focus on actual images with more accurate, smaller bounding boxes.

### Testing
PDF:
[test_pdfminer_text_extraction.pdf](https://github.com/user-attachments/files/16448213/test_pdfminer_text_extraction.pdf)

```
elements = partition_pdf(
    filename="test_pdfminer_text_extraction.pdf",
    strategy=strategy,
    languages=["chi_sim"],
    analysis=True,
)
```
**Results**
- this `PR`

![page1_layout_pdfminer](https://github.com/user-attachments/assets/098e0a1f-fdad-4627-a881-cbafd71ce5a0)

![page1_layout_final](https://github.com/user-attachments/assets/6dc89180-36ac-424a-99de-63810ebf8958)
- `main` branch

![page1_layout_pdfminer](https://github.com/user-attachments/assets/8228995a-2ef1-4b76-9758-b8015c224e6d)

![page1_layout_final](https://github.com/user-attachments/assets/68d43d7b-7270-4f58-8360-dc76bd0df78f)
  • Loading branch information
christinestraub authored Aug 1, 2024
1 parent 8fd216c commit 242a66b
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 56 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.15.1-dev7
## 0.15.1-dev8

### Enhancements

* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning.

### Features

* **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo.
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.1-dev7" # pragma: no cover
__version__ = "0.15.1-dev8" # pragma: no cover
48 changes: 27 additions & 21 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
from unstructured.partition.pdf_image.pdfminer_utils import (
get_images_from_pdf_element,
extract_image_objects,
open_pdfminer_pages_generator,
rect_to_bbox,
)
Expand Down Expand Up @@ -51,32 +51,26 @@ def process_data_with_pdfminer(
for page, page_layout in open_pdfminer_pages_generator(file):
height = page_layout.height

layout: List["TextRegion"] = []
layout: list["TextRegion"] = []
for obj in page_layout:
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)

if hasattr(obj, "get_text"):
_text = obj.get_text()
element_class = EmbeddedTextRegion # type: ignore
text_region = _create_text_region(
x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
else:
embedded_images = get_images_from_pdf_element(obj)
if len(embedded_images) > 0:
_text = None
element_class = ImageTextRegion # type: ignore
else:
continue

text_region = element_class.from_coords(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=_text,
source=Source.PDFMINER,
)

if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
inner_image_objects = extract_image_objects(obj)
for img_obj in inner_image_objects:
new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
text_region = _create_text_region(
new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)

# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
Expand All @@ -90,6 +84,18 @@ def process_data_with_pdfminer(
return layouts


def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
    """Create a region of ``region_class`` from coordinates scaled by ``coef``.

    Args:
        x1: Left coordinate of the bounding box, before scaling.
        y1: Top coordinate of the bounding box, before scaling.
        x2: Right coordinate of the bounding box, before scaling.
        y2: Bottom coordinate of the bounding box, before scaling.
        coef: Factor every coordinate is multiplied by before the region is built.
        text: Value forwarded as the ``text`` keyword; may be ``None``.
        source: Value forwarded as the ``source`` keyword.
        region_class: Class exposing a ``from_coords(x1, y1, x2, y2, text=...,
            source=...)`` constructor.

    Returns:
        Whatever ``region_class.from_coords`` returns for the scaled coordinates.
    """
    # Scaling is applied here so that every caller builds regions the same way.
    return region_class.from_coords(
        x1 * coef,
        y1 * coef,
        x2 * coef,
        y2 * coef,
        text=text,
        source=source,
    )


@requires_dependencies("unstructured_inference")
def merge_inferred_with_extracted_layout(
inferred_document_layout: "DocumentLayout",
Expand Down
44 changes: 11 additions & 33 deletions unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import tempfile
from typing import Any, BinaryIO, List, Tuple
from typing import BinaryIO, List, Tuple

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
Expand All @@ -20,39 +20,17 @@ def init_pdfminer():
return device, interpreter


def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
"""
Recursively extracts LTImage objects from a PDF layout element.
This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
extracts all LTImage objects contained within it.
Parameters:
- layout_object (Any): The PDF layout element to extract images from.
def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
"""Recursively extracts image objects from a given parent object in a PDF document."""
objects = []

Returns:
- List[LTImage]: A list of LTImage objects extracted from the layout object.
Note:
- This function recursively traverses through the layout_object to find and accumulate all
LTImage objects.
- If the input layout_object is an LTImage, it will be included in the returned list.
- If the input layout_object is an LTContainer, the function will recursively search its
children for LTImage objects.
- If the input layout_object is neither LTImage nor LTContainer, an empty list will be
returned.
"""
if isinstance(parent_object, LTImage):
objects.append(parent_object)
elif isinstance(parent_object, LTContainer):
for child in parent_object:
objects.extend(extract_image_objects(child))

# recursively locate Image objects in layout_object
if isinstance(layout_object, LTImage):
return [layout_object]
if isinstance(layout_object, LTContainer):
img_list: List[LTImage] = []
for child in layout_object:
img_list = img_list + get_images_from_pdf_element(child)
return img_list
else:
return []
return objects


def rect_to_bbox(
Expand Down

0 comments on commit 242a66b

Please sign in to comment.