Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve pdfminer embedded image extraction in pdf partitioning #3456

Merged
merged 5 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.15.1-dev7
## 0.15.1-dev8

### Enhancements

* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning.

### Features

* **Mark ingest as deprecated.** Begin sunsetting the ingest code in this repo, as it has been moved to a dedicated repo.
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.1-dev7" # pragma: no cover
__version__ = "0.15.1-dev8" # pragma: no cover
48 changes: 27 additions & 21 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
from unstructured.partition.pdf_image.pdfminer_utils import (
get_images_from_pdf_element,
extract_image_objects,
open_pdfminer_pages_generator,
rect_to_bbox,
)
Expand Down Expand Up @@ -51,32 +51,26 @@ def process_data_with_pdfminer(
for page, page_layout in open_pdfminer_pages_generator(file):
height = page_layout.height

layout: List["TextRegion"] = []
layout: list["TextRegion"] = []
for obj in page_layout:
x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height)

if hasattr(obj, "get_text"):
_text = obj.get_text()
element_class = EmbeddedTextRegion # type: ignore
text_region = _create_text_region(
x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
else:
embedded_images = get_images_from_pdf_element(obj)
if len(embedded_images) > 0:
_text = None
element_class = ImageTextRegion # type: ignore
else:
continue

text_region = element_class.from_coords(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=_text,
source=Source.PDFMINER,
)

if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
inner_image_objects = extract_image_objects(obj)
for img_obj in inner_image_objects:
new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
text_region = _create_text_region(
new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)

# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
Expand All @@ -90,6 +84,18 @@ def process_data_with_pdfminer(
return layouts


def _create_text_region(
    x1: float,
    y1: float,
    x2: float,
    y2: float,
    coef: float,
    text: "str | None",
    source: "Source",
    region_class: "type[TextRegion]",
) -> "TextRegion":
    """Create a region of ``region_class`` from coordinates scaled by ``coef``.

    Args:
        x1: Left/first x coordinate of the bounding box, before scaling.
        y1: First y coordinate of the bounding box, before scaling.
        x2: Second x coordinate of the bounding box, before scaling.
        y2: Second y coordinate of the bounding box, before scaling.
        coef: Factor each of the four coordinates is multiplied by before
            being handed to ``region_class.from_coords``.
        text: Forwarded unchanged as the ``text`` keyword; callers pass
            ``None`` when building image regions.
        source: Forwarded unchanged as the ``source`` keyword.
        region_class: Class exposing a ``from_coords`` constructor that accepts
            four positional coordinates plus ``text`` and ``source`` keywords.

    Returns:
        Whatever ``region_class.from_coords`` returns for the scaled box.
    """
    return region_class.from_coords(
        x1 * coef,
        y1 * coef,
        x2 * coef,
        y2 * coef,
        text=text,
        source=source,
    )


@requires_dependencies("unstructured_inference")
def merge_inferred_with_extracted_layout(
inferred_document_layout: "DocumentLayout",
Expand Down
44 changes: 11 additions & 33 deletions unstructured/partition/pdf_image/pdfminer_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import tempfile
from typing import Any, BinaryIO, List, Tuple
from typing import BinaryIO, List, Tuple

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTImage
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PSSyntaxError
Expand All @@ -20,39 +20,17 @@ def init_pdfminer():
return device, interpreter


def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]:
"""
Recursively extracts LTImage objects from a PDF layout element.

This function takes a PDF layout element (could be LTImage or LTContainer) and recursively
extracts all LTImage objects contained within it.

Parameters:
- layout_object (Any): The PDF layout element to extract images from.
def extract_image_objects(parent_object: LTItem) -> List[LTImage]:
"""Recursively extracts image objects from a given parent object in a PDF document."""
objects = []

Returns:
- List[LTImage]: A list of LTImage objects extracted from the layout object.

Note:
- This function recursively traverses through the layout_object to find and accumulate all
LTImage objects.
- If the input layout_object is an LTImage, it will be included in the returned list.
- If the input layout_object is an LTContainer, the function will recursively search its
children for LTImage objects.
- If the input layout_object is neither LTImage nor LTContainer, an empty list will be
returned.
"""
if isinstance(parent_object, LTImage):
objects.append(parent_object)
elif isinstance(parent_object, LTContainer):
for child in parent_object:
objects.extend(extract_image_objects(child))

# recursively locate Image objects in layout_object
if isinstance(layout_object, LTImage):
return [layout_object]
if isinstance(layout_object, LTContainer):
img_list: List[LTImage] = []
for child in layout_object:
img_list = img_list + get_images_from_pdf_element(child)
return img_list
else:
return []
return objects


def rect_to_bbox(
Expand Down
Loading