Skip to content

Commit

Permalink
perf: optimize pdfminer image cleanup process for improved performance (#3630)
Browse files Browse the repository at this point in the history

This PR enhances the `pdfminer` image cleanup process by repositioning the
duplicate image removal step. It optimizes the removal of duplicated
pdfminer images by performing the cleanup before merging elements,
rather than after. This improvement reduces execution time and enhances
the overall processing speed of PDF documents.

---------

Co-authored-by: Yao You <[email protected]>
  • Loading branch information
christinestraub and badGarnet authored Sep 19, 2024
1 parent cd074bb commit be88eef
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 32 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### Enhancements

* **Improve `pdfminer` image cleanup process**. Optimized the removal of duplicated pdfminer images by performing the cleanup before merging elements, rather than after. This improvement reduces execution time and enhances overall processing speed of PDF documents.

### Features

### Fixes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
remove_duplicate_embedded_text,
remove_duplicate_elements,
)
from unstructured.partition.utils.constants import Source

Expand Down Expand Up @@ -212,14 +212,14 @@ def test_boxes_self_iou(coords, threshold, expected):
np.testing.assert_array_equal(boxes_self_iou(bboxes, threshold), expected)


def test_remove_duplicate_embedded_text():
def test_remove_duplicate_elements():
sample_elements = [
EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
]

result = remove_duplicate_embedded_text(sample_elements)
result = remove_duplicate_elements(sample_elements)

# Check that duplicates were removed and only 2 unique elements remain
assert len(result) == 2
Expand Down
51 changes: 23 additions & 28 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,37 +56,42 @@ def process_data_with_pdfminer(
for page, page_layout in open_pdfminer_pages_generator(file):
height = page_layout.height

layout: list["TextRegion"] = []
text_layout = []
image_layout = []
for obj in page_layout:
if hasattr(obj, "get_text"):
inner_text_objects = extract_text_objects(obj)
for inner_obj in inner_text_objects:
_text = inner_obj.get_text()
new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(inner_obj.bbox, height)
text_region = _create_text_region(
new_x1,
new_y1,
new_x2,
new_y2,
*rect_to_bbox(inner_obj.bbox, height),
coef,
_text,
Source.PDFMINER,
EmbeddedTextRegion,
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
text_layout.append(text_region)
else:
inner_image_objects = extract_image_objects(obj)
for img_obj in inner_image_objects:
new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height)
text_region = _create_text_region(
new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion
*rect_to_bbox(img_obj.bbox, height),
coef,
None,
Source.PDFMINER,
ImageTextRegion,
)
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)

layout = remove_duplicate_embedded_text(layout)
image_layout.append(text_region)

clean_text_layout = remove_duplicate_elements(
text_layout, env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD
)
clean_image_layout = remove_duplicate_elements(
image_layout, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD
)
layout = [*clean_text_layout, *clean_image_layout]
# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
layout = sort_text_regions(layout, SORT_MODE_BASIC)
Expand Down Expand Up @@ -301,31 +306,21 @@ def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "Docu


@requires_dependencies("unstructured_inference")
def remove_duplicate_embedded_text(elements: list["TextRegion"]) -> list["TextRegion"]:
def remove_duplicate_elements(
elements: list["TextRegion"],
threshold: float = 0.5,
) -> list["TextRegion"]:
"""Removes duplicate text elements extracted by PDFMiner from a document layout."""
from unstructured_inference.inference.elements import EmbeddedTextRegion

bboxes = []
texts = []
bbox_to_iou_mapping = {}
current_idx = 0
for i, element in enumerate(elements):
if not isinstance(element, EmbeddedTextRegion):
continue
bboxes.append(element.bbox)
texts.append(element.text)
bbox_to_iou_mapping[i] = current_idx
current_idx += 1

iou = boxes_self_iou(bboxes, env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD)
iou = boxes_self_iou(bboxes, threshold)

filtered_elements = []
for i, element in enumerate(elements):
if not isinstance(element, EmbeddedTextRegion):
filtered_elements.append(element)
continue
this_idx = bbox_to_iou_mapping[i]
if iou[this_idx, this_idx + 1 :].any():
if iou[i, i + 1 :].any():
continue
filtered_elements.append(element)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
@property
def EMBEDDED_TEXT_SAME_REGION_THRESHOLD(self) -> float:
"""threshold to consider the bounding boxes of two embedded images as the same region"""
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.9)
return self._get_float("EMBEDDED_TEXT_SAME_REGION_THRESHOLD", 0.9)

@property
def PDF_ANNOTATION_THRESHOLD(self) -> float:
Expand Down

0 comments on commit be88eef

Please sign in to comment.