Skip to content

Commit

Permalink
Feat/improve iou speed (#3582)
Browse files Browse the repository at this point in the history
This PR vectorizes the computation of element overlap to speed up
the deduplication process of extracted elements.

## test

This PR adds unit tests for the new vectorized IOU and subregion
computation functions.

In addition, running partition on large files with many elements like
this slide:

[002489.pdf](https://github.com/user-attachments/files/16823176/002489.pdf)

shows a reduction of runtime from around 15min on the main branch to
less than 4min with this branch.

Profiling results show that the new implementation greatly reduces the
time cost of computation and now most of the time is spent on getting
the coordinates from a list of bboxes.

![Screenshot 2024-08-30 at 9 29
27 PM](https://github.com/user-attachments/assets/6c186838-54c7-483b-ac3e-7342c23ff3a6)
  • Loading branch information
badGarnet authored Sep 3, 2024
1 parent 404f780 commit d51fb13
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 31 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.15.10-dev1
## 0.15.10-dev2

### Enhancements

* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of using a simple loop. This improves the speed of deduplicating elements for pages with a lot of elements.

### Features

Expand Down
56 changes: 56 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pytest
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
Expand All @@ -6,6 +7,8 @@

from unstructured.partition.pdf_image.pdfminer_processing import (
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
)
Expand Down Expand Up @@ -153,3 +156,56 @@ def test_aggregate_by_block():

text = aggregate_embedded_text_by_block(target_region, embedded_regions)
assert text == expected


@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        # only the first box overlaps the reference boxes enough to count as a subregion
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        # a reference box smaller than the candidate never contains it
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        # a degenerate box (x1 == x2, y1 == y2) sitting on a shared corner
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """The pairwise subregion mask matches the expected boolean matrix for each case."""
    candidates = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords1]
    containers = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords2]
    actual = bboxes1_is_almost_subregion_of_bboxes2(candidates, containers)
    np.testing.assert_array_equal(actual, expected)


@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        # at 0.5 the first two boxes count as overlapping each other
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        # at 0.9 every box only matches itself
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        # a degenerate box (x1 == x2, y1 == y2) still matches itself
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """The thresholded self-IOU mask matches the expected boolean matrix for each case."""
    rectangles = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords]
    overlap_mask = boxes_self_iou(rectangles, threshold)
    np.testing.assert_array_equal(overlap_mask, expected)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.10-dev1" # pragma: no cover
__version__ = "0.15.10-dev2" # pragma: no cover
139 changes: 110 additions & 29 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast

import numpy as np
from pdfminer.utils import open_filename

from unstructured.documents.elements import ElementType
Expand All @@ -19,6 +20,9 @@
from unstructured_inference.inference.layout import DocumentLayout


# Lower bound applied to the area denominators of the overlap ratios below, so the divisions
# never see a zero (or negative) area.
EPSILON_AREA = 0.01


def process_file_with_pdfminer(
filename: str = "",
dpi: int = 200,
Expand Down Expand Up @@ -96,6 +100,57 @@ def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
)


def get_coords_from_bboxes(bboxes) -> np.ndarray:
    """Convert a collection of bounding boxes into an ``(n, 4)`` coordinate array.

    Args:
        bboxes: iterable of objects exposing ``x1``, ``y1``, ``x2`` and ``y2`` attributes. Any
            iterable works; it does not need to support ``len()``.

    Returns:
        A float64 array with one ``[x1, y1, x2, y2]`` row per box, in input order. An empty
        input yields an array of shape ``(0, 4)``.
    """
    rows = [(bbox.x1, bbox.y1, bbox.x2, bbox.y2) for bbox in bboxes]
    # the reshape pins the column count to 4 even when there are no boxes at all
    return np.array(rows, dtype=np.float64).reshape(-1, 4)


def areas_of_boxes_and_intersection_area(
    coords1: np.ndarray, coords2: np.ndarray, threshold: float = 0.5
):
    """Compute pairwise intersection areas and the own areas of two groups of boxes.

    Extents are measured inclusively (``x2 - x1 + 1``), so a box with ``x1 == x2`` and
    ``y1 == y2`` has area 1 and two boxes that only share a corner intersect with area 1.

    Args:
        coords1: ``(n, 4)`` array-like of ``[x1, y1, x2, y2]`` rows.
        coords2: ``(m, 4)`` array-like of ``[x1, y1, x2, y2]`` rows.
        threshold: not used by the computation; kept so callers that pass it keep working.

    Returns:
        A tuple ``(inter_area, boxa_area, boxb_area)`` where ``inter_area`` has shape ``(n, m)``
        and holds the intersection area of every box in ``coords1`` with every box in
        ``coords2``, ``boxa_area`` has shape ``(n, 1)`` and ``boxb_area`` has shape ``(m, 1)``.
    """
    coords1 = np.asarray(coords1).reshape(-1, 4)
    coords2 = np.asarray(coords2).reshape(-1, 4)

    # keep every coordinate as a column vector so it can be broadcast against a row vector
    x11, y11, x12, y12 = (coords1[:, [col]] for col in range(4))
    x21, y21, x22, y22 = (coords2[:, [col]] for col in range(4))

    # (n, 1) against (1, m) broadcasts to all n * m pairings at once
    left = np.maximum(x11, x21.T)
    top = np.maximum(y11, y21.T)
    right = np.minimum(x12, x22.T)
    bottom = np.minimum(y12, y22.T)

    # disjoint boxes produce a negative extent, which is clamped to an empty intersection
    inter_area = np.maximum(right - left + 1, 0) * np.maximum(bottom - top + 1, 0)
    boxa_area = (x12 - x11 + 1) * (y12 - y11 + 1)
    boxb_area = (x22 - x21 + 1) * (y22 - y21 + 1)

    return inter_area, boxa_area, boxb_area


def bboxes1_is_almost_subregion_of_bboxes2(bboxes1, bboxes2, threshold: float = 0.5) -> np.ndarray:
    """Flag, pairwise, which boxes of ``bboxes1`` are almost contained in boxes of ``bboxes2``.

    Entry ``[i, j]`` of the returned boolean matrix is True when the intersection of
    ``bboxes1[i]`` with ``bboxes2[j]`` covers more than ``threshold`` of ``bboxes1[i]``'s own
    area and ``bboxes1[i]`` is not larger than ``bboxes2[j]``.
    """
    inter_area, boxa_area, boxb_area = areas_of_boxes_and_intersection_area(
        get_coords_from_bboxes(bboxes1), get_coords_from_bboxes(bboxes2)
    )

    # share of each first-group box covered by each second-group box; the floor avoids a
    # division by zero
    covered_fraction = inter_area / np.maximum(boxa_area, EPSILON_AREA)
    fits_inside = boxa_area <= boxb_area.T

    return (covered_fraction > threshold) & fits_inside


def boxes_self_iou(bboxes, threshold: float = 0.5) -> np.ndarray:
    """Compare every box in ``bboxes`` against every other by intersection over union.

    Returns a square boolean matrix whose entry ``[i, j]`` is True when the IOU of boxes ``i``
    and ``j`` exceeds ``threshold``.
    """
    box_coords = get_coords_from_bboxes(bboxes)
    inter_area, boxa_area, boxb_area = areas_of_boxes_and_intersection_area(
        box_coords, box_coords
    )

    # union = area_i + area_j - intersection, floored so the division is always defined
    union_area = boxa_area + boxb_area.T - inter_area
    iou = inter_area / np.maximum(EPSILON_AREA, union_area)

    return iou > threshold


@requires_dependencies("unstructured_inference")
def merge_inferred_with_extracted_layout(
inferred_document_layout: "DocumentLayout",
Expand Down Expand Up @@ -168,45 +223,71 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
"""

for page in document.pages:
tables = [e for e in page.elements if e.type == ElementType.TABLE]
table_boxes = [e.bbox for e in page.elements if e.type == ElementType.TABLE]
element_boxes = []
element_to_subregion_map = {}
subregion_indice = 0
for i, element in enumerate(page.elements):
if element.source != Source.PDFMINER:
continue
subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
element_inside_table = [
element.bbox.is_almost_subregion_of(t.bbox, subregion_threshold) for t in tables
]
if sum(element_inside_table) == 1:
page.elements[i] = None
page.elements = [e for e in page.elements if e]
element_boxes.append(element.bbox)
element_to_subregion_map[i] = subregion_indice
subregion_indice += 1

is_element_subregion_of_tables = (
bboxes1_is_almost_subregion_of_bboxes2(
element_boxes,
table_boxes,
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
).sum(axis=1)
== 1
)

page.elements = [
e
for i, e in enumerate(page.elements)
if (
(i not in element_to_subregion_map)
or not is_element_subregion_of_tables[element_to_subregion_map[i]]
)
]

return document


def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "DocumentLayout":
"""Removes duplicate image elements extracted by PDFMiner from a document layout."""

from unstructured_inference.inference.elements import (
region_bounding_boxes_are_almost_the_same,
)

for page in document.pages:
image_elements = []
image_bboxes = []
texts = []
bbox_to_iou_mapping = {}
current_idx = 0
for i, element in enumerate(page.elements):
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
continue
image_bboxes.append(element.bbox)
texts.append(element.text)
bbox_to_iou_mapping[i] = current_idx
current_idx += 1

iou = boxes_self_iou(image_bboxes, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD)

# check if this element is a duplicate
filtered_elements = []
for i, element in enumerate(page.elements[:-1]):
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
filtered_elements.append(element)
continue
text = element.text
this_idx = bbox_to_iou_mapping[i]
if any(
e.text == element.text
and region_bounding_boxes_are_almost_the_same(
e.bbox, element.bbox, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD
)
for e in image_elements
text == texts[potential_match + this_idx + 1]
for potential_match in np.where(iou[this_idx, this_idx + 1 :])[0]
):
page.elements[i] = None
image_elements.append(element)
page.elements = [e for e in page.elements if e]
continue
else:
filtered_elements.append(element)
page.elements[:-1] = filtered_elements

return document

Expand All @@ -218,11 +299,11 @@ def aggregate_embedded_text_by_block(
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""

subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
filtered_blocks = [
obj
for obj in pdf_objects
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
]
text = " ".join([x.text for x in filtered_blocks if x.text])
mask = bboxes1_is_almost_subregion_of_bboxes2(
[obj.bbox for obj in pdf_objects],
[text_region.bbox],
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
).sum(axis=1)

text = " ".join([obj.text for i, obj in enumerate(pdf_objects) if (mask[i] and obj.text)])
return text

0 comments on commit d51fb13

Please sign in to comment.