Skip to content

Commit

Permalink
refactor: pdfminer image cleanup (#3648)
Browse files Browse the repository at this point in the history
This PR removes the `clean_pdfminer_duplicate_image_elements()`
function, as its functionality has already been integrated into the
`remove_duplicate_elements()` function in [PR
#3630](#3630).
  • Loading branch information
christinestraub authored Sep 19, 2024
1 parent be88eef commit 0ed69a1
Show file tree
Hide file tree
Showing 5 changed files with 2 additions and 60 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.13-dev2
## 0.15.13-dev3

### Enhancements

Expand Down
18 changes: 0 additions & 18 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
boxes_self_iou,
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
remove_duplicate_elements,
)
Expand Down Expand Up @@ -129,23 +128,6 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc
]


@pytest.mark.parametrize(
    ("elements", "expected_document_length"),
    [
        (elements_with_duplicate_images, 2),
        (elements_without_duplicate_images, 4),
    ],
)
def test_clean_pdfminer_duplicate_image_elements(elements, expected_document_length):
    """Check how many elements remain on a page after duplicate-image cleanup.

    The parametrized ``elements`` are placed on a single-page document, the
    document is passed through ``clean_pdfminer_duplicate_image_elements``, and
    the number of elements left on that page is compared with
    ``expected_document_length``.
    """
    # A 1x1 one-bit image is used as the page image for this test.
    single_page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    single_page.elements = elements

    result = clean_pdfminer_duplicate_image_elements(DocumentLayout(pages=[single_page]))

    remaining = result.pages[0].elements
    assert len(remaining) == expected_document_length


def test_aggregate_by_block():
expected = "Inside region1 Inside region2"
embedded_regions = [
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.13-dev2" # pragma: no cover
__version__ = "0.15.13-dev3" # pragma: no cover
2 changes: 0 additions & 2 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
save_elements,
)
from unstructured.partition.pdf_image.pdfminer_processing import (
clean_pdfminer_duplicate_image_elements,
clean_pdfminer_inner_elements,
merge_inferred_with_extracted_layout,
)
Expand Down Expand Up @@ -712,7 +711,6 @@ def _partition_pdf_or_image_local(
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
kwargs["sort_mode"] = SORT_MODE_DONT

final_document_layout = clean_pdfminer_duplicate_image_elements(final_document_layout)
final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

for page in final_document_layout.pages:
Expand Down
38 changes: 0 additions & 38 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import numpy as np
from pdfminer.utils import open_filename

from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
from unstructured.partition.pdf_image.pdfminer_utils import (
extract_image_objects,
Expand Down Expand Up @@ -268,43 +267,6 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
return document


def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "DocumentLayout":
    """Drop repeated PDFMiner image elements from every page of ``document``.

    An element is a candidate only when its ``source`` is ``Source.PDFMINER``
    and its ``type`` is ``ElementType.IMAGE``. A candidate is discarded when a
    *later* candidate on the same page is flagged against it by
    ``boxes_self_iou`` (called with
    ``env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD``) and carries the same
    ``text``. The final element of each page is never examined and is always
    kept.

    Args:
        document: Layout whose ``pages`` each expose an ``elements`` list.

    Returns:
        The same ``document`` object; each page's ``elements`` list is
        modified in place.
    """

    def _not_pdfminer_image(candidate) -> bool:
        # Same test the cleanup applies in both passes over a page.
        return candidate.source != Source.PDFMINER or candidate.type != ElementType.IMAGE

    for page in document.pages:
        # Pair each PDFMiner image with its position in ``page.elements``.
        pdfminer_images = [
            (position, candidate)
            for position, candidate in enumerate(page.elements)
            if not _not_pdfminer_image(candidate)
        ]
        image_bboxes = [candidate.bbox for _, candidate in pdfminer_images]
        image_texts = [candidate.text for _, candidate in pdfminer_images]
        # Page position -> row/column index of that image in the IOU result.
        iou_row_of = {
            position: row for row, (position, _) in enumerate(pdfminer_images)
        }

        overlaps = boxes_self_iou(image_bboxes, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD)

        kept = []
        # The last element has no later element to duplicate, so it is left out
        # of the scan and preserved by the slice assignment below.
        for position, candidate in enumerate(page.elements[:-1]):
            if _not_pdfminer_image(candidate):
                kept.append(candidate)
                continue

            candidate_text = candidate.text
            row = iou_row_of[position]
            # Offsets (relative to row + 1) of later images flagged against this one.
            later_matches = np.where(overlaps[row, row + 1 :])[0]
            is_duplicate = any(
                candidate_text == image_texts[offset + row + 1] for offset in later_matches
            )
            if not is_duplicate:
                kept.append(candidate)

        page.elements[:-1] = kept

    return document


@requires_dependencies("unstructured_inference")
def remove_duplicate_elements(
elements: list["TextRegion"],
Expand Down

0 comments on commit 0ed69a1

Please sign in to comment.