diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cb32ab39f..6d1d67c0c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.13-dev2 +## 0.15.13-dev3 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 1572954188..4873c44e90 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -9,7 +9,6 @@ aggregate_embedded_text_by_block, bboxes1_is_almost_subregion_of_bboxes2, boxes_self_iou, - clean_pdfminer_duplicate_image_elements, clean_pdfminer_inner_elements, remove_duplicate_elements, ) @@ -129,23 +128,6 @@ def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_doc ] -@pytest.mark.parametrize( - ("elements", "expected_document_length"), - [ - (elements_with_duplicate_images, 2), - (elements_without_duplicate_images, 4), - ], -) -def test_clean_pdfminer_duplicate_image_elements(elements, expected_document_length): - page = PageLayout(number=1, image=Image.new("1", (1, 1))) - page.elements = elements - document = DocumentLayout(pages=[page]) - - cleaned_doc = clean_pdfminer_duplicate_image_elements(document) - - assert len(cleaned_doc.pages[0].elements) == expected_document_length - - def test_aggregate_by_block(): expected = "Inside region1 Inside region2" embedded_regions = [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index de4688b312..89b723d097 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.13-dev2" # pragma: no cover +__version__ = "0.15.13-dev3" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 37ad78ea6d..3dad1f9960 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -68,7 +68,6 @@ save_elements, ) from unstructured.partition.pdf_image.pdfminer_processing import ( - clean_pdfminer_duplicate_image_elements, clean_pdfminer_inner_elements, merge_inferred_with_extracted_layout, ) @@ -712,7 +711,6 @@ def _partition_pdf_or_image_local( if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1": kwargs["sort_mode"] = SORT_MODE_DONT - final_document_layout = clean_pdfminer_duplicate_image_elements(final_document_layout) final_document_layout = clean_pdfminer_inner_elements(final_document_layout) for page in final_document_layout.pages: diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index ca910db07d..0836292c8a 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -3,7 +3,6 @@ import numpy as np from pdfminer.utils import open_filename -from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( extract_image_objects, @@ -268,43 +267,6 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout return document -def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "DocumentLayout": - """Removes duplicate image elements extracted by PDFMiner from a document layout.""" - - for page in document.pages: - image_bboxes = [] - texts = [] - bbox_to_iou_mapping = {} - current_idx = 0 - for i, element in enumerate(page.elements): - if element.source != Source.PDFMINER or element.type != ElementType.IMAGE: - continue - image_bboxes.append(element.bbox) - texts.append(element.text) - bbox_to_iou_mapping[i] = current_idx - current_idx += 1 - - iou = boxes_self_iou(image_bboxes, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD) - - filtered_elements = [] - for i, element in enumerate(page.elements[:-1]): - if element.source != Source.PDFMINER or element.type != ElementType.IMAGE: - filtered_elements.append(element) - continue - text = element.text - this_idx = bbox_to_iou_mapping[i] - if any( - text == texts[potential_match + this_idx + 1] - for potential_match in np.where(iou[this_idx, this_idx + 1 :])[0] - ): - continue - else: - filtered_elements.append(element) - page.elements[:-1] = filtered_elements - - return document - - @requires_dependencies("unstructured_inference") def remove_duplicate_elements( elements: list["TextRegion"],