diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c33bfc91a..e2b8877b92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.15.1-dev7 +## 0.15.1-dev8 ### Enhancements +* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning. + ### Features * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d05dee38d4..31c20c9568 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev7" # pragma: no cover +__version__ = "0.15.1-dev8" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 83e4bab67f..4759d4b610 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -5,7 +5,7 @@ from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( - get_images_from_pdf_element, + extract_image_objects, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -51,32 +51,26 @@ def process_data_with_pdfminer( for page, page_layout in open_pdfminer_pages_generator(file): height = page_layout.height - layout: List["TextRegion"] = [] + layout: list["TextRegion"] = [] for obj in page_layout: x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) if hasattr(obj, "get_text"): _text = obj.get_text() - element_class = EmbeddedTextRegion # type: ignore + text_region = _create_text_region( + x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion + ) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) else: - embedded_images = get_images_from_pdf_element(obj) - if len(embedded_images) > 0: - _text = None - element_class = ImageTextRegion # type: ignore - else: - continue - - text_region = element_class.from_coords( - x1 * coef, - y1 * coef, - x2 * coef, - y2 * coef, - text=_text, - source=Source.PDFMINER, - ) - - if text_region.bbox is not None and text_region.bbox.area > 0: - layout.append(text_region) + inner_image_objects = extract_image_objects(obj) + for img_obj in inner_image_objects: + new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height) + text_region = _create_text_region( + new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion + ) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) # NOTE(christine): always do the basic sort first for deterministic order across # python versions. @@ -90,6 +84,18 @@ def process_data_with_pdfminer( return layouts +def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class): + """Creates a text region of the specified class with scaled coordinates.""" + return region_class.from_coords( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=text, + source=source, + ) + + @requires_dependencies("unstructured_inference") def merge_inferred_with_extracted_layout( inferred_document_layout: "DocumentLayout", diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index c35a4dedd3..fce84de0bd 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,8 +1,8 @@ import tempfile -from typing import Any, BinaryIO, List, Tuple +from typing import BinaryIO, List, Tuple from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage +from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PSSyntaxError @@ -20,39 +20,17 @@ def init_pdfminer(): return device, interpreter -def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]: - """ - Recursively extracts LTImage objects from a PDF layout element. - - This function takes a PDF layout element (could be LTImage or LTContainer) and recursively - extracts all LTImage objects contained within it. - - Parameters: - - layout_object (Any): The PDF layout element to extract images from. +def extract_image_objects(parent_object: LTItem) -> List[LTImage]: + """Recursively extracts image objects from a given parent object in a PDF document.""" + objects = [] - Returns: - - List[LTImage]: A list of LTImage objects extracted from the layout object. - - Note: - - This function recursively traverses through the layout_object to find and accumulate all - LTImage objects. - - If the input layout_object is an LTImage, it will be included in the returned list. - - If the input layout_object is an LTContainer, the function will recursively search its - children for LTImage objects. - - If the input layout_object is neither LTImage nor LTContainer, an empty list will be - returned. - """ + if isinstance(parent_object, LTImage): + objects.append(parent_object) + elif isinstance(parent_object, LTContainer): + for child in parent_object: + objects.extend(extract_image_objects(child)) - # recursively locate Image objects in layout_object - if isinstance(layout_object, LTImage): - return [layout_object] - if isinstance(layout_object, LTContainer): - img_list: List[LTImage] = [] - for child in layout_object: - img_list = img_list + get_images_from_pdf_element(child) - return img_list - else: - return [] + return objects def rect_to_bbox(