From 4f298c7a0931883e0dd9f77ea3a03dd7fd43be3d Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 31 Jul 2024 09:12:43 -0700 Subject: [PATCH 1/5] feat: add functionality to extract all inner objects --- .../pdf_image/pdfminer_processing.py | 42 +++++++++---------- .../partition/pdf_image/pdfminer_utils.py | 27 +++++++++++- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 83e4bab67f..6758ef5918 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -5,7 +5,7 @@ from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( - get_images_from_pdf_element, + extract_text_and_image_objects, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -53,30 +53,28 @@ def process_data_with_pdfminer( layout: List["TextRegion"] = [] for obj in page_layout: - x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) + inner_objects = extract_text_and_image_objects(obj) + for inner_obj in inner_objects: + x1, y1, x2, y2 = rect_to_bbox(inner_obj.bbox, height) - if hasattr(obj, "get_text"): - _text = obj.get_text() - element_class = EmbeddedTextRegion # type: ignore - else: - embedded_images = get_images_from_pdf_element(obj) - if len(embedded_images) > 0: - _text = None - element_class = ImageTextRegion # type: ignore + if hasattr(inner_obj, "get_text"): + _text = inner_obj.get_text() + element_class = EmbeddedTextRegion # type: ignore else: - continue - - text_region = element_class.from_coords( - x1 * coef, - y1 * coef, - x2 * coef, - y2 * coef, - text=_text, - source=Source.PDFMINER, - ) + _text = None + element_class = ImageTextRegion + + text_region = element_class.from_coords( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=_text, + source=Source.PDFMINER, + ) - if text_region.bbox is not None and text_region.bbox.area > 0: - layout.append(text_region) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) # NOTE(christine): always do the basic sort first for deterministic order across # python versions. diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index c35a4dedd3..f1ad39a447 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -2,7 +2,7 @@ from typing import Any, BinaryIO, List, Tuple from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTContainer, LTImage +from pdfminer.layout import LAParams, LTComponent, LTContainer, LTImage from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PSSyntaxError @@ -20,6 +20,31 @@ def init_pdfminer(): return device, interpreter +def extract_text_and_image_objects(parent_object) -> List[LTComponent]: + """ + Recursively extract text and image objects from a given parent object in a PDF document. + + This function navigates through the PDF's layout tree and collects objects that contain text + (objects with a 'get_text' method) or are images (instances of LTImage). + + Args: + parent_object: The root object from which to start the extraction. This could be an + instance of LTContainer, LTText, LTImage, or any other pdfminer layout object. + + Returns: + A list of LTComponent objects which are either text-containing objects or images. + """ + objects = [] + + if hasattr(parent_object, "get_text") or isinstance(parent_object, LTImage): + objects.append(parent_object) + elif isinstance(parent_object, LTContainer): + for child in parent_object: + objects.extend(extract_text_and_image_objects(child)) + + return objects + + def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]: """ Recursively extracts LTImage objects from a PDF layout element. From fa53e53a0d82294365222e2a5a676ac795a49d3d Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 31 Jul 2024 09:13:42 -0700 Subject: [PATCH 2/5] refactor: remove unused `get_images_from_pdf_element` --- .../partition/pdf_image/pdfminer_utils.py | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index f1ad39a447..71eaea3a56 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -45,41 +45,6 @@ def extract_text_and_image_objects(parent_object) -> List[LTComponent]: return objects -def get_images_from_pdf_element(layout_object: Any) -> List[LTImage]: - """ - Recursively extracts LTImage objects from a PDF layout element. - - This function takes a PDF layout element (could be LTImage or LTContainer) and recursively - extracts all LTImage objects contained within it. - - Parameters: - - layout_object (Any): The PDF layout element to extract images from. - - Returns: - - List[LTImage]: A list of LTImage objects extracted from the layout object. - - Note: - - This function recursively traverses through the layout_object to find and accumulate all - LTImage objects. - - If the input layout_object is an LTImage, it will be included in the returned list. - - If the input layout_object is an LTContainer, the function will recursively search its - children for LTImage objects. - - If the input layout_object is neither LTImage nor LTContainer, an empty list will be - returned. - """ - - # recursively locate Image objects in layout_object - if isinstance(layout_object, LTImage): - return [layout_object] - if isinstance(layout_object, LTContainer): - img_list: List[LTImage] = [] - for child in layout_object: - img_list = img_list + get_images_from_pdf_element(child) - return img_list - else: - return [] - - def rect_to_bbox( rect: Tuple[float, float, float, float], height: float, From 9cd925464a776c0286a4f62fee8d26ce92470d11 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 31 Jul 2024 16:12:19 -0700 Subject: [PATCH 3/5] feat: update `ImageTextRegion` extraction --- .../pdf_image/pdfminer_processing.py | 50 +++++++++++-------- .../partition/pdf_image/pdfminer_utils.py | 24 +++------ 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 6758ef5918..4759d4b610 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -5,7 +5,7 @@ from unstructured.documents.elements import ElementType from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( - extract_text_and_image_objects, + extract_image_objects, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -51,30 +51,26 @@ def process_data_with_pdfminer( for page, page_layout in open_pdfminer_pages_generator(file): height = page_layout.height - layout: List["TextRegion"] = [] + layout: list["TextRegion"] = [] for obj in page_layout: - inner_objects = extract_text_and_image_objects(obj) - for inner_obj in inner_objects: - x1, y1, x2, y2 = rect_to_bbox(inner_obj.bbox, height) - - if hasattr(inner_obj, "get_text"): - _text = inner_obj.get_text() - element_class = EmbeddedTextRegion # type: ignore - else: - _text = None - element_class = ImageTextRegion - - text_region = element_class.from_coords( - x1 * coef, - y1 * coef, - x2 * coef, - y2 * coef, - text=_text, - source=Source.PDFMINER, - ) + x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) + if hasattr(obj, "get_text"): + _text = obj.get_text() + text_region = _create_text_region( + x1, y1, x2, y2, coef, _text, Source.PDFMINER, EmbeddedTextRegion + ) if text_region.bbox is not None and text_region.bbox.area > 0: layout.append(text_region) + else: + inner_image_objects = extract_image_objects(obj) + for img_obj in inner_image_objects: + new_x1, new_y1, new_x2, new_y2 = rect_to_bbox(img_obj.bbox, height) + text_region = _create_text_region( + new_x1, new_y1, new_x2, new_y2, coef, None, Source.PDFMINER, ImageTextRegion + ) + if text_region.bbox is not None and text_region.bbox.area > 0: + layout.append(text_region) # NOTE(christine): always do the basic sort first for deterministic order across # python versions. @@ -88,6 +84,18 @@ def process_data_with_pdfminer( return layouts +def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class): + """Creates a text region of the specified class with scaled coordinates.""" + return region_class.from_coords( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=text, + source=source, + ) + + @requires_dependencies("unstructured_inference") def merge_inferred_with_extracted_layout( inferred_document_layout: "DocumentLayout", diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 71eaea3a56..fce84de0bd 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,8 +1,8 @@ import tempfile -from typing import Any, BinaryIO, List, Tuple +from typing import BinaryIO, List, Tuple from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTComponent, LTContainer, LTImage +from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PSSyntaxError @@ -20,27 +20,15 @@ def init_pdfminer(): return device, interpreter -def extract_text_and_image_objects(parent_object) -> List[LTComponent]: - """ - Recursively extract text and image objects from a given parent object in a PDF document. - - This function navigates through the PDF's layout tree and collects objects that contain text - (objects with a 'get_text' method) or are images (instances of LTImage). - - Args: - parent_object: The root object from which to start the extraction. This could be an - instance of LTContainer, LTText, LTImage, or any other pdfminer layout object. - - Returns: - A list of LTComponent objects which are either text-containing objects or images. - """ +def extract_image_objects(parent_object: LTItem) -> List[LTImage]: + """Recursively extracts image objects from a given parent object in a PDF document.""" objects = [] - if hasattr(parent_object, "get_text") or isinstance(parent_object, LTImage): + if isinstance(parent_object, LTImage): objects.append(parent_object) elif isinstance(parent_object, LTContainer): for child in parent_object: - objects.extend(extract_text_and_image_objects(child)) + objects.extend(extract_image_objects(child)) return objects From be9141d66de8cf5a45e00ac6c4bee6a740c225fe Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 31 Jul 2024 16:14:48 -0700 Subject: [PATCH 4/5] chore: bump version --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c33bfc91a..512a1bca43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev7 +## 0.15.1-dev8 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d05dee38d4..31c20c9568 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev7" # pragma: no cover +__version__ = "0.15.1-dev8" # pragma: no cover From 109e1c1ce7f5c234a95144fb39b99a8822c1fff3 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Wed, 31 Jul 2024 17:46:27 -0700 Subject: [PATCH 5/5] chore: update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 512a1bca43..e2b8877b92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Improve `pdfminer` embedded `image` extraction to exclude text elements and produce more accurate bounding boxes.** This results in cleaner, more precise element extraction in `pdf` partitioning. + ### Features * **Mark ingest as deprecated** Begin sunset of ingest code in this repo as it's been moved to a dedicated repo.