diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e8f3a6dc6..c1569f0097 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.15.10-dev0 +## 0.15.10-dev1 ### Enhancements +* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances. ### Features diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index d730e5e64a..3d41d6e304 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1336,33 +1336,67 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids): assert ids == expected_ids, "Element IDs do not match expected IDs" -def test_analysis_artifacts_saved(): +@pytest.mark.parametrize("is_path", [True, False]) +@pytest.mark.parametrize( + ("example_doc", "doc_pages"), + [ + ("pdf/layout-parser-paper-fast.pdf", 2), + ("img/DA-1p.png", 1), + ], +) +def test_analysis_artifacts_saved(is_path: bool, example_doc: str, doc_pages: int): with tempfile.TemporaryDirectory() as temp_dir: - filename = example_doc_path("pdf/layout-parser-paper-fast.pdf") + file = None + filename = example_doc_path(example_doc) + is_image = not Path(filename).suffix.endswith("pdf") + if not is_path: + file = open(filename, "rb") # noqa: SIM115 + filename = None pdf.partition_pdf( filename=filename, + file=file, + is_image=is_image, strategy=PartitionStrategy.HI_RES, analysis=True, analyzed_image_output_dir_path=temp_dir, ) analysis_dir = Path(temp_dir) - layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump" + file_analysis_root = None + if is_path: + file_analysis_root = analysis_dir / "analysis" / Path(example_doc).stem + else: + # if 
file is not a path, the filename is None and the analysis directory + # for the document is generated + generated_file_stem_path = list((analysis_dir / "analysis").iterdir())[0] + if is_image: + assert "image" in generated_file_stem_path.name + else: + assert "pdf" in generated_file_stem_path.name + file_analysis_root = generated_file_stem_path + layout_dump_dir = file_analysis_root / "layout_dump" assert layout_dump_dir.exists() layout_dump_files = list(layout_dump_dir.iterdir()) - assert len(layout_dump_files) == 1 - assert (layout_dump_dir / "object_detection.json").exists() - bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes" + expected_layout_dumps = ["object_detection", "ocr", "pdfminer", "final"] + assert len(layout_dump_files) == len(expected_layout_dumps) + + for expected_layout_dump in expected_layout_dumps: + assert (layout_dump_dir / f"{expected_layout_dump}.json").exists() + + bboxes_dir = file_analysis_root / "bboxes" assert bboxes_dir.exists() bboxes_files = list(bboxes_dir.iterdir()) - assert len(bboxes_files) == 2 * 4 # 2 pages * 4 different layouts per page - expected_layouts = ["od_model", "ocr", "pdfminer", "final"] - expected_pages = [1, 2] - for el in expected_layouts: + expected_renders = ["od_model", "ocr", "pdfminer", "final"] + assert len(bboxes_files) == doc_pages * len(expected_renders) + + expected_pages = range(1, doc_pages + 1) + for el in expected_renders: for page in expected_pages: assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files + if file: + file.close() @pytest.mark.parametrize( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 44a028a643..fd4e3cfe31 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.10-dev0" # pragma: no cover +__version__ = "0.15.10-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 4cfb0b8516..d35a84812e 100644 --- 
a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -53,16 +53,12 @@ prepare_languages_for_tesseract, tesseract_to_paddle_language, ) -from unstructured.partition.pdf_image.analysis.bbox_visualisation import ( - AnalysisDrawer, - FinalLayoutDrawer, - OCRLayoutDrawer, - ODModelLayoutDrawer, - PdfminerLayoutDrawer, -) +from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts from unstructured.partition.pdf_image.analysis.layout_dump import ( - JsonLayoutDumper, + ExtractedLayoutDumper, + FinalLayoutDumper, ObjectDetectionLayoutDumper, + OCRLayoutDumper, ) from unstructured.partition.pdf_image.form_extraction import run_form_extraction from unstructured.partition.pdf_image.pdf_image_utils import ( @@ -589,12 +585,12 @@ def _partition_pdf_or_image_local( f"(currently {pdf_image_dpi}).", ) - pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None - od_model_drawer: Optional[ODModelLayoutDrawer] = None - ocr_drawer: Optional[OCRLayoutDrawer] = None od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None - skip_bboxes = env_config.ANALYSIS_BBOX_SKIP - skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP + extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None + ocr_layout_dumper: Optional[OCRLayoutDumper] = None + final_layout_dumper: Optional[FinalLayoutDumper] = None + + skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP if file is None: inferred_document_layout = process_file_with_model( @@ -624,19 +620,15 @@ def _partition_pdf_or_image_local( else: analyzed_image_output_dir_path = str(Path.cwd() / "annotated") os.makedirs(analyzed_image_output_dir_path, exist_ok=True) - if not skip_bboxes: - pdfminer_drawer = PdfminerLayoutDrawer( - layout=extracted_layout, - ) - od_model_drawer = ODModelLayoutDrawer( - layout=inferred_document_layout, - ) - ocr_drawer = OCRLayoutDrawer() - if not skip_dump_od: + if not skip_analysis_dump: od_model_layout_dumper = ObjectDetectionLayoutDumper( layout=inferred_document_layout, 
model_name=hi_res_model_name, ) + extracted_layout_dumper = ExtractedLayoutDumper( + layout=extracted_layout, + ) + ocr_layout_dumper = OCRLayoutDumper() # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout merged_document_layout = merge_inferred_with_extracted_layout( inferred_document_layout=inferred_document_layout, @@ -653,7 +645,7 @@ def _partition_pdf_or_image_local( ocr_languages=ocr_languages, ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, - ocr_drawer=ocr_drawer, + ocr_layout_dumper=ocr_layout_dumper, ) else: inferred_document_layout = process_data_with_model( @@ -685,14 +677,15 @@ def _partition_pdf_or_image_local( ) else: analyzed_image_output_dir_path = str(Path.cwd() / "annotated") - os.makedirs(analyzed_image_output_dir_path, exist_ok=True) - pdfminer_drawer = PdfminerLayoutDrawer( - layout=extracted_layout, - ) - od_model_drawer = ODModelLayoutDrawer( - layout=inferred_document_layout, - ) - ocr_drawer = OCRLayoutDrawer() + if not skip_analysis_dump: + od_model_layout_dumper = ObjectDetectionLayoutDumper( + layout=inferred_document_layout, + model_name=hi_res_model_name, + ) + extracted_layout_dumper = ExtractedLayoutDumper( + layout=extracted_layout, + ) + ocr_layout_dumper = OCRLayoutDumper() # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout merged_document_layout = merge_inferred_with_extracted_layout( @@ -712,7 +705,7 @@ def _partition_pdf_or_image_local( ocr_languages=ocr_languages, ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, - ocr_drawer=ocr_drawer, + ocr_layout_dumper=ocr_layout_dumper, ) # NOTE(alan): starting with v2, chipper sorts the elements itself. 
@@ -801,38 +794,29 @@ def _partition_pdf_or_image_local( ) out_elements.extend(forms) - if analysis and not skip_bboxes: - final_drawer = FinalLayoutDrawer( - layout=out_elements, - ) - analysis_drawer = AnalysisDrawer( - filename=filename, - save_dir=analyzed_image_output_dir_path, - draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID, - draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION, - resize=env_config.ANALYSIS_BBOX_RESIZE, - format=env_config.ANALYSIS_BBOX_FORMAT, - ) - - if od_model_drawer: - analysis_drawer.add_drawer(od_model_drawer) - - if pdfminer_drawer: - analysis_drawer.add_drawer(pdfminer_drawer) - - if ocr_drawer: - analysis_drawer.add_drawer(ocr_drawer) - analysis_drawer.add_drawer(final_drawer) - analysis_drawer.process() - - if analysis and not skip_dump_od: - json_layout_dumper = JsonLayoutDumper( + if analysis: + if not skip_analysis_dump: + final_layout_dumper = FinalLayoutDumper( + layout=out_elements, + ) + layout_dumpers = [] + if od_model_layout_dumper: + layout_dumpers.append(od_model_layout_dumper) + if extracted_layout_dumper: + layout_dumpers.append(extracted_layout_dumper) + if ocr_layout_dumper: + layout_dumpers.append(ocr_layout_dumper) + if final_layout_dumper: + layout_dumpers.append(final_layout_dumper) + save_analysis_artifiacts( + *layout_dumpers, filename=filename, - save_dir=analyzed_image_output_dir_path, + file=file, + is_image=is_image, + analyzed_image_output_dir_path=analyzed_image_output_dir_path, + skip_bboxes=env_config.ANALYSIS_BBOX_SKIP, + skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP, ) - if od_model_layout_dumper: - json_layout_dumper.add_layout_dumper(od_model_layout_dumper) - json_layout_dumper.process() return out_elements diff --git a/unstructured/partition/pdf_image/analysis/__init__.py b/unstructured/partition/pdf_image/analysis/__init__.py index e69de29bb2..c75a7a7963 100644 --- a/unstructured/partition/pdf_image/analysis/__init__.py +++ b/unstructured/partition/pdf_image/analysis/__init__.py @@ -0,0 +1,172 @@ 
import json
import logging
import uuid
from io import BytesIO
from pathlib import Path
from typing import Optional

from unstructured import env_config
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
    AnalysisDrawer,
    FinalLayoutDrawer,
    LayoutDrawer,
    OCRLayoutDrawer,
    ODModelLayoutDrawer,
    PdfminerLayoutDrawer,
)
from unstructured.partition.pdf_image.analysis.layout_dump import (
    ExtractedLayoutDumper,
    FinalLayoutDumper,
    JsonLayoutDumper,
    LayoutDumper,
    ObjectDetectionLayoutDumper,
    OCRLayoutDumper,
)

logger = logging.getLogger(__name__)

# Maps the stem of a layout dump file (``<stem>.json``) to the drawer class
# that knows how to render that dump.
_DRAWER_BY_DUMP_STEM = {
    "final": FinalLayoutDrawer,
    "object_detection": ODModelLayoutDrawer,
    "ocr": OCRLayoutDrawer,
    "pdfminer": PdfminerLayoutDrawer,
}


def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]:
    """Return the layout drawer matching a layout dumper.

    The drawer is initialized with the dict produced by ``dumper.dump()``.

    Args:
        dumper: The layout dumper instance.

    Returns:
        The corresponding layout drawer instance.

    Raises:
        ValueError: If the dumper is not one of the known dumper types.
    """
    if isinstance(dumper, ObjectDetectionLayoutDumper):
        return ODModelLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, ExtractedLayoutDumper):
        return PdfminerLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, OCRLayoutDumper):
        return OCRLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, FinalLayoutDumper):
        return FinalLayoutDrawer(layout_dump=dumper.dump())
    raise ValueError(f"Unknown dumper type: {dumper}")


def _generate_filename(is_image: bool) -> str:
    """Generate a placeholder filename for a document that has no path.

    The name is ``image_<suffix>.png`` for images and ``pdf_<suffix>.pdf``
    otherwise, where ``<suffix>`` is the first 5 hex characters of a random
    UUID.
    """
    suffix = uuid.uuid4().hex[:5]
    if is_image:
        return f"image_{suffix}.png"
    return f"pdf_{suffix}.pdf"


def save_analysis_artifiacts(
    *layout_dumpers: LayoutDumper,
    is_image: bool,
    analyzed_image_output_dir_path: str,
    filename: Optional[str] = None,
    file: Optional[BytesIO] = None,
    skip_bboxes: bool = False,
    skip_dump_od: bool = False,
):
    """Save the analysis artifacts (layout dumps and bbox renders) for a file.

    Rendering settings (grid, caption, resize, format) are read from the
    environment configuration.

    Args:
        layout_dumpers: The layout dumpers to save and to use for bboxes
            rendering. Nothing is written when none are given.
        is_image: Whether the analyzed document is an image rather than a pdf.
        analyzed_image_output_dir_path: The directory to save the analysis
            artifacts in. It is created if it does not exist.
        filename: The filename of the analyzed source file (pdf/image). When
            empty, a random placeholder name is generated.
        file: Optional file-like object with the document content; forwarded
            to the drawer together with ``filename``.
        skip_bboxes: Do not render the bounding-box images.
        skip_dump_od: Do not write the JSON layout dumps.
    """
    # Each flag disables only its own artifact; bail out early only when both
    # artifacts are disabled (or there is nothing to dump/draw at all).
    if skip_bboxes and skip_dump_od:
        return
    if not layout_dumpers:
        return

    if not filename:
        filename = _generate_filename(is_image)

    output_path = Path(analyzed_image_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)

    if not skip_dump_od:
        json_layout_dumper = JsonLayoutDumper(
            filename=filename,
            save_dir=output_path,
        )
        for layout_dumper in layout_dumpers:
            json_layout_dumper.add_layout_dumper(layout_dumper)
        json_layout_dumper.process()

    if not skip_bboxes:
        analysis_drawer = AnalysisDrawer(
            filename=filename,
            file=file,
            is_image=is_image,
            save_dir=output_path,
            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
            resize=env_config.ANALYSIS_BBOX_RESIZE,
            format=env_config.ANALYSIS_BBOX_FORMAT,
        )
        for layout_dumper in layout_dumpers:
            analysis_drawer.add_drawer(_get_drawer_for_dumper(layout_dumper))
        analysis_drawer.process()


def render_bboxes_for_file(
    filename: str,
    analyzed_image_output_dir_path: str,
    renders_output_dir_path: Optional[str] = None,
):
    """Render the bounding boxes for the layout dump files of a document.

    To be used for analysis after a partition run that only dumped the
    layouts - the bboxes can be rendered later from those dumps.

    The dumps are looked up in
    ``<analyzed_image_output_dir_path>/analysis/<filename stem>/layout_dump``.
    If that directory does not exist, or holds no known dump, nothing is
    rendered.

    Args:
        filename: The filename of the analyzed source file (pdf/image). A
            name whose suffix does not end with ``pdf`` is treated as an image.
        analyzed_image_output_dir_path: The root directory of the analysis
            artifacts, i.e. the directory that was passed to
            ``save_analysis_artifiacts``.
        renders_output_dir_path: Optional directory to save the rendered
            bboxes in - if not provided, the ``bboxes`` directory next to the
            layout dumps is used.
    """
    source_path = Path(filename)
    filename_stem = source_path.stem
    is_image = not source_path.suffix.endswith("pdf")
    file_analysis_dir = Path(analyzed_image_output_dir_path) / "analysis" / filename_stem
    analysis_dumps_dir = file_analysis_dir / "layout_dump"
    logger.debug("analysis_dumps_dir: %s", analysis_dumps_dir)
    if not analysis_dumps_dir.exists():
        return

    layout_drawers = []
    # iterdir() yields entries in arbitrary order; sort so that the drawers
    # are always added in the same order.
    for dump_path in sorted(analysis_dumps_dir.iterdir()):
        if not dump_path.is_file():
            continue
        drawer_class = _DRAWER_BY_DUMP_STEM.get(dump_path.stem)
        if drawer_class is None:
            # not a layout dump this module knows how to draw
            continue
        with open(dump_path) as f:
            layout_dump = json.load(f)
        layout_drawers.append(drawer_class(layout_dump=layout_dump))

    if not layout_drawers:
        return

    # NOTE(review): save_analysis_artifiacts hands AnalysisDrawer the *root*
    # output directory, while the default here is the per-file "bboxes"
    # directory - confirm AnalysisDrawer does not nest another
    # "analysis/<stem>/bboxes" below it.
    if renders_output_dir_path:
        output_path = Path(renders_output_dir_path)
    else:
        output_path = file_analysis_dir / "bboxes"
    output_path.mkdir(parents=True, exist_ok=True)

    analysis_drawer = AnalysisDrawer(
        filename=filename,
        save_dir=output_path,
        is_image=is_image,
        draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
        draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
        resize=env_config.ANALYSIS_BBOX_RESIZE,
        format=env_config.ANALYSIS_BBOX_FORMAT,
    )
    for drawer in layout_drawers:
        analysis_drawer.add_drawer(drawer)
    analysis_drawer.process()
a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py +++ b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py @@ -1,25 +1,20 @@ -import copy import logging import math import tempfile from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum +from io import BytesIO from pathlib import Path -from typing import Collection, Generator, List, Optional, TypeVar, Union +from typing import Any, Generator, List, Optional, TypeVar, Union import numpy as np from matplotlib import colors, font_manager from PIL import Image, ImageDraw, ImageFont from unstructured_inference.constants import ElementType -from unstructured_inference.inference.elements import TextRegion -from unstructured_inference.inference.layout import DocumentLayout -from unstructured_inference.inference.layoutelement import LayoutElement -from unstructured.documents.elements import Element, Text from unstructured.partition.pdf_image.analysis.processor import AnalysisProcessor from unstructured.partition.pdf_image.pdf_image_utils import convert_pdf_to_image -from unstructured.partition.utils.sorting import coordinates_to_bbox PageImage = TypeVar("PageImage", Image.Image, np.ndarray) @@ -390,51 +385,73 @@ def draw_bbox_on_image( class LayoutDrawer(ABC): layout_source: str = "unknown" + laytout_dump: dict + + def __init__(self, layout_dump: dict): + self.layout_dump = layout_dump - @abstractmethod def draw_layout_on_page(self, page_image: Image.Image, page_num: int) -> Image.Image: """Draw the layout bboxes with additional metadata on the image.""" + layout_pages = self.layout_dump.get("pages") + if not layout_pages: + print(f"Warning: layout in drawer {self.__class__.__name__} is empty - skipping") + return page_image + if len(layout_pages) < page_num: + print(f"Error! 
Page {page_num} not found in layout (pages: {len(layout_pages)})") + return page_image + image_draw = ImageDraw.ImageDraw(page_image) + page_layout_dump = layout_pages[page_num - 1] + if page_num != page_layout_dump.get("number"): + dump_page_num = page_layout_dump.get("number") + print(f"Warning: Requested page num {page_num} differs from dump {dump_page_num}") + for idx, elements in enumerate(page_layout_dump["elements"], 1): + self.render_element_on_page(idx, image_draw, elements) + return page_image + + @abstractmethod + def render_element_on_page(self, idx: int, image_draw: ImageDraw, elements: dict[str, Any]): + """Draw a single element on the image.""" class SimpleLayoutDrawer(LayoutDrawer, ABC): - layout: list[list[TextRegion]] color: str show_order: bool = False show_text_length: bool = False - def draw_layout_on_page(self, page_image: Image.Image, page_num: int) -> Image.Image: - if not self.layout: - print(f"Warning: layout in drawer {self.__class__.__name__} is empty - skipping") - return page_image - if len(self.layout) < page_num: - print(f"Error! 
Page {page_num} not found in layout (pages: {len(self.layout)})") - return page_image - image_draw = ImageDraw.ImageDraw(page_image) - page_layout = self.layout[page_num - 1] - for idx, region in enumerate(page_layout): - text_len = len(region.text) if region.text else 0 - element_prob = getattr(region, "prob", None) - element_order = f"{idx + 1}" if self.show_order else None - text_len = f"len: {text_len}" if self.show_text_length else None - bbox = BBox( - points=(region.bbox.x1, region.bbox.y1, region.bbox.x2, region.bbox.y2), - labels=BboxLabels( - top_right=f"prob: {element_prob:.2f}" if element_prob else None, - bottom_left=text_len, - center=element_order, - ), - ) - draw_bbox_on_image(image_draw, bbox, color=self.color) - return page_image + def render_element_on_page(self, idx: int, image_draw: ImageDraw, elements: dict[str, Any]): + text_len = len(elements["text"]) if elements.get("text") else 0 + element_prob = elements.get("prob") + element_order = f"{idx}" if self.show_order else None + text_len = f"len: {text_len}" if self.show_text_length else None + bbox = BBox( + points=elements["bbox"], + labels=BboxLabels( + top_right=f"prob: {element_prob:.2f}" if element_prob else None, + bottom_left=text_len, + center=element_order, + ), + ) + draw_bbox_on_image(image_draw, bbox, color=self.color) class PdfminerLayoutDrawer(SimpleLayoutDrawer): layout_source = "pdfminer" - def __init__(self, layout: List[List[TextRegion]], color: str = "red"): - self.layout = copy.deepcopy(layout) + def __init__(self, layout_dump: dict, color: str = "red"): + self.layout_dump = layout_dump self.color = color self.show_order = True + super().__init__(layout_dump) + + +class OCRLayoutDrawer(SimpleLayoutDrawer): + layout_source = "ocr" + + def __init__(self, layout_dump: dict, color: str = "red"): + self.color = color + self.show_order = False + self.show_text_length = False + super().__init__(layout_dump) class ODModelLayoutDrawer(LayoutDrawer): @@ -454,56 +471,24 @@ class 
ODModelLayoutDrawer(LayoutDrawer): ElementType.TITLE: "greenyellow", } - def __init__(self, layout: DocumentLayout): - self.layout: list[Collection[LayoutElement]] = copy.deepcopy( - [page.elements for page in layout.pages] + def render_element_on_page(self, idx: int, image_draw: ImageDraw, elements: dict[str, Any]): + element_type = elements["type"] + element_prob = elements.get("prob") + bbox_points = elements["bbox"] + color = self.get_element_type_color(element_type) + bbox = BBox( + points=bbox_points, + labels=BboxLabels( + top_left=f"{element_type}", + top_right=f"prob: {element_prob:.2f}" if element_prob else None, + ), ) - - def draw_layout_on_page(self, page_image: Image.Image, page_num: int) -> Image.Image: - if not self.layout: - print(f"Warning: layout in drawer {self.__class__.__name__} is empty - skipping") - return page_image - if len(self.layout) < page_num: - print(f"Error! Page {page_num} not found in layout (pages: {len(self.layout)})") - return page_image - image_draw = ImageDraw.ImageDraw(page_image) - page_layout = self.layout[page_num - 1] - for layout_element in page_layout: - element_type = layout_element.type - element_prob = layout_element.prob - color = self.get_element_type_color(element_type) - bbox = BBox( - points=( - layout_element.bbox.x1, - layout_element.bbox.y1, - layout_element.bbox.x2, - layout_element.bbox.y2, - ), - labels=BboxLabels( - top_left=f"{element_type}", - top_right=f"prob: {element_prob:.2f}" if element_prob else None, - ), - ) - draw_bbox_on_image(image_draw, bbox, color=color) - return page_image + draw_bbox_on_image(image_draw, bbox, color=color) def get_element_type_color(self, element_type: str) -> str: return self.color_map.get(element_type, "cyan") -class OCRLayoutDrawer(SimpleLayoutDrawer): - layout_source = "ocr" - - def __init__(self, color: str = "red"): - self.color = color - self.layout: list[list[TextRegion]] = [] - self.show_order = False - self.show_text_length = False - - def add_ocred_page(self, 
page_layout: list[TextRegion]): - self.layout.append(copy.deepcopy(page_layout)) - - class FinalLayoutDrawer(LayoutDrawer): layout_source = "final" @@ -523,36 +508,27 @@ class FinalLayoutDrawer(LayoutDrawer): "PageNumber": "crimson", } - def __init__(self, layout: List[Element]): - self.layout = layout - - def draw_layout_on_page(self, page_image: Image.Image, page_num: int) -> Image.Image: - image_draw = ImageDraw.ImageDraw(page_image) - elements_for_page = [ - element for element in self.layout if element.metadata.page_number == page_num - ] - for idx, element in enumerate(elements_for_page): - element_order = idx + 1 - element_type = ( - element.category if isinstance(element, Text) else str(element.__class__.__name__) - ) - element_prob = getattr(element.metadata, "detection_class_prob", None) - text_len = len(element.text) - bbox_points = coordinates_to_bbox(element.metadata.coordinates) - color = self.get_element_type_color(element_type) - cluster = getattr(element.metadata, "cluster", None) - bbox = BBox( - points=bbox_points, - labels=BboxLabels( - top_left=f"{element_type}", - top_right=f"prob: {element_prob:.2f}" if element_prob else None, - bottom_right=f"len: {text_len}", - bottom_left=f"cl: {cluster}" if cluster else None, - center=f"{element_order}", - ), - ) - draw_bbox_on_image(image_draw, bbox, color=color) - return page_image + def __init__(self, layout_dump: dict): + self.layout_dump = layout_dump + + def render_element_on_page(self, idx: int, image_draw: ImageDraw, elements: dict[str, Any]): + element_type = elements["type"] + element_prob = elements.get("prob") + text_len = len(elements["text"]) if elements.get("text") else 0 + bbox_points = elements["bbox"] + color = self.get_element_type_color(element_type) + cluster = elements.get("cluster") + bbox = BBox( + points=bbox_points, + labels=BboxLabels( + top_left=f"{element_type}", + top_right=f"prob: {element_prob:.2f}" if element_prob else None, + bottom_right=f"len: {text_len}", + 
bottom_left=f"cl: {cluster}" if cluster else None, + center=f"{idx}", + ), + ) + draw_bbox_on_image(image_draw, bbox, color=color) def get_element_type_color(self, element_type: str) -> str: return self.color_map.get(element_type, "cyan") @@ -562,8 +538,10 @@ class AnalysisDrawer(AnalysisProcessor): def __init__( self, - filename: Union[str, Path], + filename: Optional[Union[str, Path]], + is_image: bool, save_dir: Union[str, Path], + file: Optional[BytesIO] = None, draw_caption: bool = True, draw_grid: bool = False, resize: Optional[float] = None, @@ -572,8 +550,10 @@ def __init__( self.draw_caption = draw_caption self.draw_grid = draw_grid self.resize = resize + self.is_image = is_image self.format = format self.drawers = [] + self.file = file super().__init__(filename, save_dir) @@ -649,7 +629,7 @@ def add_caption(self, image: Image.Image, caption: str): image.close() return expanded_image - def paste_images_on_grid(self, images: list[Image.Image]) -> Image.Image: + def paste_images_on_grid(self, images: List[Image.Image]) -> Image.Image: """Creates a single image that presents all the images on a grid 2 x n/2""" pairs = [] @@ -676,15 +656,34 @@ def paste_images_on_grid(self, images: list[Image.Image]) -> Image.Image: def load_source_image(self) -> Generator[Image.Image, None, None]: with tempfile.TemporaryDirectory() as temp_dir: - try: - image_paths = convert_pdf_to_image( - self.filename, - output_folder=temp_dir, - path_only=True, - ) - except: # noqa: E722 - # probably got an image instead of pdf - load it directly - image_paths = [self.filename] + image_paths = [] + if self.is_image: + if self.file: + try: + image = Image.open(self.file) + output_file = Path(temp_dir) / self.filename + image.save(output_file, format="PNG") + image_paths = [output_file] + except Exception as ex: # noqa: E722 + print( + f"Error while converting image to PNG for file {self.filename}, " + f"exception: {ex}" + ) + else: + image_paths = [self.filename] + else: + try: + 
image_paths = convert_pdf_to_image( + filename=self.filename, + file=self.file, + output_folder=temp_dir, + path_only=True, + ) + except Exception as ex: # noqa: E722 + print( + f"Error while converting pdf to image for file {self.filename}", + f"exception: {ex}", + ) for image_path in image_paths: with Image.open(image_path) as image: diff --git a/unstructured/partition/pdf_image/analysis/layout_dump.py b/unstructured/partition/pdf_image/analysis/layout_dump.py index 7b8ccdd581..c529a6c078 100644 --- a/unstructured/partition/pdf_image/analysis/layout_dump.py +++ b/unstructured/partition/pdf_image/analysis/layout_dump.py @@ -1,8 +1,10 @@ import json from abc import ABC, abstractmethod +from collections import defaultdict from pathlib import Path -from typing import Optional +from typing import List, Optional +from unstructured_inference.inference.elements import ImageTextRegion, TextRegion from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.models.base import get_model from unstructured_inference.models.detectron2onnx import ( @@ -13,7 +15,9 @@ ) from unstructured_inference.models.yolox import YOLOX_LABEL_MAP, UnstructuredYoloXModel +from unstructured.documents.elements import Element, Text from unstructured.partition.pdf_image.analysis.processor import AnalysisProcessor +from unstructured.partition.utils.sorting import coordinates_to_bbox class LayoutDumper(ABC): @@ -24,7 +28,7 @@ def dump(self) -> dict: """Transforms the results to a dict convertible structured formats like JSON or YAML""" -def extract_layout_info(layout: DocumentLayout) -> dict: +def extract_document_layout_info(layout: DocumentLayout) -> dict: pages = [] for page in layout.pages: @@ -46,7 +50,7 @@ def extract_layout_info(layout: DocumentLayout) -> dict: return {"pages": pages} -def object_detection_classes(model_name) -> list[str]: +def object_detection_classes(model_name) -> List[str]: model = get_model(model_name) if isinstance(model, 
UnstructuredYoloXModel): return list(YOLOX_LABEL_MAP.values()) @@ -62,7 +66,7 @@ class ObjectDetectionLayoutDumper(LayoutDumper): layout_source = "object_detection" def __init__(self, layout: DocumentLayout, model_name: Optional[str] = None): - self.layout: dict = extract_layout_info(layout) + self.layout: dict = extract_document_layout_info(layout) self.model_name = model_name def dump(self) -> dict: @@ -75,6 +79,113 @@ def dump(self) -> dict: return self.layout +def _get_info_from_extracted_page(page: List[TextRegion]) -> List[dict]: + elements = [] + for element in page: + is_image = isinstance(element, ImageTextRegion) + bbox = element.bbox + elements.append( + { + "bbox": [bbox.x1, bbox.y1, bbox.x2, bbox.y2], + "text": element.text, + "source": str(element.source.value), + "is_image": is_image, + } + ) + return elements + + +def extract_text_regions_info(layout: List[List[TextRegion]]) -> dict: + pages = [] + for page_num, page in enumerate(layout, 1): + elements = _get_info_from_extracted_page(page) + pages.append({"number": page_num, "elements": elements}) + return {"pages": pages} + + +class ExtractedLayoutDumper(LayoutDumper): + + layout_source = "pdfminer" + + def __init__(self, layout: List[List[TextRegion]]): + self.layout = extract_text_regions_info(layout) + + def dump(self) -> dict: + return self.layout + + +class OCRLayoutDumper(LayoutDumper): + + layout_source = "ocr" + + def __init__(self): + self.layout = [] + self.page_number = 1 + + def add_ocred_page(self, page: List[TextRegion]): + elements = _get_info_from_extracted_page(page) + self.layout.append({"number": self.page_number, "elements": elements}) + self.page_number += 1 + + def dump(self) -> dict: + return {"pages": self.layout} + + +def _extract_final_element_info(element: Element) -> dict: + element_type = ( + element.category if isinstance(element, Text) else str(element.__class__.__name__) + ) + element_prob = getattr(element.metadata, "detection_class_prob", None) + text = 
element.text + bbox_points = coordinates_to_bbox(element.metadata.coordinates) + cluster = getattr(element.metadata, "cluster", None) + return { + "type": element_type, + "prob": element_prob, + "text": text, + "bbox": bbox_points, + "cluster": cluster, + } + + +def _extract_final_element_page_size(element: Element) -> dict: + try: + return { + "width": element.metadata.coordinates.system.width, + "height": element.metadata.coordinates.system.height, + } + except AttributeError: + return { + "width": None, + "height": None, + } + + +class FinalLayoutDumper(LayoutDumper): + + layout_source = "final" + + def __init__(self, layout: List[Element]): + pages = defaultdict(list) + for element in layout: + element_page_number = element.metadata.page_number + pages[element_page_number].append(_extract_final_element_info(element)) + extracted_pages = [ + { + "number": page_number, + "size": ( + _extract_final_element_page_size(page_elements[0]) if page_elements else None + ), + "elements": page_elements, + } + for page_number, page_elements in pages.items() + ] + self.layout = {"pages": sorted(extracted_pages, key=lambda x: x["number"])} + + def dump(self) -> dict: + return self.layout + + class JsonLayoutDumper(AnalysisProcessor): """Dumps the results of the analysis to a JSON file""" diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index ed423921d6..a552e3d6b6 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -13,7 +13,7 @@ from unstructured.documents.elements import ElementType from unstructured.metrics.table.table_formats import SimpleTableCell -from unstructured.partition.pdf_image.analysis.bbox_visualisation import OCRLayoutDrawer +from unstructured.partition.pdf_image.analysis import OCRLayoutDumper from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text from unstructured.partition.utils.config import env_config from 
unstructured.partition.utils.constants import OCRMode @@ -36,7 +36,7 @@ def process_data_with_ocr( ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, - ocr_drawer: Optional[OCRLayoutDrawer] = None, + ocr_layout_dumper: Optional[OCRLayoutDumper] = None, ) -> "DocumentLayout": """ Process OCR data from a given data and supplement the output DocumentLayout @@ -62,6 +62,8 @@ def process_data_with_ocr( - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200. + - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout. + Returns: DocumentLayout: The merged layout information obtained after OCR processing. """ @@ -81,7 +83,7 @@ def process_data_with_ocr( ocr_languages=ocr_languages, ocr_mode=ocr_mode, pdf_image_dpi=pdf_image_dpi, - ocr_drawer=ocr_drawer, + ocr_layout_dumper=ocr_layout_dumper, ) return merged_layouts @@ -97,7 +99,7 @@ def process_file_with_ocr( ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, pdf_image_dpi: int = 200, - ocr_drawer: Optional[OCRLayoutDrawer] = None, + ocr_layout_dumper: Optional[OCRLayoutDumper] = None, ) -> "DocumentLayout": """ Process OCR data from a given file and supplement the output DocumentLayout @@ -144,7 +146,7 @@ def process_file_with_ocr( ocr_languages=ocr_languages, ocr_mode=ocr_mode, extracted_regions=extracted_regions, - ocr_drawer=ocr_drawer, + ocr_layout_dumper=ocr_layout_dumper, ) merged_page_layouts.append(merged_page_layout) return DocumentLayout.from_pages(merged_page_layouts) @@ -167,7 +169,7 @@ def process_file_with_ocr( ocr_languages=ocr_languages, ocr_mode=ocr_mode, extracted_regions=extracted_regions, - ocr_drawer=ocr_drawer, + ocr_layout_dumper=ocr_layout_dumper, ) merged_page_layouts.append(merged_page_layout) return DocumentLayout.from_pages(merged_page_layouts) @@ -186,7 +188,7 @@ def supplement_page_layout_with_ocr( ocr_languages: str = "eng", ocr_mode: str = 
OCRMode.FULL_PAGE.value, extracted_regions: Optional[List["TextRegion"]] = None, - ocr_drawer: Optional[OCRLayoutDrawer] = None, + ocr_layout_dumper: Optional[OCRLayoutDumper] = None, ) -> "PageLayout": """ Supplement an PageLayout with OCR results depending on OCR mode. @@ -199,8 +201,8 @@ def supplement_page_layout_with_ocr( ocr_agent = OCRAgent.get_agent(language=ocr_languages) if ocr_mode == OCRMode.FULL_PAGE.value: ocr_layout = ocr_agent.get_layout_from_image(image) - if ocr_drawer: - ocr_drawer.add_ocred_page(ocr_layout) + if ocr_layout_dumper: + ocr_layout_dumper.add_ocred_page(ocr_layout) page_layout.elements[:] = merge_out_layout_with_ocr_layout( out_layout=cast(List["LayoutElement"], page_layout.elements), ocr_layout=ocr_layout,