feat: make analysis drawing more flexible (#3574)

This PR changes the way the analysis tools can be used: - by default, if `analysis` is set to `True` in `partition_pdf` and the strategy is resolved to `hi_res`: - for each file, 4 layout dumps are produced and saved as JSON files (`object_detection`, `extracted`, `ocr`, `final`) - in a similar way to the current `object_detection` dump - the drawing functions/classes now accept these dumps instead of instances of the internal classes (like `TextRegion`, `DocumentLayout`) - this makes it possible to use the lightweight JSON files to render the bboxes of a given file after the partition is done - `_partition_pdf_or_image_local` has been refactored and most of the analysis code is now encapsulated in the `save_analysis_artifiacts` function - to do this, a helper function `render_bboxes_for_file` is added <img width="338" alt="Screenshot 2024-08-28 at 14 37 56" src="https://github.com/user-attachments/assets/10b6fbbd-7824-448d-8c11-52fc1b1b0dd0">
Unstructured-IO · Sep 2, 2024 · 404f780 · 404f780
1 parent 04322d1
commit 404f780
Show file tree

Hide file tree

Showing 8 changed files with 510 additions and 207 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.15.10-dev0
+## 0.15.10-dev1
 
 ### Enhancements
+* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
 
 ### Features
 

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1336,33 +1336,67 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
     assert ids == expected_ids, "Element IDs do not match expected IDs"
 
 
-def test_analysis_artifacts_saved():
+@pytest.mark.parametrize("is_path", [True, False])
+@pytest.mark.parametrize(
+    ("example_doc", "doc_pages"),
+    [
+        ("pdf/layout-parser-paper-fast.pdf", 2),
+        ("img/DA-1p.png", 1),
+    ],
+)
+def test_analysis_artifacts_saved(is_path: bool, example_doc: str, doc_pages: int):
     with tempfile.TemporaryDirectory() as temp_dir:
-        filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+        file = None
+        filename = example_doc_path(example_doc)
+        is_image = not Path(filename).suffix.endswith("pdf")
+        if not is_path:
+            file = open(filename, "rb")  # noqa: SIM115
+            filename = None
         pdf.partition_pdf(
             filename=filename,
+            file=file,
+            is_image=is_image,
             strategy=PartitionStrategy.HI_RES,
             analysis=True,
             analyzed_image_output_dir_path=temp_dir,
         )
 
         analysis_dir = Path(temp_dir)
-        layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
+        file_analysis_root = None
+        if is_path:
+            file_analysis_root = analysis_dir / "analysis" / Path(example_doc).stem
+        else:
+            # if file is not a path, the filename is None and the analysis directory
+            # for the document is generated
+            generated_file_stem_path = list((analysis_dir / "analysis").iterdir())[0]
+            if is_image:
+                assert "image" in generated_file_stem_path.name
+            else:
+                assert "pdf" in generated_file_stem_path.name
+            file_analysis_root = generated_file_stem_path
+        layout_dump_dir = file_analysis_root / "layout_dump"
         assert layout_dump_dir.exists()
         layout_dump_files = list(layout_dump_dir.iterdir())
-        assert len(layout_dump_files) == 1
-        assert (layout_dump_dir / "object_detection.json").exists()
 
-        bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
+        expected_layout_dumps = ["object_detection", "ocr", "pdfminer", "final"]
+        assert len(layout_dump_files) == len(expected_layout_dumps)
+
+        for expected_layout_dump in expected_layout_dumps:
+            assert (layout_dump_dir / f"{expected_layout_dump}.json").exists()
+
+        bboxes_dir = file_analysis_root / "bboxes"
         assert bboxes_dir.exists()
         bboxes_files = list(bboxes_dir.iterdir())
-        assert len(bboxes_files) == 2 * 4  # 2 pages * 4 different layouts per page
 
-        expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
-        expected_pages = [1, 2]
-        for el in expected_layouts:
+        expected_renders = ["od_model", "ocr", "pdfminer", "final"]
+        assert len(bboxes_files) == doc_pages * len(expected_renders)
+
+        expected_pages = range(1, doc_pages + 1)
+        for el in expected_renders:
             for page in expected_pages:
                 assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files
+        if file:
+            file.close()
 
 
 @pytest.mark.parametrize(

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.10-dev0"  # pragma: no cover
+__version__ = "0.15.10-dev1"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -53,16 +53,12 @@
     prepare_languages_for_tesseract,
     tesseract_to_paddle_language,
 )
-from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
-    AnalysisDrawer,
-    FinalLayoutDrawer,
-    OCRLayoutDrawer,
-    ODModelLayoutDrawer,
-    PdfminerLayoutDrawer,
-)
+from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
 from unstructured.partition.pdf_image.analysis.layout_dump import (
-    JsonLayoutDumper,
+    ExtractedLayoutDumper,
+    FinalLayoutDumper,
     ObjectDetectionLayoutDumper,
+    OCRLayoutDumper,
 )
 from unstructured.partition.pdf_image.form_extraction import run_form_extraction
 from unstructured.partition.pdf_image.pdf_image_utils import (
@@ -589,12 +585,12 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )
 
-    pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
-    od_model_drawer: Optional[ODModelLayoutDrawer] = None
-    ocr_drawer: Optional[OCRLayoutDrawer] = None
     od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
-    skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
-    skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
+    extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
+    ocr_layout_dumper: Optional[OCRLayoutDumper] = None
+    final_layout_dumper: Optional[FinalLayoutDumper] = None
+
+    skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP
 
     if file is None:
         inferred_document_layout = process_file_with_model(
@@ -624,19 +620,15 @@ def _partition_pdf_or_image_local(
                     else:
                         analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
                 os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                if not skip_bboxes:
-                    pdfminer_drawer = PdfminerLayoutDrawer(
-                        layout=extracted_layout,
-                    )
-                    od_model_drawer = ODModelLayoutDrawer(
-                        layout=inferred_document_layout,
-                    )
-                    ocr_drawer = OCRLayoutDrawer()
-                if not skip_dump_od:
+                if not skip_analysis_dump:
                     od_model_layout_dumper = ObjectDetectionLayoutDumper(
                         layout=inferred_document_layout,
                         model_name=hi_res_model_name,
                     )
+                    extracted_layout_dumper = ExtractedLayoutDumper(
+                        layout=extracted_layout,
+                    )
+                    ocr_layout_dumper = OCRLayoutDumper()
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
@@ -653,7 +645,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
-                ocr_drawer=ocr_drawer,
+                ocr_layout_dumper=ocr_layout_dumper,
             )
     else:
         inferred_document_layout = process_data_with_model(
@@ -685,14 +677,15 @@ def _partition_pdf_or_image_local(
                         )
                     else:
                         analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                pdfminer_drawer = PdfminerLayoutDrawer(
-                    layout=extracted_layout,
-                )
-                od_model_drawer = ODModelLayoutDrawer(
-                    layout=inferred_document_layout,
-                )
-                ocr_drawer = OCRLayoutDrawer()
+                if not skip_analysis_dump:
+                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                        layout=inferred_document_layout,
+                        model_name=hi_res_model_name,
+                    )
+                    extracted_layout_dumper = ExtractedLayoutDumper(
+                        layout=extracted_layout,
+                    )
+                    ocr_layout_dumper = OCRLayoutDumper()
 
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
@@ -712,7 +705,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
-                ocr_drawer=ocr_drawer,
+                ocr_layout_dumper=ocr_layout_dumper,
             )
 
     # NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -801,38 +794,29 @@ def _partition_pdf_or_image_local(
         )
         out_elements.extend(forms)
 
-    if analysis and not skip_bboxes:
-        final_drawer = FinalLayoutDrawer(
-            layout=out_elements,
-        )
-        analysis_drawer = AnalysisDrawer(
-            filename=filename,
-            save_dir=analyzed_image_output_dir_path,
-            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
-            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
-            resize=env_config.ANALYSIS_BBOX_RESIZE,
-            format=env_config.ANALYSIS_BBOX_FORMAT,
-        )
-
-        if od_model_drawer:
-            analysis_drawer.add_drawer(od_model_drawer)
-
-        if pdfminer_drawer:
-            analysis_drawer.add_drawer(pdfminer_drawer)
-
-        if ocr_drawer:
-            analysis_drawer.add_drawer(ocr_drawer)
-        analysis_drawer.add_drawer(final_drawer)
-        analysis_drawer.process()
-
-    if analysis and not skip_dump_od:
-        json_layout_dumper = JsonLayoutDumper(
+    if analysis:
+        if not skip_analysis_dump:
+            final_layout_dumper = FinalLayoutDumper(
+                layout=out_elements,
+            )
+        layout_dumpers = []
+        if od_model_layout_dumper:
+            layout_dumpers.append(od_model_layout_dumper)
+        if extracted_layout_dumper:
+            layout_dumpers.append(extracted_layout_dumper)
+        if ocr_layout_dumper:
+            layout_dumpers.append(ocr_layout_dumper)
+        if final_layout_dumper:
+            layout_dumpers.append(final_layout_dumper)
+        save_analysis_artifiacts(
+            *layout_dumpers,
             filename=filename,
-            save_dir=analyzed_image_output_dir_path,
+            file=file,
+            is_image=is_image,
+            analyzed_image_output_dir_path=analyzed_image_output_dir_path,
+            skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
+            skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
         )
-        if od_model_layout_dumper:
-            json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
-        json_layout_dumper.process()
 
     return out_elements
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.15.10-dev0" # pragma: no cover
		__version__ = "0.15.10-dev1" # pragma: no cover