Skip to content

Commit

Permalink
feat: make analysis drawing more flexible (#3574)
Browse files Browse the repository at this point in the history
This PR changes the way the analysis tools can be used:
- by default if `analysis` is set to `True` in `partition_pdf` and the
strategy is resolved to `hi_res`:
- for each file, 4 layout dumps are produced and saved as JSON files
(`object_detection`, `extracted`, `ocr`, `final`), in a similar way to the
current `object_detection` dump
- the drawing functions/classes now accept these dumps instead of
instances of the internal classes (like `TextRegion`,
`DocumentLayout`)
- it makes it possible to use the lightweight JSON files to render the
bboxes of a given file after the partition is done
- `_partition_pdf_or_image_local` has been refactored and most of the
analysis code is now encapsulated in the `save_analysis_artifiacts` function
- to do this, helper function `render_bboxes_for_file` is added
<img width="338" alt="Screenshot 2024-08-28 at 14 37 56"
src="https://github.com/user-attachments/assets/10b6fbbd-7824-448d-8c11-52fc1b1b0dd0">
  • Loading branch information
pawel-kmiecik authored Sep 2, 2024
1 parent 04322d1 commit 404f780
Show file tree
Hide file tree
Showing 8 changed files with 510 additions and 207 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.15.10-dev0
## 0.15.10-dev1

### Enhancements
* **Modified analysis drawing tools to dump to files and draw from dumps.** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layouts for Object Detection, Pdfminer Extraction, OCR and the final layout will be dumped as JSON files. The drawers now accept dict (dump) objects instead of instances of internal classes.

### Features

Expand Down
54 changes: 44 additions & 10 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,33 +1336,67 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
assert ids == expected_ids, "Element IDs do not match expected IDs"


def test_analysis_artifacts_saved():
@pytest.mark.parametrize("is_path", [True, False])
@pytest.mark.parametrize(
("example_doc", "doc_pages"),
[
("pdf/layout-parser-paper-fast.pdf", 2),
("img/DA-1p.png", 1),
],
)
def test_analysis_artifacts_saved(is_path: bool, example_doc: str, doc_pages: int):
with tempfile.TemporaryDirectory() as temp_dir:
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
file = None
filename = example_doc_path(example_doc)
is_image = not Path(filename).suffix.endswith("pdf")
if not is_path:
file = open(filename, "rb") # noqa: SIM115
filename = None
pdf.partition_pdf(
filename=filename,
file=file,
is_image=is_image,
strategy=PartitionStrategy.HI_RES,
analysis=True,
analyzed_image_output_dir_path=temp_dir,
)

analysis_dir = Path(temp_dir)
layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
file_analysis_root = None
if is_path:
file_analysis_root = analysis_dir / "analysis" / Path(example_doc).stem
else:
# if file is not a path, the filename is None and the analysis directory
# for the document is generated
generated_file_stem_path = list((analysis_dir / "analysis").iterdir())[0]
if is_image:
assert "image" in generated_file_stem_path.name
else:
assert "pdf" in generated_file_stem_path.name
file_analysis_root = generated_file_stem_path
layout_dump_dir = file_analysis_root / "layout_dump"
assert layout_dump_dir.exists()
layout_dump_files = list(layout_dump_dir.iterdir())
assert len(layout_dump_files) == 1
assert (layout_dump_dir / "object_detection.json").exists()

bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
expected_layout_dumps = ["object_detection", "ocr", "pdfminer", "final"]
assert len(layout_dump_files) == len(expected_layout_dumps)

for expected_layout_dump in expected_layout_dumps:
assert (layout_dump_dir / f"{expected_layout_dump}.json").exists()

bboxes_dir = file_analysis_root / "bboxes"
assert bboxes_dir.exists()
bboxes_files = list(bboxes_dir.iterdir())
assert len(bboxes_files) == 2 * 4 # 2 pages * 4 different layouts per page

expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
expected_pages = [1, 2]
for el in expected_layouts:
expected_renders = ["od_model", "ocr", "pdfminer", "final"]
assert len(bboxes_files) == doc_pages * len(expected_renders)

expected_pages = range(1, doc_pages + 1)
for el in expected_renders:
for page in expected_pages:
assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files
if file:
file.close()


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.10-dev0" # pragma: no cover
__version__ = "0.15.10-dev1" # pragma: no cover
108 changes: 46 additions & 62 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,12 @@
prepare_languages_for_tesseract,
tesseract_to_paddle_language,
)
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
AnalysisDrawer,
FinalLayoutDrawer,
OCRLayoutDrawer,
ODModelLayoutDrawer,
PdfminerLayoutDrawer,
)
from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
from unstructured.partition.pdf_image.analysis.layout_dump import (
JsonLayoutDumper,
ExtractedLayoutDumper,
FinalLayoutDumper,
ObjectDetectionLayoutDumper,
OCRLayoutDumper,
)
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
from unstructured.partition.pdf_image.pdf_image_utils import (
Expand Down Expand Up @@ -589,12 +585,12 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
od_model_drawer: Optional[ODModelLayoutDrawer] = None
ocr_drawer: Optional[OCRLayoutDrawer] = None
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
ocr_layout_dumper: Optional[OCRLayoutDumper] = None
final_layout_dumper: Optional[FinalLayoutDumper] = None

skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP

if file is None:
inferred_document_layout = process_file_with_model(
Expand Down Expand Up @@ -624,19 +620,15 @@ def _partition_pdf_or_image_local(
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
if not skip_bboxes:
pdfminer_drawer = PdfminerLayoutDrawer(
layout=extracted_layout,
)
od_model_drawer = ODModelLayoutDrawer(
layout=inferred_document_layout,
)
ocr_drawer = OCRLayoutDrawer()
if not skip_dump_od:
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
Expand All @@ -653,7 +645,7 @@ def _partition_pdf_or_image_local(
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_drawer=ocr_drawer,
ocr_layout_dumper=ocr_layout_dumper,
)
else:
inferred_document_layout = process_data_with_model(
Expand Down Expand Up @@ -685,14 +677,15 @@ def _partition_pdf_or_image_local(
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
pdfminer_drawer = PdfminerLayoutDrawer(
layout=extracted_layout,
)
od_model_drawer = ODModelLayoutDrawer(
layout=inferred_document_layout,
)
ocr_drawer = OCRLayoutDrawer()
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
Expand All @@ -712,7 +705,7 @@ def _partition_pdf_or_image_local(
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_drawer=ocr_drawer,
ocr_layout_dumper=ocr_layout_dumper,
)

# NOTE(alan): starting with v2, chipper sorts the elements itself.
Expand Down Expand Up @@ -801,38 +794,29 @@ def _partition_pdf_or_image_local(
)
out_elements.extend(forms)

if analysis and not skip_bboxes:
final_drawer = FinalLayoutDrawer(
layout=out_elements,
)
analysis_drawer = AnalysisDrawer(
filename=filename,
save_dir=analyzed_image_output_dir_path,
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
resize=env_config.ANALYSIS_BBOX_RESIZE,
format=env_config.ANALYSIS_BBOX_FORMAT,
)

if od_model_drawer:
analysis_drawer.add_drawer(od_model_drawer)

if pdfminer_drawer:
analysis_drawer.add_drawer(pdfminer_drawer)

if ocr_drawer:
analysis_drawer.add_drawer(ocr_drawer)
analysis_drawer.add_drawer(final_drawer)
analysis_drawer.process()

if analysis and not skip_dump_od:
json_layout_dumper = JsonLayoutDumper(
if analysis:
if not skip_analysis_dump:
final_layout_dumper = FinalLayoutDumper(
layout=out_elements,
)
layout_dumpers = []
if od_model_layout_dumper:
layout_dumpers.append(od_model_layout_dumper)
if extracted_layout_dumper:
layout_dumpers.append(extracted_layout_dumper)
if ocr_layout_dumper:
layout_dumpers.append(ocr_layout_dumper)
if final_layout_dumper:
layout_dumpers.append(final_layout_dumper)
save_analysis_artifiacts(
*layout_dumpers,
filename=filename,
save_dir=analyzed_image_output_dir_path,
file=file,
is_image=is_image,
analyzed_image_output_dir_path=analyzed_image_output_dir_path,
skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
)
if od_model_layout_dumper:
json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
json_layout_dumper.process()

return out_elements

Expand Down
Loading

0 comments on commit 404f780

Please sign in to comment.