Skip to content

Commit

Permalink
fix: expose drawing options as function params rather than env config (
Browse files Browse the repository at this point in the history
…#3598)

This PR:
- changes the interface of the analysis tools to expose drawing params as
function parameters rather than reading them from env_config (i.e. environment variables)
- restructures analysis package
  • Loading branch information
pawel-kmiecik authored Sep 5, 2024
1 parent acd070c commit f25eb60
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 176 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.10-dev3
## 0.15.10-dev4

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.10-dev3" # pragma: no cover
__version__ = "0.15.10-dev4" # pragma: no cover
6 changes: 5 additions & 1 deletion unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,13 @@
prepare_languages_for_tesseract,
tesseract_to_paddle_language,
)
from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
from unstructured.partition.pdf_image.analysis.layout_dump import (
ExtractedLayoutDumper,
FinalLayoutDumper,
ObjectDetectionLayoutDumper,
OCRLayoutDumper,
)
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
from unstructured.partition.pdf_image.pdf_image_utils import (
check_element_types_to_extract,
Expand Down Expand Up @@ -816,6 +816,10 @@ def _partition_pdf_or_image_local(
analyzed_image_output_dir_path=analyzed_image_output_dir_path,
skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
resize=env_config.ANALYSIS_BBOX_RESIZE,
format=env_config.ANALYSIS_BBOX_FORMAT,
)

return out_elements
Expand Down
172 changes: 0 additions & 172 deletions unstructured/partition/pdf_image/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,172 +0,0 @@
import json
import uuid
from io import BytesIO
from pathlib import Path
from typing import Optional

from unstructured import env_config
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
AnalysisDrawer,
FinalLayoutDrawer,
LayoutDrawer,
OCRLayoutDrawer,
ODModelLayoutDrawer,
PdfminerLayoutDrawer,
)
from unstructured.partition.pdf_image.analysis.layout_dump import (
ExtractedLayoutDumper,
FinalLayoutDumper,
JsonLayoutDumper,
LayoutDumper,
ObjectDetectionLayoutDumper,
OCRLayoutDumper,
)


def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]:
    """Build the layout drawer that matches the type of the given layout dumper.

    The drawer is initialized with the layout dict returned by ``dumper.dump()``.

    Args:
        dumper: The layout dumper instance

    Returns:
        LayoutDrawer: The drawer instance corresponding to the dumper type

    Raises:
        ValueError: If the dumper is none of the supported dumper types
    """
    # Pairs are checked in order; the first dumper type that matches decides
    # which drawer class gets instantiated.
    drawer_by_dumper = (
        (ObjectDetectionLayoutDumper, ODModelLayoutDrawer),
        (ExtractedLayoutDumper, PdfminerLayoutDrawer),
        (OCRLayoutDumper, OCRLayoutDrawer),
        (FinalLayoutDumper, FinalLayoutDrawer),
    )
    for dumper_cls, drawer_cls in drawer_by_dumper:
        if isinstance(dumper, dumper_cls):
            return drawer_cls(layout_dump=dumper.dump())
    raise ValueError(f"Unknown dumper type: {dumper}")


def _generate_filename(is_image: bool):
    """Create a placeholder filename for the analysis artifacts.

    The name starts with ``image_`` or ``pdf_`` depending on the file type,
    followed by the first five hex characters of a random uuid and the
    matching extension (``.png`` for images, ``.pdf`` otherwise).
    """
    suffix = uuid.uuid4().hex[:5]
    return f"image_{suffix}.png" if is_image else f"pdf_{suffix}.pdf"


def save_analysis_artifiacts(
    *layout_dumpers: LayoutDumper,
    is_image: bool,
    analyzed_image_output_dir_path: str,
    filename: Optional[str] = None,
    file: Optional[BytesIO] = None,
    skip_bboxes: bool = False,
    skip_dump_od: bool = False,
):
    """Save the analysis artifacts for a given file.

    The drawing options (grid, caption, resize, format) are read from the
    environment configuration (``env_config.ANALYSIS_BBOX_*``).

    Args:
        layout_dumpers: The layout dumpers to save and use for bboxes rendering
        is_image: Flag for the file type (pdf/image)
        analyzed_image_output_dir_path: The directory to save the analysis artifacts
        filename: The filename of the sources analyzed file (pdf/image).
            When not provided, a random placeholder name is generated.
        file: The file object for the analyzed file, forwarded to the AnalysisDrawer.
        skip_bboxes: When True, the bboxes rendering step (AnalysisDrawer) is skipped.
        skip_dump_od: When True, the layout dump step (JsonLayoutDumper) is skipped.
    """
    # Return early only when BOTH steps are disabled. The previous `or` made
    # either flag suppress both outputs and left the per-step guards below dead.
    if skip_bboxes and skip_dump_od:
        return

    if not filename:
        filename = _generate_filename(is_image)

    output_path = Path(analyzed_image_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)

    if not skip_dump_od:
        json_layout_dumper = JsonLayoutDumper(
            filename=filename,
            save_dir=output_path,
        )
        for layout_dumper in layout_dumpers:
            json_layout_dumper.add_layout_dumper(layout_dumper)
        json_layout_dumper.process()

    if not skip_bboxes:
        analysis_drawer = AnalysisDrawer(
            filename=filename,
            file=file,
            is_image=is_image,
            save_dir=output_path,
            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
            resize=env_config.ANALYSIS_BBOX_RESIZE,
            format=env_config.ANALYSIS_BBOX_FORMAT,
        )
        # Each dumper contributes one drawer built from its dumped layout.
        for layout_dumper in layout_dumpers:
            drawer = _get_drawer_for_dumper(layout_dumper)
            analysis_drawer.add_drawer(drawer)
        analysis_drawer.process()


def render_bboxes_for_file(
    filename: str,
    analyzed_image_output_dir_path: str,
    renders_output_dir_path: Optional[str] = None,
):
    """Render the bounding boxes for a given layout dump file.

    To be used for analysis after the partition is performed for
    only dumping the layouts - the bboxes can be rendered later.
    Expects that the analyzed_image_output_dir_path keeps the structure
    that was created by the save_analysis_artifiacts function, i.e. the dumps
    are looked up in ``<root>/analysis/<filename stem>/layout_dump``.

    The drawing options (grid, caption, resize, format) are read from the
    environment configuration (``env_config.ANALYSIS_BBOX_*``).

    Returns without doing anything when the layout dump directory does not
    exist or contains no recognized dump file.

    Args:
        filename: The filename of the sources analyzed file (pdf/image)
        analyzed_image_output_dir_path: The directory where the analysis artifacts
            (layout dumps) are saved. It should be the root directory of the structure
            created by the save_analysis_artifiacts function.
        renders_output_dir_path: Optional directory to save the rendered bboxes -
            if not provided, it will be saved in the analysis directory
            (``<root>/analysis/<filename stem>/bboxes``).
    """
    source_path = Path(filename)
    filename_stem = source_path.stem
    # Case-insensitive so that e.g. "scan.PDF" is not mistaken for an image.
    is_image = not source_path.suffix.lower().endswith("pdf")
    analysis_root = Path(analyzed_image_output_dir_path) / "analysis" / filename_stem
    analysis_dumps_dir = analysis_root / "layout_dump"
    if not analysis_dumps_dir.exists():
        return

    layout_drawers = []
    for analysis_dump_filename in analysis_dumps_dir.iterdir():
        if not analysis_dump_filename.is_file():
            continue
        with open(analysis_dump_filename) as f:
            layout_dump = json.load(f)
        # The stem of the dump file identifies the layout source and therefore
        # the drawer to use; files with any other stem are ignored.
        dump_kind = analysis_dump_filename.stem
        if dump_kind == "final":
            layout_drawers.append(FinalLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "object_detection":
            layout_drawers.append(ODModelLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "ocr":
            layout_drawers.append(OCRLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "pdfminer":
            layout_drawers.append(PdfminerLayoutDrawer(layout_dump=layout_dump))

    if not layout_drawers:
        return

    if renders_output_dir_path:
        output_path = Path(renders_output_dir_path)
    else:
        output_path = analysis_root / "bboxes"
    output_path.mkdir(parents=True, exist_ok=True)

    analysis_drawer = AnalysisDrawer(
        filename=filename,
        save_dir=output_path,
        is_image=is_image,
        draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
        draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
        resize=env_config.ANALYSIS_BBOX_RESIZE,
        format=env_config.ANALYSIS_BBOX_FORMAT,
    )
    for drawer in layout_drawers:
        analysis_drawer.add_drawer(drawer)
    analysis_drawer.process()
190 changes: 190 additions & 0 deletions unstructured/partition/pdf_image/analysis/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
import json
import uuid
from io import BytesIO
from pathlib import Path
from typing import Optional

from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
AnalysisDrawer,
FinalLayoutDrawer,
LayoutDrawer,
OCRLayoutDrawer,
ODModelLayoutDrawer,
PdfminerLayoutDrawer,
)
from unstructured.partition.pdf_image.analysis.layout_dump import (
ExtractedLayoutDumper,
FinalLayoutDumper,
JsonLayoutDumper,
LayoutDumper,
ObjectDetectionLayoutDumper,
OCRLayoutDumper,
)


def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]:
    """Return the layout drawer counterpart of a layout dumper.

    The drawer class is selected from the dumper's type and instantiated with
    the layout dict produced by ``dumper.dump()``.

    Args:
        dumper: The layout dumper instance

    Returns:
        LayoutDrawer: The corresponding layout drawer instance

    Raises:
        ValueError: If the dumper type has no known drawer
    """
    # First pick the drawer class, then construct it once at the end.
    if isinstance(dumper, ObjectDetectionLayoutDumper):
        drawer_cls = ODModelLayoutDrawer
    elif isinstance(dumper, ExtractedLayoutDumper):
        drawer_cls = PdfminerLayoutDrawer
    elif isinstance(dumper, OCRLayoutDumper):
        drawer_cls = OCRLayoutDrawer
    elif isinstance(dumper, FinalLayoutDumper):
        drawer_cls = FinalLayoutDrawer
    else:
        raise ValueError(f"Unknown dumper type: {dumper}")
    return drawer_cls(layout_dump=dumper.dump())


def _generate_filename(is_image: bool):
    """Produce a fallback filename for the analysis artifacts.

    A five-character slice of a random uuid hex string keeps generated names
    distinct; the prefix and extension reflect whether the file is an image
    or a pdf.
    """
    random_part = uuid.uuid4().hex[:5]
    if not is_image:
        return f"pdf_{random_part}.pdf"
    return f"image_{random_part}.png"


def save_analysis_artifiacts(
    *layout_dumpers: LayoutDumper,
    is_image: bool,
    analyzed_image_output_dir_path: str,
    filename: Optional[str] = None,
    file: Optional[BytesIO] = None,
    skip_bboxes: bool = False,
    skip_dump_od: bool = False,
    draw_grid: bool = False,
    draw_caption: bool = True,
    resize: Optional[float] = None,
    format: str = "png",
):
    """Save the analysis artifacts for a given file.

    All drawing options are taken from the function parameters; nothing is
    read from the environment configuration here.

    Args:
        layout_dumpers: The layout dumpers to save and use for bboxes rendering
        is_image: Flag for the file type (pdf/image)
        analyzed_image_output_dir_path: The directory to save the analysis artifacts
        filename: The filename of the sources analyzed file (pdf/image).
            Only one of filename or file should be provided. When not provided,
            a random placeholder name is generated.
        file: The file object for the analyzed file.
            Only one of filename or file should be provided.
        skip_bboxes: When True, the bboxes rendering step (AnalysisDrawer) is skipped.
        skip_dump_od: When True, the layout dump step (JsonLayoutDumper) is skipped.
        draw_grid: Flag for drawing the analysis bboxes on a single image (as grid)
        draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
        resize: Output image resize value. If not provided, the image will not be resized.
        format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
    """
    # Return early only when BOTH steps are disabled. The previous `or` made
    # either flag suppress both outputs and left the per-step guards below dead.
    if skip_bboxes and skip_dump_od:
        return

    if not filename:
        filename = _generate_filename(is_image)

    output_path = Path(analyzed_image_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)

    if not skip_dump_od:
        json_layout_dumper = JsonLayoutDumper(
            filename=filename,
            save_dir=output_path,
        )
        for layout_dumper in layout_dumpers:
            json_layout_dumper.add_layout_dumper(layout_dumper)
        json_layout_dumper.process()

    if not skip_bboxes:
        analysis_drawer = AnalysisDrawer(
            filename=filename,
            file=file,
            is_image=is_image,
            save_dir=output_path,
            draw_grid=draw_grid,
            draw_caption=draw_caption,
            resize=resize,
            format=format,
        )
        # Each dumper contributes one drawer built from its dumped layout.
        for layout_dumper in layout_dumpers:
            drawer = _get_drawer_for_dumper(layout_dumper)
            analysis_drawer.add_drawer(drawer)
        analysis_drawer.process()


def render_bboxes_for_file(
    filename: str,
    analyzed_image_output_dir_path: str,
    renders_output_dir_path: Optional[str] = None,
    draw_grid: bool = False,
    draw_caption: bool = True,
    resize: Optional[float] = None,
    format: str = "png",
):
    """Render the bounding boxes for a given layout dump file.

    To be used for analysis after the partition is performed for
    only dumping the layouts - the bboxes can be rendered later.
    Expects that the analyzed_image_output_dir_path keeps the structure
    that was created by the save_analysis_artifiacts function, i.e. the dumps
    are looked up in ``<root>/analysis/<filename stem>/layout_dump``.

    Returns without doing anything when the layout dump directory does not
    exist or contains no recognized dump file.

    Args:
        filename: The filename of the sources analyzed file (pdf/image)
        analyzed_image_output_dir_path: The directory where the analysis artifacts
            (layout dumps) are saved. It should be the root directory of the structure
            created by the save_analysis_artifiacts function.
        renders_output_dir_path: Optional directory to save the rendered bboxes -
            if not provided, it will be saved in the analysis directory
            (``<root>/analysis/<filename stem>/bboxes``).
        draw_grid: Flag for drawing the analysis bboxes on a single image (as grid)
        draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
        resize: Output image resize value. If not provided, the image will not be resized.
        format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
    """
    source_path = Path(filename)
    filename_stem = source_path.stem
    # Case-insensitive so that e.g. "scan.PDF" is not mistaken for an image.
    is_image = not source_path.suffix.lower().endswith("pdf")
    analysis_root = Path(analyzed_image_output_dir_path) / "analysis" / filename_stem
    analysis_dumps_dir = analysis_root / "layout_dump"
    if not analysis_dumps_dir.exists():
        return

    layout_drawers = []
    for analysis_dump_filename in analysis_dumps_dir.iterdir():
        if not analysis_dump_filename.is_file():
            continue
        with open(analysis_dump_filename) as f:
            layout_dump = json.load(f)
        # The stem of the dump file identifies the layout source and therefore
        # the drawer to use; files with any other stem are ignored.
        dump_kind = analysis_dump_filename.stem
        if dump_kind == "final":
            layout_drawers.append(FinalLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "object_detection":
            layout_drawers.append(ODModelLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "ocr":
            layout_drawers.append(OCRLayoutDrawer(layout_dump=layout_dump))
        elif dump_kind == "pdfminer":
            layout_drawers.append(PdfminerLayoutDrawer(layout_dump=layout_dump))

    if not layout_drawers:
        return

    if renders_output_dir_path:
        output_path = Path(renders_output_dir_path)
    else:
        output_path = analysis_root / "bboxes"
    output_path.mkdir(parents=True, exist_ok=True)

    analysis_drawer = AnalysisDrawer(
        filename=filename,
        save_dir=output_path,
        is_image=is_image,
        draw_grid=draw_grid,
        draw_caption=draw_caption,
        resize=resize,
        format=format,
    )
    for drawer in layout_drawers:
        analysis_drawer.add_drawer(drawer)
    analysis_drawer.process()
Loading

0 comments on commit f25eb60

Please sign in to comment.