refactor: restructure PDF/Image example document organization (#3410)

This PR aims to improve the organization and readability of our example documents used in unit tests, specifically focusing on PDF and image files.

### Summary

- Created two new subdirectories in the `example-docs` folder:
  - `pdf/`: for all PDF example files
  - `img/`: for all image example files
- Moved relevant PDF files from `example-docs/` to `example-docs/pdf/`
- Moved relevant image files from `example-docs/` to `example-docs/img/`
- Updated file paths in affected unit & ingest tests to reflect the new directory structure

### Testing

All unit & ingest tests should be updated and verified to work with the new file structure.

## Notes

Other file types (e.g., office documents, HTML files) remain in the root of `example-docs/` for now.

## Next Steps

Consider similar reorganization for other file types if this structure proves to be beneficial.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
Unstructured-IO · Jul 18, 2024 · 0eb461a · 0eb461a
1 parent 5d38703
commit 0eb461a
Show file tree

Hide file tree

Showing 80 changed files with 206 additions and 217 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,7 +15,6 @@
 ### Fixes
 
 * **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`.
-
 * **Move Astra embedded_dimension to write config**
 
 ## 0.14.10

diff --git a/example-docs/DA-1p.heic → example-docs/img/DA-1p.heic b/example-docs/DA-1p.heic → example-docs/img/DA-1p.heic
diff --git a/example-docs/DA-1p.jpg → example-docs/img/DA-1p.jpg b/example-docs/DA-1p.jpg → example-docs/img/DA-1p.jpg
diff --git a/example-docs/DA-1p.png → example-docs/img/DA-1p.png b/example-docs/DA-1p.png → example-docs/img/DA-1p.png
diff --git a/example-docs/bmp_24.bmp → example-docs/img/bmp_24.bmp b/example-docs/bmp_24.bmp → example-docs/img/bmp_24.bmp
diff --git a/example-docs/chi_sim_image.jpeg → example-docs/img/chi_sim_image.jpeg b/example-docs/chi_sim_image.jpeg → example-docs/img/chi_sim_image.jpeg
diff --git a/example-docs/double-column-A.jpg → example-docs/img/double-column-A.jpg b/example-docs/double-column-A.jpg → example-docs/img/double-column-A.jpg
diff --git a/example-docs/double-column-B.jpg → example-docs/img/double-column-B.jpg b/example-docs/double-column-B.jpg → example-docs/img/double-column-B.jpg
diff --git a/example-docs/embedded-images-tables.jpg → example-docs/img/embedded-images-tables.jpg b/example-docs/embedded-images-tables.jpg → example-docs/img/embedded-images-tables.jpg
diff --git a/example-docs/english-and-korean.png → example-docs/img/english-and-korean.png b/example-docs/english-and-korean.png → example-docs/img/english-and-korean.png
diff --git a/example-docs/example.jpg → example-docs/img/example.jpg b/example-docs/example.jpg → example-docs/img/example.jpg
diff --git a/example-docs/jpn-vert.jpeg → example-docs/img/jpn-vert.jpeg b/example-docs/jpn-vert.jpeg → example-docs/img/jpn-vert.jpeg
diff --git a/example-docs/layout-parser-paper-10p.jpg → example-docs/img/layout-parser-paper-10p.jpg b/example-docs/layout-parser-paper-10p.jpg → example-docs/img/layout-parser-paper-10p.jpg
diff --git a/...le-docs/layout-parser-paper-combined.tiff → ...ocs/img/layout-parser-paper-combined.tiff b/...le-docs/layout-parser-paper-combined.tiff → ...ocs/img/layout-parser-paper-combined.tiff
diff --git a/example-docs/layout-parser-paper-fast.jpg → ...ple-docs/img/layout-parser-paper-fast.jpg b/example-docs/layout-parser-paper-fast.jpg → ...ple-docs/img/layout-parser-paper-fast.jpg
diff --git a/example-docs/layout-parser-paper-fast.tiff → ...le-docs/img/layout-parser-paper-fast.tiff b/example-docs/layout-parser-paper-fast.tiff → ...le-docs/img/layout-parser-paper-fast.tiff
diff --git a/...e-docs/layout-parser-paper-with-table.jpg → ...cs/img/layout-parser-paper-with-table.jpg b/...e-docs/layout-parser-paper-with-table.jpg → ...cs/img/layout-parser-paper-with-table.jpg
diff --git a/...ple-docs/table-multi-row-column-cells.png → ...docs/img/table-multi-row-column-cells.png b/...ple-docs/table-multi-row-column-cells.png → ...docs/img/table-multi-row-column-cells.png
diff --git a/example-docs/DA-1p.pdf → example-docs/pdf/DA-1p.pdf b/example-docs/DA-1p.pdf → example-docs/pdf/DA-1p.pdf
diff --git a/example-docs/DA-619p.pdf → example-docs/pdf/DA-619p.pdf b/example-docs/DA-619p.pdf → example-docs/pdf/DA-619p.pdf
diff --git a/example-docs/a1977-backus-p21.pdf → example-docs/pdf/a1977-backus-p21.pdf b/example-docs/a1977-backus-p21.pdf → example-docs/pdf/a1977-backus-p21.pdf
diff --git a/example-docs/all-number-table.pdf → example-docs/pdf/all-number-table.pdf b/example-docs/all-number-table.pdf → example-docs/pdf/all-number-table.pdf
diff --git a/example-docs/chevron-page.pdf → example-docs/pdf/chevron-page.pdf b/example-docs/chevron-page.pdf → example-docs/pdf/chevron-page.pdf
diff --git a/example-docs/copy-protected.pdf → example-docs/pdf/copy-protected.pdf b/example-docs/copy-protected.pdf → example-docs/pdf/copy-protected.pdf
diff --git a/example-docs/embedded-images-tables.pdf → example-docs/pdf/embedded-images-tables.pdf b/example-docs/embedded-images-tables.pdf → example-docs/pdf/embedded-images-tables.pdf
diff --git a/example-docs/embedded-images.pdf → example-docs/pdf/embedded-images.pdf b/example-docs/embedded-images.pdf → example-docs/pdf/embedded-images.pdf
diff --git a/example-docs/embedded-link.pdf → example-docs/pdf/embedded-link.pdf b/example-docs/embedded-link.pdf → example-docs/pdf/embedded-link.pdf
diff --git a/example-docs/emphasis-text.pdf → example-docs/pdf/emphasis-text.pdf b/example-docs/emphasis-text.pdf → example-docs/pdf/emphasis-text.pdf
diff --git a/example-docs/failure-after-repair.pdf → example-docs/pdf/failure-after-repair.pdf b/example-docs/failure-after-repair.pdf → example-docs/pdf/failure-after-repair.pdf
diff --git a/...le-docs/fake-memo-with-duplicate-page.pdf → ...ocs/pdf/fake-memo-with-duplicate-page.pdf b/...le-docs/fake-memo-with-duplicate-page.pdf → ...ocs/pdf/fake-memo-with-duplicate-page.pdf
diff --git a/example-docs/fake-memo.pdf → example-docs/pdf/fake-memo.pdf b/example-docs/fake-memo.pdf → example-docs/pdf/fake-memo.pdf
diff --git a/example-docs/header-test-doc.pdf → example-docs/pdf/header-test-doc.pdf b/example-docs/header-test-doc.pdf → example-docs/pdf/header-test-doc.pdf
diff --git a/example-docs/interface-config-guide-p93.pdf → ...e-docs/pdf/interface-config-guide-p93.pdf b/example-docs/interface-config-guide-p93.pdf → ...e-docs/pdf/interface-config-guide-p93.pdf
diff --git a/...lid-pdf-structure-pdfminer-entire-doc.pdf → ...lid-pdf-structure-pdfminer-entire-doc.pdf b/...lid-pdf-structure-pdfminer-entire-doc.pdf → ...lid-pdf-structure-pdfminer-entire-doc.pdf
diff --git a/...valid-pdf-structure-pdfminer-one-page.pdf → ...valid-pdf-structure-pdfminer-one-page.pdf b/...valid-pdf-structure-pdfminer-one-page.pdf → ...valid-pdf-structure-pdfminer-one-page.pdf
diff --git a/example-docs/korean-text-with-tables.pdf → example-docs/pdf/korean-text-with-tables.pdf b/example-docs/korean-text-with-tables.pdf → example-docs/pdf/korean-text-with-tables.pdf
diff --git a/example-docs/layout-parser-paper-fast.pdf → ...ple-docs/pdf/layout-parser-paper-fast.pdf b/example-docs/layout-parser-paper-fast.pdf → ...ple-docs/pdf/layout-parser-paper-fast.pdf
diff --git a/.../layout-parser-paper-with-empty-pages.pdf → .../layout-parser-paper-with-empty-pages.pdf b/.../layout-parser-paper-with-empty-pages.pdf → .../layout-parser-paper-with-empty-pages.pdf
diff --git a/...e-docs/layout-parser-paper-with-table.pdf → ...cs/pdf/layout-parser-paper-with-table.pdf b/...e-docs/layout-parser-paper-with-table.pdf → ...cs/pdf/layout-parser-paper-with-table.pdf
diff --git a/example-docs/layout-parser-paper.pdf → example-docs/pdf/layout-parser-paper.pdf b/example-docs/layout-parser-paper.pdf → example-docs/pdf/layout-parser-paper.pdf
diff --git a/example-docs/list-item-example.pdf → example-docs/pdf/list-item-example.pdf b/example-docs/list-item-example.pdf → example-docs/pdf/list-item-example.pdf
diff --git a/example-docs/loremipsum-flat.pdf → example-docs/pdf/loremipsum-flat.pdf b/example-docs/loremipsum-flat.pdf → example-docs/pdf/loremipsum-flat.pdf
diff --git a/example-docs/multi-column-2p.pdf → example-docs/pdf/multi-column-2p.pdf b/example-docs/multi-column-2p.pdf → example-docs/pdf/multi-column-2p.pdf
diff --git a/example-docs/multi-column.pdf → example-docs/pdf/multi-column.pdf b/example-docs/multi-column.pdf → example-docs/pdf/multi-column.pdf
diff --git a/example-docs/negative-coords.pdf → example-docs/pdf/negative-coords.pdf b/example-docs/negative-coords.pdf → example-docs/pdf/negative-coords.pdf
diff --git a/example-docs/pdf-bad-color-space.pdf → example-docs/pdf/pdf-bad-color-space.pdf b/example-docs/pdf-bad-color-space.pdf → example-docs/pdf/pdf-bad-color-space.pdf
diff --git a/...docs/pdf2image-memory-error-test-400p.pdf → .../pdf/pdf2image-memory-error-test-400p.pdf b/...docs/pdf2image-memory-error-test-400p.pdf → .../pdf/pdf2image-memory-error-test-400p.pdf
diff --git a/example-docs/reliance.pdf → example-docs/pdf/reliance.pdf b/example-docs/reliance.pdf → example-docs/pdf/reliance.pdf
diff --git a/...ple-docs/table-multi-row-column-cells.pdf → ...docs/pdf/table-multi-row-column-cells.pdf b/...ple-docs/table-multi-row-column-cells.pdf → ...docs/pdf/table-multi-row-column-cells.pdf
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -99,9 +99,9 @@ def test_detect_filetype_from_filename_with_extension(
 @pytest.mark.parametrize(
     ("file_name", "expected_value"),
     [
-        ("layout-parser-paper-fast.pdf", [FileType.PDF]),
+        ("pdf/layout-parser-paper-fast.pdf", [FileType.PDF]),
         ("fake.docx", [FileType.DOCX]),
-        ("example.jpg", [FileType.JPG]),
+        ("img/example.jpg", [FileType.JPG]),
         ("fake-text.txt", [FileType.TXT]),
         ("eml/fake-email.eml", [FileType.EML]),
         ("factbook.xml", [FileType.XML]),
@@ -424,7 +424,7 @@ def test_detect_BMP_from_file_path():
 
 
 def test_detect_BMP_from_file_no_extension():
-    with open(example_doc_path("bmp_24.bmp"), "rb") as f:
+    with open(example_doc_path("img/bmp_24.bmp"), "rb") as f:
         file = io.BytesIO(f.read())
     assert detect_filetype(file=file) == FileType.BMP
 

diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py
@@ -7,9 +7,10 @@
 import pytest
 
 import unstructured.file_utils.metadata as meta
+from test_unstructured.unit_utils import example_doc_path
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg")
+EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")
 
 
 def test_get_docx_metadata_from_filename(tmpdir):

diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from test_unstructured.unit_utils import example_doc_path
 from unstructured.metrics.table.table_alignment import TableAlignment
 from unstructured.metrics.table.table_eval import TableEvalProcessor
 from unstructured.metrics.table_structure import (
@@ -14,8 +15,8 @@
 @pytest.mark.parametrize(
     "filename",
     [
-        "example-docs/table-multi-row-column-cells.png",
-        "example-docs/table-multi-row-column-cells.pdf",
+        example_doc_path("img/table-multi-row-column-cells.png"),
+        example_doc_path("pdf/table-multi-row-column-cells.pdf"),
     ],
 )
 def test_image_or_pdf_to_dataframe(filename):
@@ -25,8 +26,8 @@ def test_image_or_pdf_to_dataframe(filename):
 
 def test_eval_table_transformer_for_file():
     score = eval_table_transformer_for_file(
-        "example-docs/table-multi-row-column-cells.png",
-        "example-docs/table-multi-row-column-cells-actual.csv",
+        example_doc_path("img/table-multi-row-column-cells.png"),
+        example_doc_path("table-multi-row-column-cells-actual.csv"),
     )
     # avoid severe degradation of performance
     assert 0.8 < score < 1

diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
@@ -1,13 +1,14 @@
 import pytest
 
+from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition import pdf
 from unstructured.partition.utils.constants import PartitionStrategy
 
 
 @pytest.fixture(scope="session")
 def chipper_results():
     elements = pdf.partition_pdf(
-        "example-docs/layout-parser-paper-fast.pdf",
+        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
         strategy=PartitionStrategy.HI_RES,
         model_name="chipper",
     )