diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0630abda79..94a6ff8bc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,6 @@
### Fixes
* **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`.
-
* **Move Astra embedded_dimension to write config**
## 0.14.10
diff --git a/example-docs/DA-1p.heic b/example-docs/img/DA-1p.heic
similarity index 100%
rename from example-docs/DA-1p.heic
rename to example-docs/img/DA-1p.heic
diff --git a/example-docs/DA-1p.jpg b/example-docs/img/DA-1p.jpg
similarity index 100%
rename from example-docs/DA-1p.jpg
rename to example-docs/img/DA-1p.jpg
diff --git a/example-docs/DA-1p.png b/example-docs/img/DA-1p.png
similarity index 100%
rename from example-docs/DA-1p.png
rename to example-docs/img/DA-1p.png
diff --git a/example-docs/bmp_24.bmp b/example-docs/img/bmp_24.bmp
similarity index 100%
rename from example-docs/bmp_24.bmp
rename to example-docs/img/bmp_24.bmp
diff --git a/example-docs/chi_sim_image.jpeg b/example-docs/img/chi_sim_image.jpeg
similarity index 100%
rename from example-docs/chi_sim_image.jpeg
rename to example-docs/img/chi_sim_image.jpeg
diff --git a/example-docs/double-column-A.jpg b/example-docs/img/double-column-A.jpg
similarity index 100%
rename from example-docs/double-column-A.jpg
rename to example-docs/img/double-column-A.jpg
diff --git a/example-docs/double-column-B.jpg b/example-docs/img/double-column-B.jpg
similarity index 100%
rename from example-docs/double-column-B.jpg
rename to example-docs/img/double-column-B.jpg
diff --git a/example-docs/embedded-images-tables.jpg b/example-docs/img/embedded-images-tables.jpg
similarity index 100%
rename from example-docs/embedded-images-tables.jpg
rename to example-docs/img/embedded-images-tables.jpg
diff --git a/example-docs/english-and-korean.png b/example-docs/img/english-and-korean.png
similarity index 100%
rename from example-docs/english-and-korean.png
rename to example-docs/img/english-and-korean.png
diff --git a/example-docs/example.jpg b/example-docs/img/example.jpg
similarity index 100%
rename from example-docs/example.jpg
rename to example-docs/img/example.jpg
diff --git a/example-docs/jpn-vert.jpeg b/example-docs/img/jpn-vert.jpeg
similarity index 100%
rename from example-docs/jpn-vert.jpeg
rename to example-docs/img/jpn-vert.jpeg
diff --git a/example-docs/layout-parser-paper-10p.jpg b/example-docs/img/layout-parser-paper-10p.jpg
similarity index 100%
rename from example-docs/layout-parser-paper-10p.jpg
rename to example-docs/img/layout-parser-paper-10p.jpg
diff --git a/example-docs/layout-parser-paper-combined.tiff b/example-docs/img/layout-parser-paper-combined.tiff
similarity index 100%
rename from example-docs/layout-parser-paper-combined.tiff
rename to example-docs/img/layout-parser-paper-combined.tiff
diff --git a/example-docs/layout-parser-paper-fast.jpg b/example-docs/img/layout-parser-paper-fast.jpg
similarity index 100%
rename from example-docs/layout-parser-paper-fast.jpg
rename to example-docs/img/layout-parser-paper-fast.jpg
diff --git a/example-docs/layout-parser-paper-fast.tiff b/example-docs/img/layout-parser-paper-fast.tiff
similarity index 100%
rename from example-docs/layout-parser-paper-fast.tiff
rename to example-docs/img/layout-parser-paper-fast.tiff
diff --git a/example-docs/layout-parser-paper-with-table.jpg b/example-docs/img/layout-parser-paper-with-table.jpg
similarity index 100%
rename from example-docs/layout-parser-paper-with-table.jpg
rename to example-docs/img/layout-parser-paper-with-table.jpg
diff --git a/example-docs/table-multi-row-column-cells.png b/example-docs/img/table-multi-row-column-cells.png
similarity index 100%
rename from example-docs/table-multi-row-column-cells.png
rename to example-docs/img/table-multi-row-column-cells.png
diff --git a/example-docs/DA-1p.pdf b/example-docs/pdf/DA-1p.pdf
similarity index 100%
rename from example-docs/DA-1p.pdf
rename to example-docs/pdf/DA-1p.pdf
diff --git a/example-docs/DA-619p.pdf b/example-docs/pdf/DA-619p.pdf
similarity index 100%
rename from example-docs/DA-619p.pdf
rename to example-docs/pdf/DA-619p.pdf
diff --git a/example-docs/a1977-backus-p21.pdf b/example-docs/pdf/a1977-backus-p21.pdf
similarity index 100%
rename from example-docs/a1977-backus-p21.pdf
rename to example-docs/pdf/a1977-backus-p21.pdf
diff --git a/example-docs/all-number-table.pdf b/example-docs/pdf/all-number-table.pdf
similarity index 100%
rename from example-docs/all-number-table.pdf
rename to example-docs/pdf/all-number-table.pdf
diff --git a/example-docs/chevron-page.pdf b/example-docs/pdf/chevron-page.pdf
similarity index 100%
rename from example-docs/chevron-page.pdf
rename to example-docs/pdf/chevron-page.pdf
diff --git a/example-docs/copy-protected.pdf b/example-docs/pdf/copy-protected.pdf
similarity index 100%
rename from example-docs/copy-protected.pdf
rename to example-docs/pdf/copy-protected.pdf
diff --git a/example-docs/embedded-images-tables.pdf b/example-docs/pdf/embedded-images-tables.pdf
similarity index 100%
rename from example-docs/embedded-images-tables.pdf
rename to example-docs/pdf/embedded-images-tables.pdf
diff --git a/example-docs/embedded-images.pdf b/example-docs/pdf/embedded-images.pdf
similarity index 100%
rename from example-docs/embedded-images.pdf
rename to example-docs/pdf/embedded-images.pdf
diff --git a/example-docs/embedded-link.pdf b/example-docs/pdf/embedded-link.pdf
similarity index 100%
rename from example-docs/embedded-link.pdf
rename to example-docs/pdf/embedded-link.pdf
diff --git a/example-docs/emphasis-text.pdf b/example-docs/pdf/emphasis-text.pdf
similarity index 100%
rename from example-docs/emphasis-text.pdf
rename to example-docs/pdf/emphasis-text.pdf
diff --git a/example-docs/failure-after-repair.pdf b/example-docs/pdf/failure-after-repair.pdf
similarity index 100%
rename from example-docs/failure-after-repair.pdf
rename to example-docs/pdf/failure-after-repair.pdf
diff --git a/example-docs/fake-memo-with-duplicate-page.pdf b/example-docs/pdf/fake-memo-with-duplicate-page.pdf
similarity index 100%
rename from example-docs/fake-memo-with-duplicate-page.pdf
rename to example-docs/pdf/fake-memo-with-duplicate-page.pdf
diff --git a/example-docs/fake-memo.pdf b/example-docs/pdf/fake-memo.pdf
similarity index 100%
rename from example-docs/fake-memo.pdf
rename to example-docs/pdf/fake-memo.pdf
diff --git a/example-docs/header-test-doc.pdf b/example-docs/pdf/header-test-doc.pdf
similarity index 100%
rename from example-docs/header-test-doc.pdf
rename to example-docs/pdf/header-test-doc.pdf
diff --git a/example-docs/interface-config-guide-p93.pdf b/example-docs/pdf/interface-config-guide-p93.pdf
similarity index 100%
rename from example-docs/interface-config-guide-p93.pdf
rename to example-docs/pdf/interface-config-guide-p93.pdf
diff --git a/example-docs/invalid-pdf-structure-pdfminer-entire-doc.pdf b/example-docs/pdf/invalid-pdf-structure-pdfminer-entire-doc.pdf
similarity index 100%
rename from example-docs/invalid-pdf-structure-pdfminer-entire-doc.pdf
rename to example-docs/pdf/invalid-pdf-structure-pdfminer-entire-doc.pdf
diff --git a/example-docs/invalid-pdf-structure-pdfminer-one-page.pdf b/example-docs/pdf/invalid-pdf-structure-pdfminer-one-page.pdf
similarity index 100%
rename from example-docs/invalid-pdf-structure-pdfminer-one-page.pdf
rename to example-docs/pdf/invalid-pdf-structure-pdfminer-one-page.pdf
diff --git a/example-docs/korean-text-with-tables.pdf b/example-docs/pdf/korean-text-with-tables.pdf
similarity index 100%
rename from example-docs/korean-text-with-tables.pdf
rename to example-docs/pdf/korean-text-with-tables.pdf
diff --git a/example-docs/layout-parser-paper-fast.pdf b/example-docs/pdf/layout-parser-paper-fast.pdf
similarity index 100%
rename from example-docs/layout-parser-paper-fast.pdf
rename to example-docs/pdf/layout-parser-paper-fast.pdf
diff --git a/example-docs/layout-parser-paper-with-empty-pages.pdf b/example-docs/pdf/layout-parser-paper-with-empty-pages.pdf
similarity index 100%
rename from example-docs/layout-parser-paper-with-empty-pages.pdf
rename to example-docs/pdf/layout-parser-paper-with-empty-pages.pdf
diff --git a/example-docs/layout-parser-paper-with-table.pdf b/example-docs/pdf/layout-parser-paper-with-table.pdf
similarity index 100%
rename from example-docs/layout-parser-paper-with-table.pdf
rename to example-docs/pdf/layout-parser-paper-with-table.pdf
diff --git a/example-docs/layout-parser-paper.pdf b/example-docs/pdf/layout-parser-paper.pdf
similarity index 100%
rename from example-docs/layout-parser-paper.pdf
rename to example-docs/pdf/layout-parser-paper.pdf
diff --git a/example-docs/list-item-example.pdf b/example-docs/pdf/list-item-example.pdf
similarity index 100%
rename from example-docs/list-item-example.pdf
rename to example-docs/pdf/list-item-example.pdf
diff --git a/example-docs/loremipsum-flat.pdf b/example-docs/pdf/loremipsum-flat.pdf
similarity index 100%
rename from example-docs/loremipsum-flat.pdf
rename to example-docs/pdf/loremipsum-flat.pdf
diff --git a/example-docs/multi-column-2p.pdf b/example-docs/pdf/multi-column-2p.pdf
similarity index 100%
rename from example-docs/multi-column-2p.pdf
rename to example-docs/pdf/multi-column-2p.pdf
diff --git a/example-docs/multi-column.pdf b/example-docs/pdf/multi-column.pdf
similarity index 100%
rename from example-docs/multi-column.pdf
rename to example-docs/pdf/multi-column.pdf
diff --git a/example-docs/negative-coords.pdf b/example-docs/pdf/negative-coords.pdf
similarity index 100%
rename from example-docs/negative-coords.pdf
rename to example-docs/pdf/negative-coords.pdf
diff --git a/example-docs/pdf-bad-color-space.pdf b/example-docs/pdf/pdf-bad-color-space.pdf
similarity index 100%
rename from example-docs/pdf-bad-color-space.pdf
rename to example-docs/pdf/pdf-bad-color-space.pdf
diff --git a/example-docs/pdf2image-memory-error-test-400p.pdf b/example-docs/pdf/pdf2image-memory-error-test-400p.pdf
similarity index 100%
rename from example-docs/pdf2image-memory-error-test-400p.pdf
rename to example-docs/pdf/pdf2image-memory-error-test-400p.pdf
diff --git a/example-docs/reliance.pdf b/example-docs/pdf/reliance.pdf
similarity index 100%
rename from example-docs/reliance.pdf
rename to example-docs/pdf/reliance.pdf
diff --git a/example-docs/table-multi-row-column-cells.pdf b/example-docs/pdf/table-multi-row-column-cells.pdf
similarity index 100%
rename from example-docs/table-multi-row-column-cells.pdf
rename to example-docs/pdf/table-multi-row-column-cells.pdf
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index af18577225..545f441a0d 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -99,9 +99,9 @@ def test_detect_filetype_from_filename_with_extension(
@pytest.mark.parametrize(
("file_name", "expected_value"),
[
- ("layout-parser-paper-fast.pdf", [FileType.PDF]),
+ ("pdf/layout-parser-paper-fast.pdf", [FileType.PDF]),
("fake.docx", [FileType.DOCX]),
- ("example.jpg", [FileType.JPG]),
+ ("img/example.jpg", [FileType.JPG]),
("fake-text.txt", [FileType.TXT]),
("eml/fake-email.eml", [FileType.EML]),
("factbook.xml", [FileType.XML]),
@@ -424,7 +424,7 @@ def test_detect_BMP_from_file_path():
def test_detect_BMP_from_file_no_extension():
- with open(example_doc_path("bmp_24.bmp"), "rb") as f:
+ with open(example_doc_path("img/bmp_24.bmp"), "rb") as f:
file = io.BytesIO(f.read())
assert detect_filetype(file=file) == FileType.BMP
diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py
index 4239ab436b..99ee2356b6 100644
--- a/test_unstructured/file_utils/test_metadata.py
+++ b/test_unstructured/file_utils/test_metadata.py
@@ -7,9 +7,10 @@
import pytest
import unstructured.file_utils.metadata as meta
+from test_unstructured.unit_utils import example_doc_path
DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg")
+EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg")
def test_get_docx_metadata_from_filename(tmpdir):
diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py
index 3c684be5a4..dc564443cb 100644
--- a/test_unstructured/metrics/test_table_structure.py
+++ b/test_unstructured/metrics/test_table_structure.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+from test_unstructured.unit_utils import example_doc_path
from unstructured.metrics.table.table_alignment import TableAlignment
from unstructured.metrics.table.table_eval import TableEvalProcessor
from unstructured.metrics.table_structure import (
@@ -14,8 +15,8 @@
@pytest.mark.parametrize(
"filename",
[
- "example-docs/table-multi-row-column-cells.png",
- "example-docs/table-multi-row-column-cells.pdf",
+ example_doc_path("img/table-multi-row-column-cells.png"),
+ example_doc_path("pdf/table-multi-row-column-cells.pdf"),
],
)
def test_image_or_pdf_to_dataframe(filename):
@@ -25,8 +26,8 @@ def test_image_or_pdf_to_dataframe(filename):
def test_eval_table_transformer_for_file():
score = eval_table_transformer_for_file(
- "example-docs/table-multi-row-column-cells.png",
- "example-docs/table-multi-row-column-cells-actual.csv",
+ example_doc_path("img/table-multi-row-column-cells.png"),
+ example_doc_path("table-multi-row-column-cells-actual.csv"),
)
# avoid severe degradation of performance
assert 0.8 < score < 1
diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
index 81f421159c..03cc610ea7 100644
--- a/test_unstructured/partition/pdf_image/test_chipper.py
+++ b/test_unstructured/partition/pdf_image/test_chipper.py
@@ -1,5 +1,6 @@
import pytest
+from test_unstructured.unit_utils import example_doc_path
from unstructured.partition import pdf
from unstructured.partition.utils.constants import PartitionStrategy
@@ -7,7 +8,7 @@
@pytest.fixture(scope="session")
def chipper_results():
elements = pdf.partition_pdf(
- "example-docs/layout-parser-paper-fast.pdf",
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
strategy=PartitionStrategy.HI_RES,
model_name="chipper",
)
diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
index 55018cb3a8..270b38c5ea 100644
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@@ -91,7 +91,7 @@ def pages(self):
@pytest.mark.parametrize(
("filename", "file"),
[
- ("example-docs/example.jpg", None),
+ (example_doc_path("img/example.jpg"), None),
(None, b"0000"),
],
)
@@ -132,7 +132,7 @@ def test_partition_image_local_raises_with_no_filename():
def test_partition_image_with_auto_strategy(
- filename="example-docs/layout-parser-paper-fast.jpg",
+ filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
titles = [
@@ -146,7 +146,7 @@ def test_partition_image_with_auto_strategy(
def test_partition_image_with_table_extraction(
- filename="example-docs/layout-parser-paper-with-table.jpg",
+ filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
elements = image.partition_image(
filename=filename,
@@ -160,7 +160,7 @@ def test_partition_image_with_table_extraction(
def test_partition_image_with_multipage_tiff(
- filename="example-docs/layout-parser-paper-combined.tiff",
+ filename=example_doc_path("img/layout-parser-paper-combined.tiff"),
):
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
assert elements[-1].metadata.page_number == 2
@@ -168,7 +168,7 @@ def test_partition_image_with_multipage_tiff(
def test_partition_image_with_bmp(
tmpdir,
- filename="example-docs/layout-parser-paper-with-table.jpg",
+ filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
img = Image.open(filename)
@@ -185,7 +185,7 @@ def test_partition_image_with_bmp(
assert "<table><thead><tr>" in table[0]
-def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
+def test_partition_image_with_language_passed(filename=example_doc_path("img/example.jpg")):
with mock.patch.object(
ocr,
"process_file_with_ocr",
@@ -201,7 +201,7 @@ def test_partition_image_with_language_passed(filename="example-docs/example.jpg
def test_partition_image_from_file_with_language_passed(
- filename="example-docs/example.jpg",
+ filename=example_doc_path("img/example.jpg"),
):
with mock.patch.object(
ocr,
@@ -216,7 +216,7 @@ def test_partition_image_from_file_with_language_passed(
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Current catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
- filename="example-docs/example.jpg",
+ filename=example_doc_path("img/example.jpg"),
):
with pytest.raises(TesseractError):
image.partition_image(
@@ -227,21 +227,14 @@ def test_partition_image_raises_with_invalid_language(
@pytest.mark.parametrize(
- ("strategy"),
+ "strategy",
[
- (PartitionStrategy.HI_RES),
- (PartitionStrategy.OCR_ONLY),
+ PartitionStrategy.HI_RES,
+ PartitionStrategy.OCR_ONLY,
],
)
def test_partition_image_strategies_keep_languages_metadata(strategy):
- filename = os.path.join(
- DIRECTORY,
- "..",
- "..",
- "..",
- "example-docs",
- "english-and-korean.png",
- )
+ filename = example_doc_path("img/english-and-korean.png")
elements = image.partition_image(
filename=filename,
languages=["eng", "kor"],
@@ -252,14 +245,7 @@ def test_partition_image_strategies_keep_languages_metadata(strategy):
def test_partition_image_with_ocr_detects_korean():
- filename = os.path.join(
- DIRECTORY,
- "..",
- "..",
- "..",
- "example-docs",
- "english-and-korean.png",
- )
+ filename = example_doc_path("img/english-and-korean.png")
elements = image.partition_image(
filename=filename,
ocr_languages="eng+kor",
@@ -271,7 +257,7 @@ def test_partition_image_with_ocr_detects_korean():
def test_partition_image_with_ocr_detects_korean_from_file():
- filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png")
+ filename = example_doc_path("img/english-and-korean.png")
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,
@@ -284,27 +270,13 @@ def test_partition_image_with_ocr_detects_korean_from_file():
def test_partition_image_raises_with_bad_strategy():
- filename = os.path.join(
- DIRECTORY,
- "..",
- "..",
- "..",
- "example-docs",
- "english-and-korean.png",
- )
+ filename = example_doc_path("img/english-and-korean.png")
with pytest.raises(ValueError):
image.partition_image(filename=filename, strategy="fakeroo")
def test_partition_image_default_strategy_hi_res():
- filename = os.path.join(
- DIRECTORY,
- "..",
- "..",
- "..",
- "example-docs",
- "layout-parser-paper-fast.jpg",
- )
+ filename = example_doc_path("img/layout-parser-paper-fast.jpg")
with open(filename, "rb") as f:
elements = image.partition_image(file=f)
@@ -324,7 +296,7 @@ def test_partition_image_default_strategy_hi_res():
def test_partition_image_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -338,7 +310,7 @@ def test_partition_image_metadata_date(
def test_partition_image_with_hi_res_strategy_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -352,7 +324,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date(
def test_partition_image_metadata_date_custom_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2009-07-05T09:24:28"
@@ -371,7 +343,7 @@ def test_partition_image_metadata_date_custom_metadata_date(
def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2009-07-05T09:24:28"
@@ -391,7 +363,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date
def test_partition_image_from_file_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -406,7 +378,7 @@ def test_partition_image_from_file_metadata_date(
def test_partition_image_from_file_explicit_get_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -421,7 +393,7 @@ def test_partition_image_from_file_explicit_get_metadata_date(
def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -437,7 +409,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
@@ -455,7 +427,7 @@ def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_da
def test_partition_image_from_file_metadata_date_custom_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2009-07-05T09:24:28"
@@ -475,7 +447,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date(
def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date(
mocker,
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2009-07-05T09:24:28"
@@ -495,7 +467,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
def test_partition_image_from_file_without_metadata_date(
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
"""Test partition_image() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
@@ -509,14 +481,14 @@ def test_partition_image_from_file_without_metadata_date(
def test_partition_msg_with_json():
elements = image.partition_image(
- example_doc_path("layout-parser-paper-fast.jpg"),
+ example_doc_path("img/layout-parser-paper-fast.jpg"),
strategy=PartitionStrategy.AUTO,
)
assert_round_trips_through_JSON(elements)
def test_partition_image_with_ocr_has_coordinates_from_filename(
- filename="example-docs/english-and-korean.png",
+ filename=example_doc_path("img/english-and-korean.png"),
):
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points]
@@ -524,11 +496,11 @@ def test_partition_image_with_ocr_has_coordinates_from_filename(
@pytest.mark.parametrize(
- ("filename"),
+ "filename",
[
- ("example-docs/layout-parser-paper-with-table.jpg"),
- ("example-docs/english-and-korean.png"),
- ("example-docs/layout-parser-paper-fast.jpg"),
+ "img/layout-parser-paper-with-table.jpg",
+ "img/english-and-korean.png",
+ "img/layout-parser-paper-fast.jpg",
],
)
def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
@@ -536,7 +508,9 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
):
import math
- elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
+ elements = image.partition_image(
+ filename=example_doc_path(filename), strategy=PartitionStrategy.OCR_ONLY
+ )
for element in elements:
# TODO (jennings) One or multiple elements is an empty string
# without coordinates. This should be fixed in a new issue
@@ -548,7 +522,7 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
def test_partition_image_formats_languages_for_tesseract():
- filename = "example-docs/jpn-vert.jpeg"
+ filename = example_doc_path("img/jpn-vert.jpeg")
with mock.patch(
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
) as mock_process_file_with_ocr:
@@ -561,13 +535,13 @@ def test_partition_image_formats_languages_for_tesseract():
def test_partition_image_warns_with_ocr_languages(caplog):
- filename = "example-docs/layout-parser-paper-fast.jpg"
+ filename = example_doc_path("img/layout-parser-paper-fast.jpg")
image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
assert "The ocr_languages kwarg will be deprecated" in caplog.text
def test_add_chunking_strategy_on_partition_image(
- filename="example-docs/layout-parser-paper-fast.jpg",
+ filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
elements = image.partition_image(filename=filename)
chunk_elements = image.partition_image(filename, chunking_strategy="by_title")
@@ -577,7 +551,7 @@ def test_add_chunking_strategy_on_partition_image(
def test_add_chunking_strategy_on_partition_image_hi_res(
- filename="example-docs/layout-parser-paper-with-table.jpg",
+ filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
elements = image.partition_image(
filename=filename,
@@ -600,7 +574,9 @@ def test_partition_image_uses_model_name():
pdf,
"_partition_pdf_or_image_local",
) as mockpartition:
- image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test")
+ image.partition_image(
+ example_doc_path("img/layout-parser-paper-fast.jpg"), model_name="test"
+ )
print(mockpartition.call_args)
assert "model_name" in mockpartition.call_args.kwargs
assert mockpartition.call_args.kwargs["model_name"]
@@ -611,7 +587,9 @@ def test_partition_image_uses_hi_res_model_name():
pdf,
"_partition_pdf_or_image_local",
) as mockpartition:
- image.partition_image("example-docs/layout-parser-paper-fast.jpg", hi_res_model_name="test")
+ image.partition_image(
+ example_doc_path("img/layout-parser-paper-fast.jpg"), hi_res_model_name="test"
+ )
print(mockpartition.call_args)
assert "model_name" not in mockpartition.call_args.kwargs
assert "hi_res_model_name" in mockpartition.call_args.kwargs
@@ -626,7 +604,7 @@ def test_partition_image_uses_hi_res_model_name():
],
)
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
- filename = "example-docs/layout-parser-paper-fast.jpg"
+ filename = example_doc_path("img/layout-parser-paper-fast.jpg")
elements = image.partition_image(
filename=filename, ocr_mode=ocr_mode, strategy=PartitionStrategy.HI_RES
)
@@ -635,7 +613,7 @@ def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
def test_partition_image_hi_res_invalid_ocr_mode():
- filename = "example-docs/layout-parser-paper-fast.jpg"
+ filename = example_doc_path("img/layout-parser-paper-fast.jpg")
with pytest.raises(ValueError):
_ = image.partition_image(
filename=filename, ocr_mode="invalid_ocr_mode", strategy=PartitionStrategy.HI_RES
@@ -643,14 +621,14 @@ def test_partition_image_hi_res_invalid_ocr_mode():
@pytest.mark.parametrize(
- ("ocr_mode"),
+ "ocr_mode",
[
- ("entire_page"),
- ("individual_blocks"),
+ "entire_page",
+ "individual_blocks",
],
)
def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
- filename = "example-docs/layout-parser-paper-with-table.jpg"
+ filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
elements = image.partition_image(
filename=filename,
ocr_mode=ocr_mode,
@@ -665,8 +643,8 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
assert "Layouts of scanned modern magazines and scientific reports" in table[0]
-def test_partition_image_raises_TypeError_for_invalid_languages():
- filename = "example-docs/layout-parser-paper-fast.jpg"
+def test_partition_image_raises_type_error_for_invalid_languages():
+ filename = example_doc_path("img/layout-parser-paper-fast.jpg")
with pytest.raises(TypeError):
image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng")
@@ -683,7 +661,6 @@ def inference_results():
def test_partition_image_has_filename(inference_results):
- doc_path = "example-docs"
filename = "layout-parser-paper-fast.jpg"
# Mock inference call with known return results
with mock.patch(
@@ -691,7 +668,7 @@ def test_partition_image_has_filename(inference_results):
return_value=inference_results,
) as mock_inference_func:
elements = image.partition_image(
- filename=os.path.join(doc_path, filename),
+ filename=example_doc_path(f"img/{filename}"),
strategy=PartitionStrategy.HI_RES,
)
# Make sure we actually went down the path we expect.
@@ -710,7 +687,7 @@ def test_partition_image_has_filename(inference_results):
def test_partition_image_element_extraction(
file_mode,
extract_image_block_to_payload,
- filename=example_doc_path("embedded-images-tables.jpg"),
+ filename=example_doc_path("img/embedded-images-tables.jpg"),
):
extract_image_block_types = ["Image", "Table"]
@@ -737,7 +714,7 @@ def test_partition_image_element_extraction(
def test_partition_image_works_on_heic_file(
- filename="example-docs/DA-1p.heic",
+ filename=example_doc_path("img/DA-1p.heic"),
):
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
titles = [el.text for el in elements if el.category == ElementType.TITLE]
@@ -745,17 +722,17 @@ def test_partition_image_works_on_heic_file(
@pytest.mark.parametrize(
- ("strategy"),
+ "strategy",
[PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY],
)
def test_deterministic_element_ids(strategy: str):
elements_1 = image.partition_image(
- example_doc_path("layout-parser-paper-with-table.jpg"),
+ example_doc_path("img/layout-parser-paper-with-table.jpg"),
strategy=strategy,
starting_page_number=2,
)
elements_2 = image.partition_image(
- example_doc_path("layout-parser-paper-with-table.jpg"),
+ example_doc_path("img/layout-parser-paper-with-table.jpg"),
strategy=strategy,
starting_page_number=2,
)
@@ -765,9 +742,9 @@ def test_deterministic_element_ids(strategy: str):
assert ids_1 == ids_2
-def test_multipage_tiff_starts_on_starting_page_number():
+def test_multi_page_tiff_starts_on_starting_page_number():
elements = image.partition_image(
- example_doc_path("layout-parser-paper-combined.tiff"),
+ example_doc_path("img/layout-parser-paper-combined.tiff"),
starting_page_number=2,
)
pages = {element.metadata.page_number for element in elements}
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 7301fb45b2..f6d233701b 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -105,7 +105,7 @@ def pages(self):
@pytest.mark.parametrize(
("filename", "file"),
[
- (example_doc_path("layout-parser-paper-fast.pdf"), None),
+ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), None),
(None, b"0000"),
],
)
@@ -168,7 +168,7 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
starting_page_number,
expected_page_numbers,
origin,
- filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"),
):
# Test that the partition_pdf function can handle filename
def _test(result):
@@ -204,7 +204,7 @@ def _test(result):
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name_env_var(
monkeypatch,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
@@ -220,7 +220,7 @@ def test_partition_pdf_with_model_name_env_var(
def test_partition_pdf_with_model_name(
monkeypatch,
model_name,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
@@ -251,7 +251,7 @@ def test_partition_pdf_with_model_name(
def test_partition_pdf_with_hi_res_model_name(
monkeypatch,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
@@ -268,7 +268,7 @@ def test_partition_pdf_with_hi_res_model_name(
def test_partition_pdf_or_image_with_hi_res_model_name(
monkeypatch,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
@@ -284,7 +284,7 @@ def test_partition_pdf_or_image_with_hi_res_model_name(
def test_partition_pdf_with_auto_strategy(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
@@ -294,21 +294,21 @@ def test_partition_pdf_with_auto_strategy(
def test_partition_pdf_with_page_breaks(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
assert "PageBreak" in [elem.category for elem in elements]
def test_partition_pdf_with_no_page_breaks(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, url=None)
assert "PageBreak" not in [elem.category for elem in elements]
def test_partition_pdf_with_fast_strategy(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
@@ -321,7 +321,7 @@ def test_partition_pdf_with_fast_strategy(
def test_partition_pdf_with_fast_neg_coordinates():
- filename = example_doc_path("negative-coords.pdf")
+ filename = example_doc_path("pdf/negative-coords.pdf")
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
assert len(elements) == 5
assert elements[0].metadata.coordinates.points[0][0] < 0
@@ -329,7 +329,7 @@ def test_partition_pdf_with_fast_neg_coordinates():
def test_partition_pdf_with_fast_groups_text(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
@@ -345,7 +345,7 @@ def test_partition_pdf_with_fast_groups_text(
def test_partition_pdf_with_fast_strategy_from_file(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
with open(filename, "rb") as f:
elements = pdf.partition_pdf(file=f, url=None, strategy=PartitionStrategy.FAST)
@@ -354,7 +354,7 @@ def test_partition_pdf_with_fast_strategy_from_file(
def test_partition_pdf_with_fast_strategy_and_page_breaks(
caplog,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(
filename=filename,
@@ -371,7 +371,7 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks(
def test_partition_pdf_raises_with_bad_strategy(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
with pytest.raises(ValueError):
pdf.partition_pdf(filename=filename, url=None, strategy="made_up")
@@ -380,7 +380,7 @@ def test_partition_pdf_raises_with_bad_strategy(
def test_partition_pdf_falls_back_to_fast(
monkeypatch,
caplog,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"]
@@ -402,7 +402,7 @@ def mock_exists(dep):
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
monkeypatch,
caplog,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["pytesseract"]
@@ -428,7 +428,7 @@ def mock_exists(dep):
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
monkeypatch,
caplog,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["pytesseract"]
@@ -451,7 +451,7 @@ def mock_exists(dep):
def test_partition_pdf_falls_back_to_ocr_only(
monkeypatch,
caplog,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["unstructured_inference"]
@@ -471,7 +471,7 @@ def mock_exists(dep):
def test_partition_pdf_uses_table_extraction():
- filename = example_doc_path("layout-parser-paper-fast.pdf")
+ filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
with mock.patch(
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
) as mock_process_file_with_model:
@@ -480,14 +480,14 @@ def test_partition_pdf_uses_table_extraction():
@pytest.mark.parametrize(
- ("ocr_mode"),
+ "ocr_mode",
[
- ("entire_page"),
- ("individual_blocks"),
+ "entire_page",
+ "individual_blocks",
],
)
def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
- filename = example_doc_path("korean-text-with-tables.pdf")
+ filename = example_doc_path("pdf/korean-text-with-tables.pdf")
elements = pdf.partition_pdf(
filename=filename,
ocr_mode=ocr_mode,
@@ -506,15 +506,15 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
@pytest.mark.parametrize(
- ("strategy"),
+ "strategy",
[
- (PartitionStrategy.FAST),
- (PartitionStrategy.HI_RES),
- (PartitionStrategy.OCR_ONLY),
+ PartitionStrategy.FAST,
+ PartitionStrategy.HI_RES,
+ PartitionStrategy.OCR_ONLY,
],
)
def test_partition_pdf_strategies_keep_languages_metadata(strategy):
- filename = example_doc_path("korean-text-with-tables.pdf")
+ filename = example_doc_path("pdf/korean-text-with-tables.pdf")
elements = pdf.partition_pdf(
filename=filename,
languages=["kor"],
@@ -531,7 +531,7 @@ def test_partition_pdf_strategies_keep_languages_metadata(strategy):
],
)
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
- filename = example_doc_path("layout-parser-paper.pdf")
+ filename = example_doc_path("pdf/layout-parser-paper.pdf")
elements = pdf.partition_pdf(
filename=filename,
ocr_mode=ocr_mode,
@@ -548,7 +548,7 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
def test_partition_pdf_with_copy_protection():
- filename = os.path.join("example-docs", "copy-protected.pdf")
+ filename = example_doc_path("pdf/copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
idx = 2
@@ -559,13 +559,13 @@ def test_partition_pdf_with_copy_protection():
def test_partition_pdf_with_dpi():
- filename = os.path.join("example-docs", "copy-protected.pdf")
+ filename = example_doc_path("pdf/copy-protected.pdf")
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, pdf_image_dpi=100)
assert mock_process.call_args[1]["pdf_image_dpi"] == 100
-def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("reliance.pdf")):
+def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("pdf/reliance.pdf")):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
assert len(elements) > 50
assert elements[0].metadata.page_number == 1
@@ -573,14 +573,14 @@ def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("
def test_partition_pdf_text_not_extractable():
- filename = example_doc_path("loremipsum-flat.pdf")
+ filename = example_doc_path("pdf/loremipsum-flat.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
assert len(elements) == 0
def test_partition_pdf_fails_if_pdf_not_processable(
monkeypatch,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"]
@@ -593,7 +593,7 @@ def mock_exists(dep):
def test_partition_pdf_fast_groups_text_in_text_box():
- filename = os.path.join("example-docs", "chevron-page.pdf")
+ filename = example_doc_path("pdf/chevron-page.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
expected_coordinate_points_0 = (
(193.1741, 71.94000000000005),
@@ -633,7 +633,7 @@ def test_partition_pdf_fast_groups_text_in_text_box():
def test_partition_pdf_with_metadata_filename(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(
filename=filename,
@@ -646,7 +646,7 @@ def test_partition_pdf_with_metadata_filename(
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
with open(filename, "rb") as f:
elements = pdf.partition_pdf(
@@ -672,7 +672,7 @@ def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
def test_partition_pdf_exclude_metadata(
file_mode,
strategy,
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
if file_mode == "filename":
elements = pdf.partition_pdf(
@@ -711,7 +711,7 @@ def test_partition_pdf_metadata_date(
strategy,
last_modification_date,
date_from_file_object,
- filename=example_doc_path("copy-protected.pdf"),
+ filename=example_doc_path("pdf/copy-protected.pdf"),
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = (
@@ -762,14 +762,14 @@ def test_partition_pdf_metadata_date(
@pytest.mark.parametrize("strategy", [PartitionStrategy.FAST, PartitionStrategy.HI_RES])
def test_partition_pdf_with_json(strategy: str):
elements = pdf.partition_pdf(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
strategy=strategy,
)
assert_round_trips_through_JSON(elements)
def test_add_chunking_strategy_by_title_on_partition_pdf(
- filename=example_doc_path("layout-parser-paper-fast.pdf"),
+ filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename)
chunk_elements = pdf.partition_pdf(filename, chunking_strategy="by_title")
@@ -779,14 +779,14 @@ def test_add_chunking_strategy_by_title_on_partition_pdf(
def test_partition_pdf_formats_languages_for_tesseract():
- filename = example_doc_path("DA-1p.pdf")
+ filename = example_doc_path("pdf/DA-1p.pdf")
with mock.patch.object(ocr, "process_file_with_ocr", mock.MagicMock()) as mock_process:
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages=["en"])
assert mock_process.call_args[1]["ocr_languages"] == "eng"
def test_partition_pdf_warns_with_ocr_languages(caplog):
- filename = example_doc_path("chevron-page.pdf")
+ filename = example_doc_path("pdf/chevron-page.pdf")
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
assert "The ocr_languages kwarg will be deprecated" in caplog.text
@@ -795,7 +795,7 @@ def test_partition_categorization_backup():
text = "This is Clearly a Title"
with mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=[Text(text)]):
elements = pdf.partition_pdf_or_image(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
strategy=PartitionStrategy.HI_RES,
)
# Should have changed the element class from Text to Title
@@ -805,7 +805,7 @@ def test_partition_categorization_backup():
@pytest.mark.parametrize(
"filename",
- [example_doc_path("layout-parser-paper-fast.pdf")],
+ [example_doc_path("pdf/layout-parser-paper-fast.pdf")],
)
def test_combine_numbered_list(filename):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
@@ -823,7 +823,7 @@ def test_combine_numbered_list(filename):
@pytest.mark.parametrize(
"filename",
- [example_doc_path("layout-parser-paper-fast.pdf")],
+ [example_doc_path("pdf/layout-parser-paper-fast.pdf")],
)
def test_partition_pdf_hyperlinks(filename):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
@@ -849,7 +849,7 @@ def test_partition_pdf_hyperlinks(filename):
@pytest.mark.parametrize(
"filename",
- [example_doc_path("embedded-link.pdf")],
+ [example_doc_path("pdf/embedded-link.pdf")],
)
def test_partition_pdf_hyperlinks_multiple_lines(filename):
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
@@ -863,7 +863,7 @@ def test_partition_pdf_uses_model_name():
"_partition_pdf_or_image_local",
) as mockpartition:
pdf.partition_pdf(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
model_name="test",
strategy=PartitionStrategy.HI_RES,
)
@@ -879,7 +879,7 @@ def test_partition_pdf_uses_hi_res_model_name():
"_partition_pdf_or_image_local",
) as mockpartition:
pdf.partition_pdf(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
hi_res_model_name="test",
strategy=PartitionStrategy.HI_RES,
)
@@ -890,7 +890,7 @@ def test_partition_pdf_uses_hi_res_model_name():
def test_partition_pdf_word_bbox_not_char(
- filename=example_doc_path("interface-config-guide-p93.pdf"),
+ filename=example_doc_path("pdf/interface-config-guide-p93.pdf"),
):
try:
elements = pdf.partition_pdf(filename=filename, strategy="fast")
@@ -900,7 +900,7 @@ def test_partition_pdf_word_bbox_not_char(
def test_partition_pdf_fast_no_mapping_errors(
- filename=example_doc_path("a1977-backus-p21.pdf"),
+ filename=example_doc_path("pdf/a1977-backus-p21.pdf"),
):
"""Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940,
failing to map old parent_id's to new"""
@@ -908,7 +908,7 @@ def test_partition_pdf_fast_no_mapping_errors(
def test_partition_pdf_raises_TypeError_for_invalid_languages():
- filename = example_doc_path("chevron-page.pdf")
+ filename = example_doc_path("pdf/chevron-page.pdf")
with pytest.raises(TypeError):
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng")
@@ -948,7 +948,7 @@ def test_default_hi_res_model(env, expected, monkeypatch):
def test_partition_model_name_default_to_None():
- filename = example_doc_path("DA-1p.pdf")
+ filename = example_doc_path("pdf/DA-1p.pdf")
try:
pdf.partition_pdf(
filename=filename,
@@ -961,7 +961,7 @@ def test_partition_model_name_default_to_None():
def test_partition_hi_res_model_name_default_to_None():
- filename = example_doc_path("DA-1p.pdf")
+ filename = example_doc_path("pdf/DA-1p.pdf")
try:
pdf.partition_pdf(
filename=filename,
@@ -998,7 +998,7 @@ class CallException(Exception):
# Patch the ocr function with the mock that will record the call and then terminate
with mock.patch(ocr_func, mock_ocr_func), pytest.raises(CallException):
pdf.partition_pdf(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
strategy=strategy,
ocr_languages="kor",
)
@@ -1089,8 +1089,8 @@ def test_get_uris_from_annots_string_annotation(
@pytest.mark.parametrize(
("filename", "is_image"),
[
- (example_doc_path("layout-parser-paper-fast.pdf"), False),
- (example_doc_path("layout-parser-paper-fast.jpg"), True),
+ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
+ (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
],
)
def test_partition_pdf_with_ocr_only_strategy(
@@ -1148,18 +1148,20 @@ def test_partition_pdf_with_ocr_only_strategy(
def test_partition_pdf_with_all_number_table_and_ocr_only_strategy():
# AttributeError was previously being raised when partitioning documents that contained only
# numerical values with `strategy=PartitionStrategy.OCR_ONLY`
- filename = example_doc_path("all-number-table.pdf")
+ filename = example_doc_path("pdf/all-number-table.pdf")
assert pdf.partition_pdf(filename, strategy=PartitionStrategy.OCR_ONLY)
# As of pdfminer 221105, this pdf throws an error and requires a workaround
# See #2059
def test_partition_pdf_with_bad_color_profile():
- filename = example_doc_path("pdf-bad-color-space.pdf")
+ filename = example_doc_path("pdf/pdf-bad-color-space.pdf")
assert pdf.partition_pdf(filename, strategy="fast")
-def test_partition_pdf_with_fast_finds_headers_footers(filename="example-docs/header-test-doc.pdf"):
+def test_partition_pdf_with_fast_finds_headers_footers(
+ filename=example_doc_path("pdf/header-test-doc.pdf"),
+):
elements = pdf.partition_pdf(filename, strategy="fast")
assert isinstance(elements[0], Header)
assert isinstance(elements[-1], Footer)
@@ -1180,7 +1182,7 @@ def test_partition_pdf_with_fast_finds_headers_footers(filename="example-docs/he
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
- assert pdf.extractable_elements(filename=example_doc_path(filename))
+ assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
assert expected_log in caplog.text
@@ -1222,7 +1224,7 @@ def assert_element_extraction(
def test_partition_pdf_element_extraction(
file_mode,
extract_image_block_to_payload,
- filename=example_doc_path("embedded-images-tables.pdf"),
+ filename=example_doc_path("pdf/embedded-images-tables.pdf"),
):
extract_image_block_types = ["Image", "Table"]
@@ -1253,7 +1255,7 @@ def test_partition_pdf_element_extraction(
def test_partition_pdf_always_keep_all_image_elements(
- filename=example_doc_path("embedded-images.pdf"),
+ filename=example_doc_path("pdf/embedded-images.pdf"),
):
elements = pdf.partition_pdf(
filename=filename,
@@ -1325,7 +1327,9 @@ def expected_ids(request):
)
def test_unique_and_deterministic_element_ids(strategy, expected_ids):
elements = pdf.partition_pdf(
- "example-docs/fake-memo-with-duplicate-page.pdf", strategy=strategy, starting_page_number=2
+ example_doc_path("pdf/fake-memo-with-duplicate-page.pdf"),
+ strategy=strategy,
+ starting_page_number=2,
)
ids = [element.id for element in elements]
assert ids == expected_ids, "Element IDs do not match expected IDs"
@@ -1333,7 +1337,7 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
def test_analysis_artifacts_saved():
with tempfile.TemporaryDirectory() as temp_dir:
- filename = example_doc_path("layout-parser-paper-fast.pdf")
+ filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
pdf.partition_pdf(
filename=filename,
strategy=PartitionStrategy.HI_RES,
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
index f3a66c9e49..29582007bc 100644
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -36,7 +36,7 @@ def test_write_image(image_type):
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
- file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
+ file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf")
):
with tempfile.TemporaryDirectory() as tmpdir:
if file_mode == "filename":
@@ -71,8 +71,8 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i
@pytest.mark.parametrize(
("filename", "is_image"),
[
- (example_doc_path("layout-parser-paper-fast.pdf"), False),
- (example_doc_path("layout-parser-paper-fast.jpg"), True),
+ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
+ (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@@ -301,8 +301,8 @@ def test_annotate_layout_elements_with_image():
@pytest.mark.parametrize(
("filename", "is_image"),
[
- (example_doc_path("layout-parser-paper-fast.pdf"), False),
- (example_doc_path("layout-parser-paper-fast.jpg"), True),
+ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
+ (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
],
)
def test_annotate_layout_elements(filename, is_image):
diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py
index 85be972102..a9e5956dad 100644
--- a/test_unstructured/partition/test_api.py
+++ b/test_unstructured/partition/test_api.py
@@ -374,8 +374,8 @@ def test_partition_multiple_via_api_valid_request_data_kwargs():
@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI")
def test_partition_multiple_via_api_invalid_request_data_kwargs():
filenames = [
- example_doc_path("layout-parser-paper-fast.pdf"),
- example_doc_path("layout-parser-paper-fast.jpg"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
+ example_doc_path("img/layout-parser-paper-fast.jpg"),
]
with pytest.raises(ValueError):
partition_multiple_via_api(
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 7663d84ab3..ef10b9ede7 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -306,7 +306,7 @@ def test_auto_partition_html_pre_from_file():
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("layout-parser-paper-fast.jpg")
+ file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
metadata_filename = file_path if pass_metadata_filename else None
elements = partition(
@@ -328,7 +328,7 @@ def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("layout-parser-paper-fast.jpg")
+ file_path = example_doc_path("img/layout-parser-paper-fast.jpg")
metadata_filename = file_path if pass_metadata_filename else None
with open(file_path, "rb") as f:
@@ -348,7 +348,7 @@ def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_typ
def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path):
bmp_filename = str(tmp_path / "example.bmp")
- with Image.open(example_doc_path("layout-parser-paper-with-table.jpg")) as img:
+ with Image.open(example_doc_path("img/layout-parser-paper-with-table.jpg")) as img:
img.save(bmp_filename)
elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES)
@@ -365,7 +365,7 @@ def test_auto_partition_image_element_extraction(extract_image_block_to_payload:
with tempfile.TemporaryDirectory() as tmpdir:
elements = partition(
- filename=example_doc_path("embedded-images-tables.jpg"),
+ filename=example_doc_path("img/embedded-images-tables.jpg"),
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
@@ -516,7 +516,7 @@ def test_auto_partition_org_from_file():
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("layout-parser-paper-fast.pdf")
+ file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
metadata_filename = file_path if pass_metadata_filename else None
elements = partition(
@@ -547,7 +547,7 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None):
- file_path = example_doc_path("layout-parser-paper-fast.pdf")
+ file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
metadata_filename = file_path if pass_metadata_filename else None
with open(file_path, "rb") as f:
@@ -571,7 +571,7 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type
def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch):
- file_path = example_doc_path("layout-parser-paper-fast.pdf")
+ file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
mock_return = [NarrativeText("Hello there!")]
with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition:
@@ -603,7 +603,7 @@ def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument():
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
) as mock_process_file_with_model:
partition(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
pdf_infer_table_structure=True,
strategy=PartitionStrategy.HI_RES,
)
@@ -616,7 +616,7 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
with tempfile.TemporaryDirectory() as tmpdir:
elements = partition(
- example_doc_path("embedded-images-tables.pdf"),
+ example_doc_path("pdf/embedded-images-tables.pdf"),
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
@@ -635,7 +635,7 @@ def test_partition_pdf_does_not_raise_warning():
with warnings.catch_warnings():
warnings.simplefilter("error")
partition(
- example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES
)
@@ -1056,7 +1056,7 @@ def test_auto_partition_respects_language_arg(file_extension: str):
def test_auto_partition_forwards_include_page_breaks_to_partition_pdf():
elements = partition(
- example_doc_path("layout-parser-paper-fast.pdf"),
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"),
include_page_breaks=True,
strategy=PartitionStrategy.HI_RES,
)
@@ -1102,7 +1102,9 @@ def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRe
)
partition(
- example_doc_path("chi_sim_image.jpeg"), strategy=PartitionStrategy.HI_RES, languages=["zh"]
+ example_doc_path("img/chi_sim_image.jpeg"),
+ strategy=PartitionStrategy.HI_RES,
+ languages=["zh"],
)
call_kwargs = process_file_with_ocr_.call_args_list[0][1]
@@ -1124,7 +1126,9 @@ def test_auto_partition_ignores_empty_string_for_ocr_languages(
def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture):
partition(
- example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng"
+ example_doc_path("pdf/chevron-page.pdf"),
+ strategy=PartitionStrategy.HI_RES,
+ ocr_languages="eng",
)
assert caplog.records[0].levelname == "WARNING"
@@ -1186,7 +1190,7 @@ def test_auto_partition_adds_filetype_to_metadata(
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map)
elements = partition(
- example_doc_path("layout-parser-paper-fast.pdf"), content_type=content_type
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
)
assert len(elements) == 2
@@ -1213,7 +1217,9 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
)
monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_})
- elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type)
+ elements = partition(
+ example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type
+ )
assert len(elements) == 2
assert all(e.metadata.filetype == "application/pdf" for e in elements)
@@ -1238,7 +1244,8 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype:
# -- partition the first example-doc with the extension for this filetype --
elements: list[Element] = []
- for file in pathlib.Path(example_doc_path("")).iterdir():
+ doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
+ for file in pathlib.Path(doc_path).iterdir():
if file.is_file() and file.suffix == f".{extension}":
elements = partition_fn(str(file))
break
@@ -1256,7 +1263,7 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype:
def test_auto_partition_passes_user_provided_languages_arg_to_PDF():
elements = partition(
- example_doc_path("chevron-page.pdf"),
+ example_doc_path("pdf/chevron-page.pdf"),
strategy=PartitionStrategy.OCR_ONLY,
languages=["eng"],
)
diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
index 4131d62a89..ebe12c0bee 100644
--- a/test_unstructured/partition/test_common.py
+++ b/test_unstructured/partition/test_common.py
@@ -562,7 +562,7 @@ def test_document_to_element_list_sets_category_depth_titles():
def test_ocr_data_to_elements(
- filename="example-docs/layout-parser-paper-fast.jpg",
+ filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
text_regions = [
TextRegion.from_coords(
diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py
index 9c66076eab..de284e84f0 100644
--- a/test_unstructured/partition/test_strategies.py
+++ b/test_unstructured/partition/test_strategies.py
@@ -1,7 +1,6 @@
-import os
-
import pytest
+from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import Text
from unstructured.partition import pdf, strategies
from unstructured.partition.utils.constants import PartitionStrategy
@@ -43,7 +42,7 @@ def test_validate_strategy_raises_for_bad_strategy():
],
)
def test_is_pdf_text_extractable(filename, from_file, expected):
- filename = os.path.join("example-docs", filename)
+ filename = example_doc_path(f"pdf/{filename}")
if from_file:
with open(filename, "rb") as f:
diff --git a/test_unstructured_ingest/dest/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh
index 2d2c64db26..8b534939f3 100755
--- a/test_unstructured_ingest/dest/azure-cognitive-search.sh
+++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh
@@ -78,7 +78,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--chunking-strategy by_title \
--chunk-combine-text-under-n-chars 150 \
diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh
index 293324e2d9..208b4a5a4b 100755
--- a/test_unstructured_ingest/dest/azure.sh
+++ b/test_unstructured_ingest/dest/azure.sh
@@ -43,7 +43,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
azure \
--overwrite \
diff --git a/test_unstructured_ingest/dest/box.sh b/test_unstructured_ingest/dest/box.sh
index 0e36d1b87f..37ad702dd5 100755
--- a/test_unstructured_ingest/dest/box.sh
+++ b/test_unstructured_ingest/dest/box.sh
@@ -45,7 +45,7 @@
# --strategy fast \
# --verbose \
# --reprocess \
-# --input-path example-docs/fake-memo.pdf \
+# --input-path example-docs/pdf/fake-memo.pdf \
# --work-dir "$WORK_DIR" \
# box \
# --box-app-config "$BOX_APP_CONFIG_PATH" \
diff --git a/test_unstructured_ingest/dest/databricks-volumes.sh b/test_unstructured_ingest/dest/databricks-volumes.sh
index c97289c5fc..6cf6e38a24 100755
--- a/test_unstructured_ingest/dest/databricks-volumes.sh
+++ b/test_unstructured_ingest/dest/databricks-volumes.sh
@@ -44,7 +44,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
databricks-volumes \
--host "$DATABRICKS_HOST" \
diff --git a/test_unstructured_ingest/dest/delta-table.sh b/test_unstructured_ingest/dest/delta-table.sh
index d639d804b9..cf54e10546 100755
--- a/test_unstructured_ingest/dest/delta-table.sh
+++ b/test_unstructured_ingest/dest/delta-table.sh
@@ -38,7 +38,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
delta-table \
--table-uri "$DESTINATION_TABLE"
diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh
index b2f36a1868..52ade67223 100755
--- a/test_unstructured_ingest/dest/dropbox.sh
+++ b/test_unstructured_ingest/dest/dropbox.sh
@@ -62,7 +62,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
dropbox \
--token "$DROPBOX_ACCESS_TOKEN" \
diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh
index 3099dc31e4..21571a9373 100755
--- a/test_unstructured_ingest/dest/gcs.sh
+++ b/test_unstructured_ingest/dest/gcs.sh
@@ -47,7 +47,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
gcs \
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
diff --git a/test_unstructured_ingest/dest/kafka-local.sh b/test_unstructured_ingest/dest/kafka-local.sh
index 2e71b7484d..9086687ed2 100755
--- a/test_unstructured_ingest/dest/kafka-local.sh
+++ b/test_unstructured_ingest/dest/kafka-local.sh
@@ -42,7 +42,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/layout-parser-paper.pdf \
+ --input-path example-docs/pdf/layout-parser-paper.pdf \
--work-dir "$WORK_DIR" \
--chunking-strategy basic \
--chunk-combine-text-under-n-chars 200 \
diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh
index aa28090d3e..6c90c53fe4 100755
--- a/test_unstructured_ingest/dest/mongodb.sh
+++ b/test_unstructured_ingest/dest/mongodb.sh
@@ -54,7 +54,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
mongodb \
diff --git a/test_unstructured_ingest/dest/opensearch.sh b/test_unstructured_ingest/dest/opensearch.sh
index db64f3ff39..003e4f2868 100755
--- a/test_unstructured_ingest/dest/opensearch.sh
+++ b/test_unstructured_ingest/dest/opensearch.sh
@@ -41,7 +41,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
opensearch \
diff --git a/test_unstructured_ingest/dest/pgvector.sh b/test_unstructured_ingest/dest/pgvector.sh
index 66f6aa5bd7..25836cf1dc 100755
--- a/test_unstructured_ingest/dest/pgvector.sh
+++ b/test_unstructured_ingest/dest/pgvector.sh
@@ -40,7 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
sql \
diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh
index b992ebf7ad..b8d0b901e2 100755
--- a/test_unstructured_ingest/dest/s3.sh
+++ b/test_unstructured_ingest/dest/s3.sh
@@ -37,7 +37,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
s3 \
--key "$S3_INGEST_TEST_ACCESS_KEY" \
diff --git a/test_unstructured_ingest/dest/singlestore.sh b/test_unstructured_ingest/dest/singlestore.sh
index 1816a0e0e9..a04f81370c 100755
--- a/test_unstructured_ingest/dest/singlestore.sh
+++ b/test_unstructured_ingest/dest/singlestore.sh
@@ -46,7 +46,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
singlestore \
diff --git a/test_unstructured_ingest/dest/sqlite.sh b/test_unstructured_ingest/dest/sqlite.sh
index c289bf4218..9cd54b35e7 100755
--- a/test_unstructured_ingest/dest/sqlite.sh
+++ b/test_unstructured_ingest/dest/sqlite.sh
@@ -42,7 +42,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
sql \
--db-type "$DATABASE_TYPE" \
diff --git a/test_unstructured_ingest/dest/weaviate.sh b/test_unstructured_ingest/dest/weaviate.sh
index 68c4953d39..7dfa3281a5 100755
--- a/test_unstructured_ingest/dest/weaviate.sh
+++ b/test_unstructured_ingest/dest/weaviate.sh
@@ -40,7 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--strategy fast \
--verbose \
--reprocess \
- --input-path example-docs/fake-memo.pdf \
+ --input-path example-docs/pdf/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
weaviate \
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
index e89d8787e0..b07103abf1 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json
@@ -6,7 +6,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -28,7 +28,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -50,7 +50,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -72,7 +72,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -94,7 +94,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -116,7 +116,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
@@ -138,7 +138,7 @@
"metadata": {
"data_source": {
"record_locator": {
- "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf"
+ "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf"
},
"permissions_data": [
{
diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh
index 63ab033dfa..a4ff8f3adb 100755
--- a/test_unstructured_ingest/src/against-api.sh
+++ b/test_unstructured_ingest/src/against-api.sh
@@ -40,7 +40,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--output-dir "$OUTPUT_DIR" \
--verbose \
--num-processes "$max_processes" \
- --input-path "example-docs/$TEST_FILE_NAME" \
+ --input-path "example-docs/pdf/$TEST_FILE_NAME" \
--work-dir "$WORK_DIR"
RESULT_FILE_PATH="$OUTPUT_DIR/$TEST_FILE_NAME.json"
diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh
index de2253f4bb..c2ed84d0b4 100755
--- a/test_unstructured_ingest/src/kafka-local.sh
+++ b/test_unstructured_ingest/src/kafka-local.sh
@@ -45,13 +45,13 @@ echo "Sending test document (pdf)"
#Check the number of messages in destination topic
#Note we are calling it twice since this will hack our way into the topic being created (default kafka setting)
python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \
- --input-file "example-docs/fake-memo.pdf" \
+ --input-file "example-docs/pdf/fake-memo.pdf" \
--bootstrap-server localhost \
--topic "$KAFKA_TOPIC" \
--confluent false \
--port 29092
python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \
- --input-file "example-docs/fake-memo.pdf" \
+ --input-file "example-docs/pdf/fake-memo.pdf" \
--bootstrap-server localhost \
--topic "$KAFKA_TOPIC" \
--confluent false \
diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh
index 29dad44f21..452686eebf 100755
--- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh
+++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh
@@ -21,7 +21,7 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
# -- use absolute path of input file to verify passing an absolute path --
-ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/$EXAMPLE_DOC"
+ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/pdf/$EXAMPLE_DOC"
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# shellcheck disable=SC1091
diff --git a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py b/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py
index 103b489e96..bfa6281831 100644
--- a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py
+++ b/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py
@@ -22,7 +22,7 @@
"date_modified": "2023-10-25 10:05:44.976775",
"date_processed": "2023-12-14T17:06:33.074057",
"permissions_data": [{"mode": 33188}],
- "url": "example-docs/fake-memo.pdf",
+ "url": "example-docs/pdf/fake-memo.pdf",
},
"file_directory": "example-docs",
"filename": "fake-memo.pdf",
@@ -105,7 +105,7 @@ def test_conform_dict_1():
"date_modified": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775),
"date_processed": datetime.datetime(2023, 12, 14, 17, 6, 33, 74057),
"permissions_data": '[{"mode": 33188}]',
- "url": "example-docs/fake-memo.pdf",
+ "url": "example-docs/pdf/fake-memo.pdf",
"layout_height": 792,
"layout_width": 612,
"points": "[[72.0, 72.69200000000001], [72.0, 83.69200000000001],"