diff --git a/CHANGELOG.md b/CHANGELOG.md index 0630abda79..94a6ff8bc1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,6 @@ ### Fixes * **Remedy error on Windows when `nltk` binaries are downloaded.** Work around a quirk in the Windows implementation of `tempfile.NamedTemporaryFile` where accessing the temporary file by name raises `PermissionError`. - * **Move Astra embedded_dimension to write config** ## 0.14.10 diff --git a/example-docs/DA-1p.heic b/example-docs/img/DA-1p.heic similarity index 100% rename from example-docs/DA-1p.heic rename to example-docs/img/DA-1p.heic diff --git a/example-docs/DA-1p.jpg b/example-docs/img/DA-1p.jpg similarity index 100% rename from example-docs/DA-1p.jpg rename to example-docs/img/DA-1p.jpg diff --git a/example-docs/DA-1p.png b/example-docs/img/DA-1p.png similarity index 100% rename from example-docs/DA-1p.png rename to example-docs/img/DA-1p.png diff --git a/example-docs/bmp_24.bmp b/example-docs/img/bmp_24.bmp similarity index 100% rename from example-docs/bmp_24.bmp rename to example-docs/img/bmp_24.bmp diff --git a/example-docs/chi_sim_image.jpeg b/example-docs/img/chi_sim_image.jpeg similarity index 100% rename from example-docs/chi_sim_image.jpeg rename to example-docs/img/chi_sim_image.jpeg diff --git a/example-docs/double-column-A.jpg b/example-docs/img/double-column-A.jpg similarity index 100% rename from example-docs/double-column-A.jpg rename to example-docs/img/double-column-A.jpg diff --git a/example-docs/double-column-B.jpg b/example-docs/img/double-column-B.jpg similarity index 100% rename from example-docs/double-column-B.jpg rename to example-docs/img/double-column-B.jpg diff --git a/example-docs/embedded-images-tables.jpg b/example-docs/img/embedded-images-tables.jpg similarity index 100% rename from example-docs/embedded-images-tables.jpg rename to example-docs/img/embedded-images-tables.jpg diff --git a/example-docs/english-and-korean.png b/example-docs/img/english-and-korean.png similarity index 100% rename from example-docs/english-and-korean.png rename to example-docs/img/english-and-korean.png diff --git a/example-docs/example.jpg b/example-docs/img/example.jpg similarity index 100% rename from example-docs/example.jpg rename to example-docs/img/example.jpg diff --git a/example-docs/jpn-vert.jpeg b/example-docs/img/jpn-vert.jpeg similarity index 100% rename from example-docs/jpn-vert.jpeg rename to example-docs/img/jpn-vert.jpeg diff --git a/example-docs/layout-parser-paper-10p.jpg b/example-docs/img/layout-parser-paper-10p.jpg similarity index 100% rename from example-docs/layout-parser-paper-10p.jpg rename to example-docs/img/layout-parser-paper-10p.jpg diff --git a/example-docs/layout-parser-paper-combined.tiff b/example-docs/img/layout-parser-paper-combined.tiff similarity index 100% rename from example-docs/layout-parser-paper-combined.tiff rename to example-docs/img/layout-parser-paper-combined.tiff diff --git a/example-docs/layout-parser-paper-fast.jpg b/example-docs/img/layout-parser-paper-fast.jpg similarity index 100% rename from example-docs/layout-parser-paper-fast.jpg rename to example-docs/img/layout-parser-paper-fast.jpg diff --git a/example-docs/layout-parser-paper-fast.tiff b/example-docs/img/layout-parser-paper-fast.tiff similarity index 100% rename from example-docs/layout-parser-paper-fast.tiff rename to example-docs/img/layout-parser-paper-fast.tiff diff --git a/example-docs/layout-parser-paper-with-table.jpg b/example-docs/img/layout-parser-paper-with-table.jpg similarity index 100% rename from example-docs/layout-parser-paper-with-table.jpg rename to example-docs/img/layout-parser-paper-with-table.jpg diff --git a/example-docs/table-multi-row-column-cells.png b/example-docs/img/table-multi-row-column-cells.png similarity index 100% rename from example-docs/table-multi-row-column-cells.png rename to example-docs/img/table-multi-row-column-cells.png diff --git a/example-docs/DA-1p.pdf b/example-docs/pdf/DA-1p.pdf similarity index 100% rename from example-docs/DA-1p.pdf rename to example-docs/pdf/DA-1p.pdf diff --git a/example-docs/DA-619p.pdf b/example-docs/pdf/DA-619p.pdf similarity index 100% rename from example-docs/DA-619p.pdf rename to example-docs/pdf/DA-619p.pdf diff --git a/example-docs/a1977-backus-p21.pdf b/example-docs/pdf/a1977-backus-p21.pdf similarity index 100% rename from example-docs/a1977-backus-p21.pdf rename to example-docs/pdf/a1977-backus-p21.pdf diff --git a/example-docs/all-number-table.pdf b/example-docs/pdf/all-number-table.pdf similarity index 100% rename from example-docs/all-number-table.pdf rename to example-docs/pdf/all-number-table.pdf diff --git a/example-docs/chevron-page.pdf b/example-docs/pdf/chevron-page.pdf similarity index 100% rename from example-docs/chevron-page.pdf rename to example-docs/pdf/chevron-page.pdf diff --git a/example-docs/copy-protected.pdf b/example-docs/pdf/copy-protected.pdf similarity index 100% rename from example-docs/copy-protected.pdf rename to example-docs/pdf/copy-protected.pdf diff --git a/example-docs/embedded-images-tables.pdf b/example-docs/pdf/embedded-images-tables.pdf similarity index 100% rename from example-docs/embedded-images-tables.pdf rename to example-docs/pdf/embedded-images-tables.pdf diff --git a/example-docs/embedded-images.pdf b/example-docs/pdf/embedded-images.pdf similarity index 100% rename from example-docs/embedded-images.pdf rename to example-docs/pdf/embedded-images.pdf diff --git a/example-docs/embedded-link.pdf b/example-docs/pdf/embedded-link.pdf similarity index 100% rename from example-docs/embedded-link.pdf rename to example-docs/pdf/embedded-link.pdf diff --git a/example-docs/emphasis-text.pdf b/example-docs/pdf/emphasis-text.pdf similarity index 100% rename from example-docs/emphasis-text.pdf rename to example-docs/pdf/emphasis-text.pdf diff --git a/example-docs/failure-after-repair.pdf b/example-docs/pdf/failure-after-repair.pdf similarity index 100% rename from example-docs/failure-after-repair.pdf rename to example-docs/pdf/failure-after-repair.pdf diff --git a/example-docs/fake-memo-with-duplicate-page.pdf b/example-docs/pdf/fake-memo-with-duplicate-page.pdf similarity index 100% rename from example-docs/fake-memo-with-duplicate-page.pdf rename to example-docs/pdf/fake-memo-with-duplicate-page.pdf diff --git a/example-docs/fake-memo.pdf b/example-docs/pdf/fake-memo.pdf similarity index 100% rename from example-docs/fake-memo.pdf rename to example-docs/pdf/fake-memo.pdf diff --git a/example-docs/header-test-doc.pdf b/example-docs/pdf/header-test-doc.pdf similarity index 100% rename from example-docs/header-test-doc.pdf rename to example-docs/pdf/header-test-doc.pdf diff --git a/example-docs/interface-config-guide-p93.pdf b/example-docs/pdf/interface-config-guide-p93.pdf similarity index 100% rename from example-docs/interface-config-guide-p93.pdf rename to example-docs/pdf/interface-config-guide-p93.pdf diff --git a/example-docs/invalid-pdf-structure-pdfminer-entire-doc.pdf b/example-docs/pdf/invalid-pdf-structure-pdfminer-entire-doc.pdf similarity index 100% rename from example-docs/invalid-pdf-structure-pdfminer-entire-doc.pdf rename to example-docs/pdf/invalid-pdf-structure-pdfminer-entire-doc.pdf diff --git a/example-docs/invalid-pdf-structure-pdfminer-one-page.pdf b/example-docs/pdf/invalid-pdf-structure-pdfminer-one-page.pdf similarity index 100% rename from example-docs/invalid-pdf-structure-pdfminer-one-page.pdf rename to example-docs/pdf/invalid-pdf-structure-pdfminer-one-page.pdf diff --git a/example-docs/korean-text-with-tables.pdf b/example-docs/pdf/korean-text-with-tables.pdf similarity index 100% rename from example-docs/korean-text-with-tables.pdf rename to example-docs/pdf/korean-text-with-tables.pdf diff --git a/example-docs/layout-parser-paper-fast.pdf b/example-docs/pdf/layout-parser-paper-fast.pdf similarity index 100% rename from example-docs/layout-parser-paper-fast.pdf rename to example-docs/pdf/layout-parser-paper-fast.pdf diff --git a/example-docs/layout-parser-paper-with-empty-pages.pdf b/example-docs/pdf/layout-parser-paper-with-empty-pages.pdf similarity index 100% rename from example-docs/layout-parser-paper-with-empty-pages.pdf rename to example-docs/pdf/layout-parser-paper-with-empty-pages.pdf diff --git a/example-docs/layout-parser-paper-with-table.pdf b/example-docs/pdf/layout-parser-paper-with-table.pdf similarity index 100% rename from example-docs/layout-parser-paper-with-table.pdf rename to example-docs/pdf/layout-parser-paper-with-table.pdf diff --git a/example-docs/layout-parser-paper.pdf b/example-docs/pdf/layout-parser-paper.pdf similarity index 100% rename from example-docs/layout-parser-paper.pdf rename to example-docs/pdf/layout-parser-paper.pdf diff --git a/example-docs/list-item-example.pdf b/example-docs/pdf/list-item-example.pdf similarity index 100% rename from example-docs/list-item-example.pdf rename to example-docs/pdf/list-item-example.pdf diff --git a/example-docs/loremipsum-flat.pdf b/example-docs/pdf/loremipsum-flat.pdf similarity index 100% rename from example-docs/loremipsum-flat.pdf rename to example-docs/pdf/loremipsum-flat.pdf diff --git a/example-docs/multi-column-2p.pdf b/example-docs/pdf/multi-column-2p.pdf similarity index 100% rename from example-docs/multi-column-2p.pdf rename to example-docs/pdf/multi-column-2p.pdf diff --git a/example-docs/multi-column.pdf b/example-docs/pdf/multi-column.pdf similarity index 100% rename from example-docs/multi-column.pdf rename to example-docs/pdf/multi-column.pdf diff --git a/example-docs/negative-coords.pdf b/example-docs/pdf/negative-coords.pdf similarity index 100% rename from example-docs/negative-coords.pdf rename to example-docs/pdf/negative-coords.pdf diff --git a/example-docs/pdf-bad-color-space.pdf b/example-docs/pdf/pdf-bad-color-space.pdf similarity index 100% rename from example-docs/pdf-bad-color-space.pdf rename to example-docs/pdf/pdf-bad-color-space.pdf diff --git a/example-docs/pdf2image-memory-error-test-400p.pdf b/example-docs/pdf/pdf2image-memory-error-test-400p.pdf similarity index 100% rename from example-docs/pdf2image-memory-error-test-400p.pdf rename to example-docs/pdf/pdf2image-memory-error-test-400p.pdf diff --git a/example-docs/reliance.pdf b/example-docs/pdf/reliance.pdf similarity index 100% rename from example-docs/reliance.pdf rename to example-docs/pdf/reliance.pdf diff --git a/example-docs/table-multi-row-column-cells.pdf b/example-docs/pdf/table-multi-row-column-cells.pdf similarity index 100% rename from example-docs/table-multi-row-column-cells.pdf rename to example-docs/pdf/table-multi-row-column-cells.pdf diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index af18577225..545f441a0d 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -99,9 +99,9 @@ def test_detect_filetype_from_filename_with_extension( @pytest.mark.parametrize( ("file_name", "expected_value"), [ - ("layout-parser-paper-fast.pdf", [FileType.PDF]), + ("pdf/layout-parser-paper-fast.pdf", [FileType.PDF]), ("fake.docx", [FileType.DOCX]), - ("example.jpg", [FileType.JPG]), + ("img/example.jpg", [FileType.JPG]), ("fake-text.txt", [FileType.TXT]), ("eml/fake-email.eml", [FileType.EML]), ("factbook.xml", [FileType.XML]), @@ -424,7 +424,7 @@ def test_detect_BMP_from_file_path(): def test_detect_BMP_from_file_no_extension(): - with open(example_doc_path("bmp_24.bmp"), "rb") as f: + with open(example_doc_path("img/bmp_24.bmp"), "rb") as f: file = io.BytesIO(f.read()) assert detect_filetype(file=file) == FileType.BMP diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py index 4239ab436b..99ee2356b6 100644 --- a/test_unstructured/file_utils/test_metadata.py +++ b/test_unstructured/file_utils/test_metadata.py @@ -7,9 +7,10 @@ import pytest import unstructured.file_utils.metadata as meta +from test_unstructured.unit_utils import example_doc_path DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg") +EXAMPLE_JPG_FILENAME = example_doc_path("img/example.jpg") def test_get_docx_metadata_from_filename(tmpdir): diff --git a/test_unstructured/metrics/test_table_structure.py b/test_unstructured/metrics/test_table_structure.py index 3c684be5a4..dc564443cb 100644 --- a/test_unstructured/metrics/test_table_structure.py +++ b/test_unstructured/metrics/test_table_structure.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from test_unstructured.unit_utils import example_doc_path from unstructured.metrics.table.table_alignment import TableAlignment from unstructured.metrics.table.table_eval import TableEvalProcessor from unstructured.metrics.table_structure import ( @@ -14,8 +15,8 @@ @pytest.mark.parametrize( "filename", [ - "example-docs/table-multi-row-column-cells.png", - "example-docs/table-multi-row-column-cells.pdf", + example_doc_path("img/table-multi-row-column-cells.png"), + example_doc_path("pdf/table-multi-row-column-cells.pdf"), ], ) def test_image_or_pdf_to_dataframe(filename): @@ -25,8 +26,8 @@ def test_image_or_pdf_to_dataframe(filename): def test_eval_table_transformer_for_file(): score = eval_table_transformer_for_file( - "example-docs/table-multi-row-column-cells.png", - "example-docs/table-multi-row-column-cells-actual.csv", + example_doc_path("img/table-multi-row-column-cells.png"), + example_doc_path("table-multi-row-column-cells-actual.csv"), ) # avoid severe degradation of performance assert 0.8 < score < 1 diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py index 81f421159c..03cc610ea7 100644 --- a/test_unstructured/partition/pdf_image/test_chipper.py +++ b/test_unstructured/partition/pdf_image/test_chipper.py @@ -1,5 +1,6 @@ import pytest +from test_unstructured.unit_utils import example_doc_path from unstructured.partition import pdf from unstructured.partition.utils.constants import PartitionStrategy @@ -7,7 +8,7 @@ @pytest.fixture(scope="session") def chipper_results(): elements = pdf.partition_pdf( - "example-docs/layout-parser-paper-fast.pdf", + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES, model_name="chipper", ) diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 55018cb3a8..270b38c5ea 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -91,7 +91,7 @@ def pages(self): @pytest.mark.parametrize( ("filename", "file"), [ - ("example-docs/example.jpg", None), + (example_doc_path("img/example.jpg"), None), (None, b"0000"), ], ) @@ -132,7 +132,7 @@ def test_partition_image_local_raises_with_no_filename(): def test_partition_image_with_auto_strategy( - filename="example-docs/layout-parser-paper-fast.jpg", + filename=example_doc_path("img/layout-parser-paper-fast.jpg"), ): elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO) titles = [ @@ -146,7 +146,7 @@ def test_partition_image_with_auto_strategy( def test_partition_image_with_table_extraction( - filename="example-docs/layout-parser-paper-with-table.jpg", + filename=example_doc_path("img/layout-parser-paper-with-table.jpg"), ): elements = image.partition_image( filename=filename, @@ -160,7 +160,7 @@ def test_partition_image_with_table_extraction( def test_partition_image_with_multipage_tiff( - filename="example-docs/layout-parser-paper-combined.tiff", + filename=example_doc_path("img/layout-parser-paper-combined.tiff"), ): elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO) assert elements[-1].metadata.page_number == 2 @@ -168,7 +168,7 @@ def test_partition_image_with_multipage_tiff( def test_partition_image_with_bmp( tmpdir, - filename="example-docs/layout-parser-paper-with-table.jpg", + filename=example_doc_path("img/layout-parser-paper-with-table.jpg"), ): bmp_filename = os.path.join(tmpdir.dirname, "example.bmp") img = Image.open(filename) @@ -185,7 +185,7 @@ def test_partition_image_with_bmp( assert "" in table[0] -def test_partition_image_with_language_passed(filename="example-docs/example.jpg"): +def test_partition_image_with_language_passed(filename=example_doc_path("img/example.jpg")): with mock.patch.object( ocr, "process_file_with_ocr", @@ -201,7 +201,7 @@ def test_partition_image_with_language_passed(filename="example-docs/example.jpg def test_partition_image_from_file_with_language_passed( - filename="example-docs/example.jpg", + filename=example_doc_path("img/example.jpg"), ): with mock.patch.object( ocr, @@ -216,7 +216,7 @@ def test_partition_image_from_file_with_language_passed( # NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086 @pytest.mark.skip(reason="Current catching too many tesseract errors") def test_partition_image_raises_with_invalid_language( - filename="example-docs/example.jpg", + filename=example_doc_path("img/example.jpg"), ): with pytest.raises(TesseractError): image.partition_image( @@ -227,21 +227,14 @@ def test_partition_image_raises_with_invalid_language( @pytest.mark.parametrize( - ("strategy"), + "strategy", [ - (PartitionStrategy.HI_RES), - (PartitionStrategy.OCR_ONLY), + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, ], ) def test_partition_image_strategies_keep_languages_metadata(strategy): - filename = os.path.join( - DIRECTORY, - "..", - "..", - "..", - "example-docs", - "english-and-korean.png", - ) + filename = example_doc_path("img/english-and-korean.png") elements = image.partition_image( filename=filename, languages=["eng", "kor"], @@ -252,14 +245,7 @@ def test_partition_image_strategies_keep_languages_metadata(strategy): def test_partition_image_with_ocr_detects_korean(): - filename = os.path.join( - DIRECTORY, - "..", - "..", - "..", - "example-docs", - "english-and-korean.png", - ) + filename = example_doc_path("img/english-and-korean.png") elements = image.partition_image( filename=filename, ocr_languages="eng+kor", @@ -271,7 +257,7 @@ def test_partition_image_with_ocr_detects_korean(): def test_partition_image_with_ocr_detects_korean_from_file(): - filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png") + filename = example_doc_path("img/english-and-korean.png") with open(filename, "rb") as f: elements = image.partition_image( file=f, @@ -284,27 +270,13 @@ def test_partition_image_with_ocr_detects_korean_from_file(): def test_partition_image_raises_with_bad_strategy(): - filename = os.path.join( - DIRECTORY, - "..", - "..", - "..", - "example-docs", - "english-and-korean.png", - ) + filename = example_doc_path("img/english-and-korean.png") with pytest.raises(ValueError): image.partition_image(filename=filename, strategy="fakeroo") def test_partition_image_default_strategy_hi_res(): - filename = os.path.join( - DIRECTORY, - "..", - "..", - "..", - "example-docs", - "layout-parser-paper-fast.jpg", - ) + filename = example_doc_path("img/layout-parser-paper-fast.jpg") with open(filename, "rb") as f: elements = image.partition_image(file=f) @@ -324,7 +296,7 @@ def test_partition_image_default_strategy_hi_res(): def test_partition_image_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -338,7 +310,7 @@ def test_partition_image_metadata_date( def test_partition_image_with_hi_res_strategy_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -352,7 +324,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date( def test_partition_image_metadata_date_custom_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2009-07-05T09:24:28" @@ -371,7 +343,7 @@ def test_partition_image_metadata_date_custom_metadata_date( def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2009-07-05T09:24:28" @@ -391,7 +363,7 @@ def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date def test_partition_image_from_file_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -406,7 +378,7 @@ def test_partition_image_from_file_metadata_date( def test_partition_image_from_file_explicit_get_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -421,7 +393,7 @@ def test_partition_image_from_file_explicit_get_metadata_date( def test_partition_image_from_file_with_hi_res_strategy_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -437,7 +409,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date( def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" mocker.patch( @@ -455,7 +427,7 @@ def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_da def test_partition_image_from_file_metadata_date_custom_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2009-07-05T09:24:28" @@ -475,7 +447,7 @@ def test_partition_image_from_file_metadata_date_custom_metadata_date( def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date( mocker, - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = "2009-07-05T09:24:28" @@ -495,7 +467,7 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met def test_partition_image_from_file_without_metadata_date( - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): """Test partition_image() with file that are not possible to get last modified date""" with open(filename, "rb") as f: @@ -509,14 +481,14 @@ def test_partition_image_from_file_without_metadata_date( def test_partition_msg_with_json(): elements = image.partition_image( - example_doc_path("layout-parser-paper-fast.jpg"), + example_doc_path("img/layout-parser-paper-fast.jpg"), strategy=PartitionStrategy.AUTO, ) assert_round_trips_through_JSON(elements) def test_partition_image_with_ocr_has_coordinates_from_filename( - filename="example-docs/english-and-korean.png", + filename=example_doc_path("img/english-and-korean.png"), ): elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY) int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points] @@ -524,11 +496,11 @@ def test_partition_image_with_ocr_has_coordinates_from_filename( @pytest.mark.parametrize( - ("filename"), + "filename", [ - ("example-docs/layout-parser-paper-with-table.jpg"), - ("example-docs/english-and-korean.png"), - ("example-docs/layout-parser-paper-fast.jpg"), + "img/layout-parser-paper-with-table.jpg", + "img/english-and-korean.png", + "img/layout-parser-paper-fast.jpg", ], ) def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename( @@ -536,7 +508,9 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename( ): import math - elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY) + elements = image.partition_image( + filename=example_doc_path(filename), strategy=PartitionStrategy.OCR_ONLY + ) for element in elements: # TODO (jennings) One or multiple elements is an empty string # without coordinates. This should be fixed in a new issue @@ -548,7 +522,7 @@ def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename( def test_partition_image_formats_languages_for_tesseract(): - filename = "example-docs/jpn-vert.jpeg" + filename = example_doc_path("img/jpn-vert.jpeg") with mock.patch( "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_ocr: @@ -561,13 +535,13 @@ def test_partition_image_formats_languages_for_tesseract(): def test_partition_image_warns_with_ocr_languages(caplog): - filename = "example-docs/layout-parser-paper-fast.jpg" + filename = example_doc_path("img/layout-parser-paper-fast.jpg") image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng") assert "The ocr_languages kwarg will be deprecated" in caplog.text def test_add_chunking_strategy_on_partition_image( - filename="example-docs/layout-parser-paper-fast.jpg", + filename=example_doc_path("img/layout-parser-paper-fast.jpg"), ): elements = image.partition_image(filename=filename) chunk_elements = image.partition_image(filename, chunking_strategy="by_title") @@ -577,7 +551,7 @@ def test_add_chunking_strategy_on_partition_image( def test_add_chunking_strategy_on_partition_image_hi_res( - filename="example-docs/layout-parser-paper-with-table.jpg", + filename=example_doc_path("img/layout-parser-paper-with-table.jpg"), ): elements = image.partition_image( filename=filename, @@ -600,7 +574,9 @@ def test_partition_image_uses_model_name(): pdf, "_partition_pdf_or_image_local", ) as mockpartition: - image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test") + image.partition_image( + example_doc_path("img/layout-parser-paper-fast.jpg"), model_name="test" + ) print(mockpartition.call_args) assert "model_name" in mockpartition.call_args.kwargs assert mockpartition.call_args.kwargs["model_name"] @@ -611,7 +587,9 @@ def test_partition_image_uses_hi_res_model_name(): pdf, "_partition_pdf_or_image_local", ) as mockpartition: - image.partition_image("example-docs/layout-parser-paper-fast.jpg", hi_res_model_name="test") + image.partition_image( + example_doc_path("img/layout-parser-paper-fast.jpg"), hi_res_model_name="test" + ) print(mockpartition.call_args) assert "model_name" not in mockpartition.call_args.kwargs assert "hi_res_model_name" in mockpartition.call_args.kwargs @@ -626,7 +604,7 @@ def test_partition_image_uses_hi_res_model_name(): ], ) def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element): - filename = "example-docs/layout-parser-paper-fast.jpg" + filename = example_doc_path("img/layout-parser-paper-fast.jpg") elements = image.partition_image( filename=filename, ocr_mode=ocr_mode, strategy=PartitionStrategy.HI_RES ) @@ -635,7 +613,7 @@ def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element): def test_partition_image_hi_res_invalid_ocr_mode(): - filename = "example-docs/layout-parser-paper-fast.jpg" + filename = example_doc_path("img/layout-parser-paper-fast.jpg") with pytest.raises(ValueError): _ = image.partition_image( filename=filename, ocr_mode="invalid_ocr_mode", strategy=PartitionStrategy.HI_RES @@ -643,14 +621,14 @@ def test_partition_image_hi_res_invalid_ocr_mode(): @pytest.mark.parametrize( - ("ocr_mode"), + "ocr_mode", [ - ("entire_page"), - ("individual_blocks"), + "entire_page", + "individual_blocks", ], ) def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode): - filename = "example-docs/layout-parser-paper-with-table.jpg" + filename = example_doc_path("img/layout-parser-paper-with-table.jpg") elements = image.partition_image( filename=filename, ocr_mode=ocr_mode, @@ -665,8 +643,8 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode): assert "Layouts of scanned modern magazines and scientific reports" in table[0] -def test_partition_image_raises_TypeError_for_invalid_languages(): - filename = "example-docs/layout-parser-paper-fast.jpg" +def test_partition_image_raises_type_error_for_invalid_languages(): + filename = example_doc_path("img/layout-parser-paper-fast.jpg") with pytest.raises(TypeError): image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng") @@ -683,7 +661,6 @@ def inference_results(): def test_partition_image_has_filename(inference_results): - doc_path = "example-docs" filename = "layout-parser-paper-fast.jpg" # Mock inference call with known return results with mock.patch( @@ -691,7 +668,7 @@ def test_partition_image_has_filename(inference_results): return_value=inference_results, ) as mock_inference_func: elements = image.partition_image( - filename=os.path.join(doc_path, filename), + filename=example_doc_path(f"img/{filename}"), strategy=PartitionStrategy.HI_RES, ) # Make sure we actually went down the path we expect. @@ -710,7 +687,7 @@ def test_partition_image_has_filename(inference_results): def test_partition_image_element_extraction( file_mode, extract_image_block_to_payload, - filename=example_doc_path("embedded-images-tables.jpg"), + filename=example_doc_path("img/embedded-images-tables.jpg"), ): extract_image_block_types = ["Image", "Table"] @@ -737,7 +714,7 @@ def test_partition_image_element_extraction( def test_partition_image_works_on_heic_file( - filename="example-docs/DA-1p.heic", + filename=example_doc_path("img/DA-1p.heic"), ): elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO) titles = [el.text for el in elements if el.category == ElementType.TITLE] @@ -745,17 +722,17 @@ def test_partition_image_works_on_heic_file( @pytest.mark.parametrize( - ("strategy"), + "strategy", [PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY], ) def test_deterministic_element_ids(strategy: str): elements_1 = image.partition_image( - example_doc_path("layout-parser-paper-with-table.jpg"), + example_doc_path("img/layout-parser-paper-with-table.jpg"), strategy=strategy, starting_page_number=2, ) elements_2 = image.partition_image( - example_doc_path("layout-parser-paper-with-table.jpg"), + example_doc_path("img/layout-parser-paper-with-table.jpg"), strategy=strategy, starting_page_number=2, ) @@ -765,9 +742,9 @@ def test_deterministic_element_ids(strategy: str): assert ids_1 == ids_2 -def test_multipage_tiff_starts_on_starting_page_number(): +def test_multi_page_tiff_starts_on_starting_page_number(): elements = image.partition_image( - example_doc_path("layout-parser-paper-combined.tiff"), + example_doc_path("img/layout-parser-paper-combined.tiff"), starting_page_number=2, ) pages = {element.metadata.page_number for element in elements} diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 7301fb45b2..f6d233701b 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -105,7 +105,7 @@ def pages(self): @pytest.mark.parametrize( ("filename", "file"), [ - (example_doc_path("layout-parser-paper-fast.pdf"), None), + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), None), (None, b"0000"), ], ) @@ -168,7 +168,7 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values( starting_page_number, expected_page_numbers, origin, - filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"), ): # Test that the partition_pdf function can handle filename def _test(result): @@ -204,7 +204,7 @@ def _test(result): @mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"}) def test_partition_pdf_with_model_name_env_var( monkeypatch, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) with mock.patch.object( @@ -220,7 +220,7 @@ def test_partition_pdf_with_model_name_env_var( def test_partition_pdf_with_model_name( monkeypatch, model_name, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) with mock.patch.object( @@ -251,7 +251,7 @@ def test_partition_pdf_with_model_name( def test_partition_pdf_with_hi_res_model_name( monkeypatch, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) with mock.patch.object( @@ -268,7 +268,7 @@ def test_partition_pdf_with_hi_res_model_name( def test_partition_pdf_or_image_with_hi_res_model_name( monkeypatch, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) with mock.patch.object( @@ -284,7 +284,7 @@ def test_partition_pdf_or_image_with_hi_res_model_name( def test_partition_pdf_with_auto_strategy( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" @@ -294,21 +294,21 @@ def test_partition_pdf_with_auto_strategy( def test_partition_pdf_with_page_breaks( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True) assert "PageBreak" in [elem.category for elem in elements] def test_partition_pdf_with_no_page_breaks( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf(filename=filename, url=None) assert "PageBreak" not in [elem.category for elem in elements] def test_partition_pdf_with_fast_strategy( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf( filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3 @@ -321,7 +321,7 @@ def test_partition_pdf_with_fast_strategy( def test_partition_pdf_with_fast_neg_coordinates(): - filename = example_doc_path("negative-coords.pdf") + filename = example_doc_path("pdf/negative-coords.pdf") elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST) assert len(elements) == 5 assert elements[0].metadata.coordinates.points[0][0] < 0 @@ -329,7 +329,7 @@ def test_partition_pdf_with_fast_neg_coordinates(): def test_partition_pdf_with_fast_groups_text( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST) @@ -345,7 +345,7 @@ def test_partition_pdf_with_fast_groups_text( def test_partition_pdf_with_fast_strategy_from_file( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): with open(filename, "rb") as f: elements = pdf.partition_pdf(file=f, url=None, strategy=PartitionStrategy.FAST) @@ -354,7 +354,7 @@ def test_partition_pdf_with_fast_strategy_from_file( def test_partition_pdf_with_fast_strategy_and_page_breaks( caplog, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf( filename=filename, @@ -371,7 +371,7 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks( def test_partition_pdf_raises_with_bad_strategy( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): with pytest.raises(ValueError): pdf.partition_pdf(filename=filename, url=None, strategy="made_up") @@ -380,7 +380,7 @@ def test_partition_pdf_raises_with_bad_strategy( def test_partition_pdf_falls_back_to_fast( monkeypatch, caplog, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): return dep not in ["unstructured_inference", "pytesseract"] @@ -402,7 +402,7 @@ def mock_exists(dep): def test_partition_pdf_falls_back_to_fast_from_ocr_only( monkeypatch, caplog, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): return dep not in ["pytesseract"] @@ -428,7 +428,7 @@ def mock_exists(dep): def test_partition_pdf_falls_back_to_hi_res_from_ocr_only( monkeypatch, caplog, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): return dep not in ["pytesseract"] @@ -451,7 +451,7 @@ def mock_exists(dep): def test_partition_pdf_falls_back_to_ocr_only( monkeypatch, caplog, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): return dep not in ["unstructured_inference"] @@ -471,7 +471,7 @@ def mock_exists(dep): def test_partition_pdf_uses_table_extraction(): - filename = example_doc_path("layout-parser-paper-fast.pdf") + filename = example_doc_path("pdf/layout-parser-paper-fast.pdf") with mock.patch( "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_model: @@ -480,14 +480,14 @@ def test_partition_pdf_uses_table_extraction(): @pytest.mark.parametrize( - ("ocr_mode"), + "ocr_mode", [ - ("entire_page"), - ("individual_blocks"), + "entire_page", + "individual_blocks", ], ) def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode): - filename = example_doc_path("korean-text-with-tables.pdf") + filename = example_doc_path("pdf/korean-text-with-tables.pdf") elements = pdf.partition_pdf( filename=filename, ocr_mode=ocr_mode, @@ -506,15 +506,15 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode): @pytest.mark.parametrize( - ("strategy"), + "strategy", [ - (PartitionStrategy.FAST), - (PartitionStrategy.HI_RES), - (PartitionStrategy.OCR_ONLY), + PartitionStrategy.FAST, + PartitionStrategy.HI_RES, + PartitionStrategy.OCR_ONLY, ], ) def test_partition_pdf_strategies_keep_languages_metadata(strategy): - filename = example_doc_path("korean-text-with-tables.pdf") + filename = example_doc_path("pdf/korean-text-with-tables.pdf") elements = pdf.partition_pdf( filename=filename, languages=["kor"], @@ -531,7 +531,7 @@ def test_partition_pdf_strategies_keep_languages_metadata(strategy): ], ) def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): - filename = example_doc_path("layout-parser-paper.pdf") + filename = example_doc_path("pdf/layout-parser-paper.pdf") elements = pdf.partition_pdf( filename=filename, ocr_mode=ocr_mode, @@ -548,7 +548,7 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): def test_partition_pdf_with_copy_protection(): - filename = os.path.join("example-docs", "copy-protected.pdf") + filename = example_doc_path("pdf/copy-protected.pdf") elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES) title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" idx = 2 @@ -559,13 +559,13 @@ def test_partition_pdf_with_copy_protection(): def test_partition_pdf_with_dpi(): - filename = os.path.join("example-docs", "copy-protected.pdf") + filename = example_doc_path("pdf/copy-protected.pdf") with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process: pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, pdf_image_dpi=100) assert mock_process.call_args[1]["pdf_image_dpi"] == 100 -def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("reliance.pdf")): +def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("pdf/reliance.pdf")): elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST) assert len(elements) > 50 assert elements[0].metadata.page_number == 1 @@ -573,14 +573,14 @@ def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path(" def test_partition_pdf_text_not_extractable(): - filename = example_doc_path("loremipsum-flat.pdf") + filename = example_doc_path("pdf/loremipsum-flat.pdf") elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST) assert len(elements) == 0 def test_partition_pdf_fails_if_pdf_not_processable( monkeypatch, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): def mock_exists(dep): return dep not in ["unstructured_inference", "pytesseract"] @@ -593,7 +593,7 @@ def mock_exists(dep): def test_partition_pdf_fast_groups_text_in_text_box(): - filename = os.path.join("example-docs", "chevron-page.pdf") + filename = example_doc_path("pdf/chevron-page.pdf") elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST) expected_coordinate_points_0 = ( (193.1741, 71.94000000000005), @@ -633,7 +633,7 @@ def test_partition_pdf_fast_groups_text_in_text_box(): def test_partition_pdf_with_metadata_filename( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf( filename=filename, @@ -646,7 +646,7 @@ def test_partition_pdf_with_metadata_filename( def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): with open(filename, "rb") as f: elements = pdf.partition_pdf( @@ -672,7 +672,7 @@ def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename( def test_partition_pdf_exclude_metadata( file_mode, strategy, - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): if file_mode == "filename": elements = pdf.partition_pdf( @@ -711,7 +711,7 @@ def test_partition_pdf_metadata_date( strategy, last_modification_date, date_from_file_object, - filename=example_doc_path("copy-protected.pdf"), + filename=example_doc_path("pdf/copy-protected.pdf"), ): mocked_last_modification_date = "2029-07-05T09:24:28" expected_last_modification_date = ( @@ -762,14 +762,14 @@ def test_partition_pdf_metadata_date( @pytest.mark.parametrize("strategy", [PartitionStrategy.FAST, PartitionStrategy.HI_RES]) def test_partition_pdf_with_json(strategy: str): elements = pdf.partition_pdf( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=strategy, ) assert_round_trips_through_JSON(elements) def test_add_chunking_strategy_by_title_on_partition_pdf( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), ): elements = pdf.partition_pdf(filename=filename) chunk_elements = pdf.partition_pdf(filename, chunking_strategy="by_title") @@ -779,14 +779,14 @@ def test_add_chunking_strategy_by_title_on_partition_pdf( def test_partition_pdf_formats_languages_for_tesseract(): - filename = example_doc_path("DA-1p.pdf") + filename = example_doc_path("pdf/DA-1p.pdf") with mock.patch.object(ocr, "process_file_with_ocr", mock.MagicMock()) as mock_process: pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages=["en"]) assert mock_process.call_args[1]["ocr_languages"] == "eng" def test_partition_pdf_warns_with_ocr_languages(caplog): - filename = example_doc_path("chevron-page.pdf") + filename = example_doc_path("pdf/chevron-page.pdf") pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng") assert "The ocr_languages kwarg will be deprecated" in caplog.text @@ -795,7 +795,7 @@ def test_partition_categorization_backup(): text = "This is Clearly a Title" with mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=[Text(text)]): elements = pdf.partition_pdf_or_image( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES, ) # Should have changed the element class from Text to Title @@ -805,7 +805,7 @@ def test_partition_categorization_backup(): @pytest.mark.parametrize( "filename", - [example_doc_path("layout-parser-paper-fast.pdf")], + [example_doc_path("pdf/layout-parser-paper-fast.pdf")], ) def test_combine_numbered_list(filename): elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) @@ -823,7 +823,7 @@ def test_combine_numbered_list(filename): @pytest.mark.parametrize( "filename", - [example_doc_path("layout-parser-paper-fast.pdf")], + [example_doc_path("pdf/layout-parser-paper-fast.pdf")], ) def test_partition_pdf_hyperlinks(filename): elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) @@ -849,7 +849,7 @@ def test_partition_pdf_hyperlinks(filename): @pytest.mark.parametrize( "filename", - [example_doc_path("embedded-link.pdf")], + [example_doc_path("pdf/embedded-link.pdf")], ) def test_partition_pdf_hyperlinks_multiple_lines(filename): elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) @@ -863,7 +863,7 @@ def test_partition_pdf_uses_model_name(): "_partition_pdf_or_image_local", ) as mockpartition: pdf.partition_pdf( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), model_name="test", strategy=PartitionStrategy.HI_RES, ) @@ -879,7 +879,7 @@ def test_partition_pdf_uses_hi_res_model_name(): "_partition_pdf_or_image_local", ) as mockpartition: pdf.partition_pdf( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), hi_res_model_name="test", strategy=PartitionStrategy.HI_RES, ) @@ -890,7 +890,7 @@ def test_partition_pdf_uses_hi_res_model_name(): def test_partition_pdf_word_bbox_not_char( - filename=example_doc_path("interface-config-guide-p93.pdf"), + filename=example_doc_path("pdf/interface-config-guide-p93.pdf"), ): try: elements = pdf.partition_pdf(filename=filename, strategy="fast") @@ -900,7 +900,7 @@ def test_partition_pdf_word_bbox_not_char( def test_partition_pdf_fast_no_mapping_errors( - filename=example_doc_path("a1977-backus-p21.pdf"), + filename=example_doc_path("pdf/a1977-backus-p21.pdf"), ): """Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940, failing to map old parent_id's to new""" @@ -908,7 +908,7 @@ def test_partition_pdf_fast_no_mapping_errors( def test_partition_pdf_raises_TypeError_for_invalid_languages(): - filename = example_doc_path("chevron-page.pdf") + filename = example_doc_path("pdf/chevron-page.pdf") with pytest.raises(TypeError): pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng") @@ -948,7 +948,7 @@ def test_default_hi_res_model(env, expected, monkeypatch): def test_partition_model_name_default_to_None(): - filename = example_doc_path("DA-1p.pdf") + filename = example_doc_path("pdf/DA-1p.pdf") try: pdf.partition_pdf( filename=filename, @@ -961,7 +961,7 @@ def test_partition_model_name_default_to_None(): def test_partition_hi_res_model_name_default_to_None(): - filename = example_doc_path("DA-1p.pdf") + filename = example_doc_path("pdf/DA-1p.pdf") try: pdf.partition_pdf( filename=filename, @@ -998,7 +998,7 @@ class CallException(Exception): # Patch the ocr function with the mock that will record the call and then terminate with mock.patch(ocr_func, mock_ocr_func), pytest.raises(CallException): pdf.partition_pdf( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=strategy, ocr_languages="kor", ) @@ -1089,8 +1089,8 @@ def test_get_uris_from_annots_string_annotation( @pytest.mark.parametrize( ("filename", "is_image"), [ - (example_doc_path("layout-parser-paper-fast.pdf"), False), - (example_doc_path("layout-parser-paper-fast.jpg"), True), + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), + (example_doc_path("img/layout-parser-paper-fast.jpg"), True), ], ) def test_partition_pdf_with_ocr_only_strategy( @@ -1148,18 +1148,20 @@ def test_partition_pdf_with_ocr_only_strategy( def test_partition_pdf_with_all_number_table_and_ocr_only_strategy(): # AttributeError was previously being raised when partitioning documents that contained only # numerical values with `strategy=PartitionStrategy.OCR_ONLY` - filename = example_doc_path("all-number-table.pdf") + filename = example_doc_path("pdf/all-number-table.pdf") assert pdf.partition_pdf(filename, strategy=PartitionStrategy.OCR_ONLY) # As of pdfminer 221105, this pdf throws an error and requires a workaround # See #2059 def test_partition_pdf_with_bad_color_profile(): - filename = example_doc_path("pdf-bad-color-space.pdf") + filename = example_doc_path("pdf/pdf-bad-color-space.pdf") assert pdf.partition_pdf(filename, strategy="fast") -def test_partition_pdf_with_fast_finds_headers_footers(filename="example-docs/header-test-doc.pdf"): +def test_partition_pdf_with_fast_finds_headers_footers( + filename=example_doc_path("pdf/header-test-doc.pdf"), +): elements = pdf.partition_pdf(filename, strategy="fast") assert isinstance(elements[0], Header) assert isinstance(elements[-1], Footer) @@ -1180,7 +1182,7 @@ def test_partition_pdf_with_fast_finds_headers_footers(filename="example-docs/he ) def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog): caplog.set_level(logging.INFO) - assert pdf.extractable_elements(filename=example_doc_path(filename)) + assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}")) assert expected_log in caplog.text @@ -1222,7 +1224,7 @@ def assert_element_extraction( def test_partition_pdf_element_extraction( file_mode, extract_image_block_to_payload, - filename=example_doc_path("embedded-images-tables.pdf"), + filename=example_doc_path("pdf/embedded-images-tables.pdf"), ): extract_image_block_types = ["Image", "Table"] @@ -1253,7 +1255,7 @@ def test_partition_pdf_element_extraction( def test_partition_pdf_always_keep_all_image_elements( - filename=example_doc_path("embedded-images.pdf"), + filename=example_doc_path("pdf/embedded-images.pdf"), ): elements = pdf.partition_pdf( filename=filename, @@ -1325,7 +1327,9 @@ def expected_ids(request): ) def test_unique_and_deterministic_element_ids(strategy, expected_ids): elements = pdf.partition_pdf( - "example-docs/fake-memo-with-duplicate-page.pdf", strategy=strategy, starting_page_number=2 + example_doc_path("pdf/fake-memo-with-duplicate-page.pdf"), + strategy=strategy, + starting_page_number=2, ) ids = [element.id for element in elements] assert ids == expected_ids, "Element IDs do not match expected IDs" @@ -1333,7 +1337,7 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids): def test_analysis_artifacts_saved(): with tempfile.TemporaryDirectory() as temp_dir: - filename = example_doc_path("layout-parser-paper-fast.pdf") + filename = example_doc_path("pdf/layout-parser-paper-fast.pdf") pdf.partition_pdf( filename=filename, strategy=PartitionStrategy.HI_RES, diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index f3a66c9e49..29582007bc 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -36,7 +36,7 @@ def test_write_image(image_type): @pytest.mark.parametrize("file_mode", ["filename", "rb"]) @pytest.mark.parametrize("path_only", [True, False]) def test_convert_pdf_to_image( - file_mode, path_only, filename=example_doc_path("embedded-images.pdf") + file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf") ): with tempfile.TemporaryDirectory() as tmpdir: if file_mode == "filename": @@ -71,8 +71,8 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i @pytest.mark.parametrize( ("filename", "is_image"), [ - (example_doc_path("layout-parser-paper-fast.pdf"), False), - (example_doc_path("layout-parser-paper-fast.jpg"), True), + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), + (example_doc_path("img/layout-parser-paper-fast.jpg"), True), ], ) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) @@ -301,8 +301,8 @@ def test_annotate_layout_elements_with_image(): @pytest.mark.parametrize( ("filename", "is_image"), [ - (example_doc_path("layout-parser-paper-fast.pdf"), False), - (example_doc_path("layout-parser-paper-fast.jpg"), True), + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), + (example_doc_path("img/layout-parser-paper-fast.jpg"), True), ], ) def test_annotate_layout_elements(filename, is_image): diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 85be972102..a9e5956dad 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -374,8 +374,8 @@ def test_partition_multiple_via_api_valid_request_data_kwargs(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") def test_partition_multiple_via_api_invalid_request_data_kwargs(): filenames = [ - example_doc_path("layout-parser-paper-fast.pdf"), - example_doc_path("layout-parser-paper-fast.jpg"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), + example_doc_path("img/layout-parser-paper-fast.jpg"), ] with pytest.raises(ValueError): partition_multiple_via_api( diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 7663d84ab3..ef10b9ede7 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -306,7 +306,7 @@ def test_auto_partition_html_pre_from_file(): [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content_type: str | None): - file_path = example_doc_path("layout-parser-paper-fast.jpg") + file_path = example_doc_path("img/layout-parser-paper-fast.jpg") metadata_filename = file_path if pass_metadata_filename else None elements = partition( @@ -328,7 +328,7 @@ def test_auto_partition_jpeg_from_filename(pass_metadata_filename: bool, content [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], ) def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_type: str | None): - file_path = example_doc_path("layout-parser-paper-fast.jpg") + file_path = example_doc_path("img/layout-parser-paper-fast.jpg") metadata_filename = file_path if pass_metadata_filename else None with open(file_path, "rb") as f: @@ -348,7 +348,7 @@ def test_auto_partition_jpeg_from_file(pass_metadata_filename: bool, content_typ def test_auto_partition_bmp_from_filename(tmp_path: pathlib.Path): bmp_filename = str(tmp_path / "example.bmp") - with Image.open(example_doc_path("layout-parser-paper-with-table.jpg")) as img: + with Image.open(example_doc_path("img/layout-parser-paper-with-table.jpg")) as img: img.save(bmp_filename) elements = partition(filename=bmp_filename, strategy=PartitionStrategy.HI_RES) @@ -365,7 +365,7 @@ def test_auto_partition_image_element_extraction(extract_image_block_to_payload: with tempfile.TemporaryDirectory() as tmpdir: elements = partition( - filename=example_doc_path("embedded-images-tables.jpg"), + filename=example_doc_path("img/embedded-images-tables.jpg"), extract_image_block_types=extract_image_block_types, extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_output_dir=tmpdir, @@ -516,7 +516,7 @@ def test_auto_partition_org_from_file(): [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_type: str | None): - file_path = example_doc_path("layout-parser-paper-fast.pdf") + file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf") metadata_filename = file_path if pass_metadata_filename else None elements = partition( @@ -547,7 +547,7 @@ def test_auto_partition_pdf_from_filename(pass_metadata_filename: bool, content_ [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)], ) def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type: str | None): - file_path = example_doc_path("layout-parser-paper-fast.pdf") + file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf") metadata_filename = file_path if pass_metadata_filename else None with open(file_path, "rb") as f: @@ -571,7 +571,7 @@ def test_auto_partition_pdf_from_file(pass_metadata_filename: bool, content_type def test_auto_partition_pdf_with_fast_strategy(monkeypatch: MonkeyPatch): - file_path = example_doc_path("layout-parser-paper-fast.pdf") + file_path = example_doc_path("pdf/layout-parser-paper-fast.pdf") mock_return = [NarrativeText("Hello there!")] with patch.object(auto, "partition_pdf", return_value=mock_return) as mock_partition: @@ -603,7 +603,7 @@ def test_auto_partition_pdf_uses_pdf_infer_table_structure_argument(): "unstructured.partition.pdf_image.ocr.process_file_with_ocr", ) as mock_process_file_with_model: partition( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES, ) @@ -616,7 +616,7 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b with tempfile.TemporaryDirectory() as tmpdir: elements = partition( - example_doc_path("embedded-images-tables.pdf"), + example_doc_path("pdf/embedded-images-tables.pdf"), extract_image_block_types=extract_image_block_types, extract_image_block_to_payload=extract_image_block_to_payload, extract_image_block_output_dir=tmpdir, @@ -635,7 +635,7 @@ def test_partition_pdf_does_not_raise_warning(): with warnings.catch_warnings(): warnings.simplefilter("error") partition( - example_doc_path("layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES + example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES ) @@ -1056,7 +1056,7 @@ def test_auto_partition_respects_language_arg(file_extension: str): def test_auto_partition_forwards_include_page_breaks_to_partition_pdf(): elements = partition( - example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("pdf/layout-parser-paper-fast.pdf"), include_page_breaks=True, strategy=PartitionStrategy.HI_RES, ) @@ -1102,7 +1102,9 @@ def test_auto_partition_image_formats_languages_for_tesseract(request: FixtureRe ) partition( - example_doc_path("chi_sim_image.jpeg"), strategy=PartitionStrategy.HI_RES, languages=["zh"] + example_doc_path("img/chi_sim_image.jpeg"), + strategy=PartitionStrategy.HI_RES, + languages=["zh"], ) call_kwargs = process_file_with_ocr_.call_args_list[0][1] @@ -1124,7 +1126,9 @@ def test_auto_partition_ignores_empty_string_for_ocr_languages( def test_auto_partition_warns_with_ocr_languages(caplog: LogCaptureFixture): partition( - example_doc_path("chevron-page.pdf"), strategy=PartitionStrategy.HI_RES, ocr_languages="eng" + example_doc_path("pdf/chevron-page.pdf"), + strategy=PartitionStrategy.HI_RES, + ocr_languages="eng", ) assert caplog.records[0].levelname == "WARNING" @@ -1186,7 +1190,7 @@ def test_auto_partition_adds_filetype_to_metadata( monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", mock_partition_with_extras_map) elements = partition( - example_doc_path("layout-parser-paper-fast.pdf"), content_type=content_type + example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type ) assert len(elements) == 2 @@ -1213,7 +1217,9 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti ) monkeypatch.setattr(auto, "PARTITION_WITH_EXTRAS_MAP", {"pdf": partition_pdf_}) - elements = partition("example-docs/layout-parser-paper-fast.pdf", content_type=content_type) + elements = partition( + example_doc_path("pdf/layout-parser-paper-fast.pdf"), content_type=content_type + ) assert len(elements) == 2 assert all(e.metadata.filetype == "application/pdf" for e in elements) @@ -1238,7 +1244,8 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: # -- partition the first example-doc with the extension for this filetype -- elements: list[Element] = [] - for file in pathlib.Path(example_doc_path("")).iterdir(): + doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("") + for file in pathlib.Path(doc_path).iterdir(): if file.is_file() and file.suffix == f".{extension}": elements = partition_fn(str(file)) break @@ -1256,7 +1263,7 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: def test_auto_partition_passes_user_provided_languages_arg_to_PDF(): elements = partition( - example_doc_path("chevron-page.pdf"), + example_doc_path("pdf/chevron-page.pdf"), strategy=PartitionStrategy.OCR_ONLY, languages=["eng"], ) diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 4131d62a89..ebe12c0bee 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -562,7 +562,7 @@ def test_document_to_element_list_sets_category_depth_titles(): def test_ocr_data_to_elements( - filename="example-docs/layout-parser-paper-fast.jpg", + filename=example_doc_path("img/layout-parser-paper-fast.jpg"), ): text_regions = [ TextRegion.from_coords( diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py index 9c66076eab..de284e84f0 100644 --- a/test_unstructured/partition/test_strategies.py +++ b/test_unstructured/partition/test_strategies.py @@ -1,7 +1,6 @@ -import os - import pytest +from test_unstructured.unit_utils import example_doc_path from unstructured.documents.elements import Text from unstructured.partition import pdf, strategies from unstructured.partition.utils.constants import PartitionStrategy @@ -43,7 +42,7 @@ def test_validate_strategy_raises_for_bad_strategy(): ], ) def test_is_pdf_text_extractable(filename, from_file, expected): - filename = os.path.join("example-docs", filename) + filename = example_doc_path(f"pdf/{filename}") if from_file: with open(filename, "rb") as f: diff --git a/test_unstructured_ingest/dest/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh index 2d2c64db26..8b534939f3 100755 --- a/test_unstructured_ingest/dest/azure-cognitive-search.sh +++ b/test_unstructured_ingest/dest/azure-cognitive-search.sh @@ -78,7 +78,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --chunking-strategy by_title \ --chunk-combine-text-under-n-chars 150 \ diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh index 293324e2d9..208b4a5a4b 100755 --- a/test_unstructured_ingest/dest/azure.sh +++ b/test_unstructured_ingest/dest/azure.sh @@ -43,7 +43,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ azure \ --overwrite \ diff --git a/test_unstructured_ingest/dest/box.sh b/test_unstructured_ingest/dest/box.sh index 0e36d1b87f..37ad702dd5 100755 --- a/test_unstructured_ingest/dest/box.sh +++ b/test_unstructured_ingest/dest/box.sh @@ -45,7 +45,7 @@ # --strategy fast \ # --verbose \ # --reprocess \ -# --input-path example-docs/fake-memo.pdf \ +# --input-path example-docs/pdf/fake-memo.pdf \ # --work-dir "$WORK_DIR" \ # box \ # --box-app-config "$BOX_APP_CONFIG_PATH" \ diff --git a/test_unstructured_ingest/dest/databricks-volumes.sh b/test_unstructured_ingest/dest/databricks-volumes.sh index c97289c5fc..6cf6e38a24 100755 --- a/test_unstructured_ingest/dest/databricks-volumes.sh +++ b/test_unstructured_ingest/dest/databricks-volumes.sh @@ -44,7 +44,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --strategy fast \ --verbose \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ databricks-volumes \ --host "$DATABRICKS_HOST" \ diff --git a/test_unstructured_ingest/dest/delta-table.sh b/test_unstructured_ingest/dest/delta-table.sh index d639d804b9..cf54e10546 100755 --- a/test_unstructured_ingest/dest/delta-table.sh +++ b/test_unstructured_ingest/dest/delta-table.sh @@ -38,7 +38,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ delta-table \ --table-uri "$DESTINATION_TABLE" diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh index b2f36a1868..52ade67223 100755 --- a/test_unstructured_ingest/dest/dropbox.sh +++ b/test_unstructured_ingest/dest/dropbox.sh @@ -62,7 +62,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ dropbox \ --token "$DROPBOX_ACCESS_TOKEN" \ diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh index 3099dc31e4..21571a9373 100755 --- a/test_unstructured_ingest/dest/gcs.sh +++ b/test_unstructured_ingest/dest/gcs.sh @@ -47,7 +47,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ gcs \ --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ diff --git a/test_unstructured_ingest/dest/kafka-local.sh b/test_unstructured_ingest/dest/kafka-local.sh index 2e71b7484d..9086687ed2 100755 --- a/test_unstructured_ingest/dest/kafka-local.sh +++ b/test_unstructured_ingest/dest/kafka-local.sh @@ -42,7 +42,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/layout-parser-paper.pdf \ + --input-path example-docs/pdf/layout-parser-paper.pdf \ --work-dir "$WORK_DIR" \ --chunking-strategy basic \ --chunk-combine-text-under-n-chars 200 \ diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh index aa28090d3e..6c90c53fe4 100755 --- a/test_unstructured_ingest/dest/mongodb.sh +++ b/test_unstructured_ingest/dest/mongodb.sh @@ -54,7 +54,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --embedding-provider "langchain-huggingface" \ mongodb \ diff --git a/test_unstructured_ingest/dest/opensearch.sh b/test_unstructured_ingest/dest/opensearch.sh index db64f3ff39..003e4f2868 100755 --- a/test_unstructured_ingest/dest/opensearch.sh +++ b/test_unstructured_ingest/dest/opensearch.sh @@ -41,7 +41,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --embedding-provider "langchain-huggingface" \ opensearch \ diff --git a/test_unstructured_ingest/dest/pgvector.sh b/test_unstructured_ingest/dest/pgvector.sh index 66f6aa5bd7..25836cf1dc 100755 --- a/test_unstructured_ingest/dest/pgvector.sh +++ b/test_unstructured_ingest/dest/pgvector.sh @@ -40,7 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --output-dir "$OUTPUT_DIR" \ --strategy fast \ --verbose \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --embedding-provider "langchain-huggingface" \ sql \ diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh index b992ebf7ad..b8d0b901e2 100755 --- a/test_unstructured_ingest/dest/s3.sh +++ b/test_unstructured_ingest/dest/s3.sh @@ -37,7 +37,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ s3 \ --key "$S3_INGEST_TEST_ACCESS_KEY" \ diff --git a/test_unstructured_ingest/dest/singlestore.sh b/test_unstructured_ingest/dest/singlestore.sh index 1816a0e0e9..a04f81370c 100755 --- a/test_unstructured_ingest/dest/singlestore.sh +++ b/test_unstructured_ingest/dest/singlestore.sh @@ -46,7 +46,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --embedding-provider "langchain-huggingface" \ singlestore \ diff --git a/test_unstructured_ingest/dest/sqlite.sh b/test_unstructured_ingest/dest/sqlite.sh index c289bf4218..9cd54b35e7 100755 --- a/test_unstructured_ingest/dest/sqlite.sh +++ b/test_unstructured_ingest/dest/sqlite.sh @@ -42,7 +42,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ sql \ --db-type "$DATABASE_TYPE" \ diff --git a/test_unstructured_ingest/dest/weaviate.sh b/test_unstructured_ingest/dest/weaviate.sh index 68c4953d39..7dfa3281a5 100755 --- a/test_unstructured_ingest/dest/weaviate.sh +++ b/test_unstructured_ingest/dest/weaviate.sh @@ -40,7 +40,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --strategy fast \ --verbose \ --reprocess \ - --input-path example-docs/fake-memo.pdf \ + --input-path example-docs/pdf/fake-memo.pdf \ --work-dir "$WORK_DIR" \ --embedding-provider "langchain-huggingface" \ weaviate \ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index e89d8787e0..b07103abf1 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -6,7 +6,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -28,7 +28,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -50,7 +50,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -72,7 +72,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -94,7 +94,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -116,7 +116,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { @@ -138,7 +138,7 @@ "metadata": { "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/example-docs/multi-column-2p.pdf" + "path": "/home/runner/work/unstructured/unstructured/example-docs/pdf/multi-column-2p.pdf" }, "permissions_data": [ { diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index 63ab033dfa..a4ff8f3adb 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -40,7 +40,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --output-dir "$OUTPUT_DIR" \ --verbose \ --num-processes "$max_processes" \ - --input-path "example-docs/$TEST_FILE_NAME" \ + --input-path "example-docs/pdf/$TEST_FILE_NAME" \ --work-dir "$WORK_DIR" RESULT_FILE_PATH="$OUTPUT_DIR/$TEST_FILE_NAME.json" diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh index de2253f4bb..c2ed84d0b4 100755 --- a/test_unstructured_ingest/src/kafka-local.sh +++ b/test_unstructured_ingest/src/kafka-local.sh @@ -45,13 +45,13 @@ echo "Sending test document (pdf)" #Check the number of messages in destination topic #Note we are calling it twice since this will hack our way into the topic being created (default kafka setting) python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \ - --input-file "example-docs/fake-memo.pdf" \ + --input-file "example-docs/pdf/fake-memo.pdf" \ --bootstrap-server localhost \ --topic "$KAFKA_TOPIC" \ --confluent false \ --port 29092 python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \ - --input-file "example-docs/fake-memo.pdf" \ + --input-file "example-docs/pdf/fake-memo.pdf" \ --bootstrap-server localhost \ --topic "$KAFKA_TOPIC" \ --confluent false \ diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh index 29dad44f21..452686eebf 100755 --- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh @@ -21,7 +21,7 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME # -- use absolute path of input file to verify passing an absolute path -- -ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/$EXAMPLE_DOC" +ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/pdf/$EXAMPLE_DOC" max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} # shellcheck disable=SC1091 diff --git a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py b/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py index 103b489e96..bfa6281831 100644 --- a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py +++ b/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py @@ -22,7 +22,7 @@ "date_modified": "2023-10-25 10:05:44.976775", "date_processed": "2023-12-14T17:06:33.074057", "permissions_data": [{"mode": 33188}], - "url": "example-docs/fake-memo.pdf", + "url": "example-docs/pdf/fake-memo.pdf", }, "file_directory": "example-docs", "filename": "fake-memo.pdf", @@ -105,7 +105,7 @@ def test_conform_dict_1(): "date_modified": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775), "date_processed": datetime.datetime(2023, 12, 14, 17, 6, 33, 74057), "permissions_data": '[{"mode": 33188}]', - "url": "example-docs/fake-memo.pdf", + "url": "example-docs/pdf/fake-memo.pdf", "layout_height": 792, "layout_width": 612, "points": "[[72.0, 72.69200000000001], [72.0, 83.69200000000001],"