From 3bb0ee1e792a39575fb6ee0a0e7174a90e82f956 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Sun, 8 Sep 2024 14:25:52 -0700 Subject: [PATCH] chore: fix tests breaking on main (#3603) Fix API tests (really more like integration tests) that run only on main. Also use less compute-intensive files to speed up test time and remove a useless test. Tests in `test_unstructured/partition/test_api.py` pass, temporarily running outside of main per screenshot: ![image](https://github.com/user-attachments/assets/f15d440a-2574-40f2-98b4-adf57fbae704) https://github.com/Unstructured-IO/unstructured/actions/runs/10754098974/job/29824415513 --- test_unstructured/partition/test_api.py | 67 +++++++++++++------------ 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 270fbd34a0..3a36870bc7 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -18,6 +18,9 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve() +# NOTE(crag): point to freemium API for now +API_URL = "https://api.unstructured.io/general/v0/general" + is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"} skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main" @@ -105,20 +108,29 @@ def test_partition_via_api_raises_with_bad_response(request: FixtureRequest): @pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_with_no_strategy(): + test_file = example_doc_path("pdf/loremipsum-flat.pdf") elements_no_strategy = partition_via_api( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=test_file, strategy="auto", api_key=get_api_key(), # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + 
api_url=API_URL, skip_infer_table_types=["pdf"], ) elements_hi_res = partition_via_api( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=test_file, strategy="hi_res", api_key=get_api_key(), # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + api_url=API_URL, + skip_infer_table_types=["pdf"], + ) + elements_fast_res = partition_via_api( + filename=test_file, + strategy="fast", + api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url=API_URL, skip_infer_table_types=["pdf"], ) @@ -126,7 +138,11 @@ def test_partition_via_api_with_no_strategy(): # elements_hi_res[3].text = # 'LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis' # while elements_no_strategy[3].text = ']' (as of this writing) - assert elements_no_strategy[3].text != elements_hi_res[3].text + assert len(elements_no_strategy) == len(elements_hi_res) + assert len(elements_hi_res) != len(elements_fast_res) + + # NOTE(crag): slightly out scope assertion, but avoid extra API call + assert elements_hi_res[0].metadata.coordinates is None @pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI") @@ -134,41 +150,26 @@ def test_partition_via_api_with_no_strategy(): def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates(): # coordinates not included by default to limit payload size elements = partition_via_api( - filename=example_doc_path("layout-parser-paper-fast.pdf"), + filename=example_doc_path("pdf/fake-memo.pdf"), strategy="hi_res", coordinates="true", api_key=get_api_key(), - # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + api_url=API_URL, ) assert elements[0].metadata.coordinates is not None -@pytest.mark.skipif(not is_in_ci, reason="Skipping test run 
outside of CI") -@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") -def test_partition_via_api_valid_request_data_kwargs(): - elements = partition_via_api( - filename=example_doc_path("layout-parser-paper-fast.pdf"), - strategy="fast", - api_key=get_api_key(), - # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", - ) - - assert isinstance(elements, list) - - @pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_image_block_extraction(): elements = partition_via_api( - filename=example_doc_path("embedded-images-tables.pdf"), + filename=example_doc_path("pdf/embedded-images-tables.pdf"), strategy="hi_res", extract_image_block_types=["image", "table"], api_key=get_api_key(), # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + api_url=API_URL, ) image_elements = [el for el in elements if el.category == ElementType.IMAGE] for el in image_elements: @@ -357,18 +358,20 @@ def get_api_key(): @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_multiple_via_api_valid_request_data_kwargs(): filenames = [ - example_doc_path("layout-parser-paper-fast.pdf"), - example_doc_path("layout-parser-paper-fast.jpg"), + example_doc_path("fake-text.txt"), + example_doc_path("fake-email.txt"), ] - elements = partition_multiple_via_api( + list_of_lists_of_elements = partition_multiple_via_api( filenames=filenames, - strategy="auto", + strategy="fast", api_key=get_api_key(), - # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + api_url=API_URL, ) - assert 
isinstance(elements, list) + # assert there is a list of elements for each file + assert len(list_of_lists_of_elements) == 2 + assert isinstance(list_of_lists_of_elements[0], list) + assert isinstance(list_of_lists_of_elements[1], list) @pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI") @@ -383,7 +386,7 @@ def test_partition_multiple_via_api_invalid_request_data_kwargs(): strategy="not_a_strategy", api_key=get_api_key(), # The url has changed since the 06/24 API release while the sdk defaults to the old url - api_url="https://api.unstructuredapp.io/general/v0/general", + api_url=API_URL, )