From 4de546d58d1ba5c0cf4816bc4014c1ef9449dd9d Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 26 Nov 2024 13:50:49 +0100 Subject: [PATCH] Add end2end test --- .../example_with_alternative_text.html | 8 +++ .../test_ontology_to_unstructured_parsing.py | 6 +- .../example_with_alternative_text.json | 62 +++++++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 test_unstructured/documents/html_files/example_with_alternative_text.html create mode 100644 test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json diff --git a/test_unstructured/documents/html_files/example_with_alternative_text.html b/test_unstructured/documents/html_files/example_with_alternative_text.html new file mode 100644 index 0000000000..64cbf2f605 --- /dev/null +++ b/test_unstructured/documents/html_files/example_with_alternative_text.html @@ -0,0 +1,8 @@ + +
+
+ + A line graph showing the comparison of 5 year cumulative total return for stocks +
+
+ diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 72dec7d02a..f489be0363 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): [ ("html_files/example.html", "unstructured_json_output/example.json"), ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), + ( + "html_files/example_with_alternative_text.html", + "unstructured_json_output/example_with_alternative_text.json", + ), ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), ( "html_files/example_with_inline_fields.html", @@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): html_file_path = Path(__file__).parent / html_file_path json_file_path = Path(__file__).parent / json_file_path - expected_json_elements = elements_from_json(str(json_file_path)) html_code = html_file_path.read_text() predicted_elements = partition_html( text=html_code, html_parser_version="v2", unique_element_ids=True ) + assert len(expected_json_elements) == len(predicted_elements) for i in range(len(expected_json_elements)): diff --git a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json new file mode 100644 index 0000000000..f6c32707ea --- /dev/null +++ b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json @@ -0,0 +1,62 @@ +[ + { + "element_id": "3a6b156a81764e17be128264241f8136", + "metadata": { + "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "897a8a47377c4ad6aab839a929879537", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "metadata": { + "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "3a6b156a81764e17be128264241f8136", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f", + "metadata": { + "category_depth": 2, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "text_as_html": "\"New" + }, + "text": "New York logo", + "type": "Image" + }, + { + "element_id": "40c32fd8-9a02-42b8-a587-884293881090", + "metadata": { + "category_depth": 2, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "text_as_html": "\"A" + }, + "text": "A line graph showing the comparison of 5 year cumulative total return for stocks", + "type": "Image" + } +] \ No newline at end of file