Skip to content

Commit

Permalink
Add end2end test
Browse files (browse the repository at this point in the history)
  • Loading branch information
plutasnyy committed Nov 26, 2024
1 parent b5e8b5c commit 4de546d
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
</header>
</div>
</body>
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
(
"html_files/example_with_alternative_text.html",
"unstructured_json_output/example_with_alternative_text.json",
),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
(
"html_files/example_with_inline_fields.html",
Expand All @@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
html_file_path = Path(__file__).parent / html_file_path
json_file_path = Path(__file__).parent / json_file_path

expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text()

predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True
)

assert len(expected_json_elements) == len(predicted_elements)

for i in range(len(expected_json_elements)):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
[
{
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"metadata": {
"category_depth": 1,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
},
"text": "New York logo",
"type": "Image"
},
{
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
},
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
"type": "Image"
}
]

0 comments on commit 4de546d

Please sign in to comment.