diff --git a/CHANGELOG.md b/CHANGELOG.md index a4018c1b25..31bea43ac2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.16.6-dev0 +## 0.16.6-dev1 ### Enhancements - **Every tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents. +- **Every HTML tag has a default ontology class assigned** When parsing HTML to ontology, each HTML tag defined in the Ontology has a default ontology class assigned. This way it is possible to assign an ontology class instead of UncategorizedText when the HTML tag is predicted correctly but without a class assigned. ### Features diff --git a/test_unstructured/documents/html_files/three_tables.html b/test_unstructured/documents/html_files/three_tables.html new file mode 100644 index 0000000000..94b715be30 --- /dev/null +++ b/test_unstructured/documents/html_files/three_tables.html @@ -0,0 +1,149 @@ + +
+ + + + + + + + + + + + +
+ Header 1 + + Header 2 +
+ Row 1, Cell 1 + + Row 1, Cell 2 +
+ Row 2, Cell 1 + + Row 2, Cell 2 +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Big Table Header +
+ Merged Cell 1 + + Cell 2 + + Cell 3 +
+ Merged Cell 4 and 5 +
+ Cell 6 + + Cell 7 + + Cell 8 +
+ Cell 9 + + A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. +
+ Cell 10 + + Cell 11 + + Cell 12 +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Chapter + + Title + + Page +
+ 1 + + Introduction + + 1 +
+ 2 + + Getting Started + + 5 +
+ 3 + + Basic Concepts + + 12 +
+ 4 + + Advanced Topics + + 25 +
+ 5 + + Conclusion + + 40 +
+ diff --git a/test_unstructured/documents/test_mappings.py b/test_unstructured/documents/test_mappings.py index 029bb544fb..0d39000d81 100644 --- a/test_unstructured/documents/test_mappings.py +++ b/test_unstructured/documents/test_mappings.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Dict, Type +from typing import Type from unstructured.documents import elements, ontology from unstructured.documents.mappings import ( @@ -11,27 +11,20 @@ from unstructured.documents.ontology import OntologyElement -def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]: - """ - Get a mapping of HTML tags to their exclusive OntologyElement types. - """ - html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list) - for element_type in ALL_ONTOLOGY_ELEMENT_TYPES: - for tag in element_type().allowed_tags: - html_tag_to_element_type_mappings[tag].append(element_type) +def test_if_all_html_tags_have_default_ontology_type(): + html_tag_to_possible_ontology_classes: dict[str, list[Type[ontology.OntologyElement]]] = ( + defaultdict(list) + ) - return { - tag: element_types[0] - for tag, element_types in html_tag_to_element_type_mappings.items() - if len(element_types) == 1 - } + for ontology_class in ALL_ONTOLOGY_ELEMENT_TYPES: + for tag in ontology_class().allowed_tags: + html_tag_to_possible_ontology_classes[tag].append(ontology_class) - -def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements(): - exclusive_html_tags = _get_exclusive_html_tags() - for expected_tag, expected_element_type in exclusive_html_tags.items(): - assert expected_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP - assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[expected_tag] == expected_element_type + for html_tag, possible_ontology_classes in html_tag_to_possible_ontology_classes.items(): + assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP + assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] in possible_ontology_classes + [ + 
ontology.UncategorizedText + ] # In some cases it is better to use unknown type than assign incorrect type def test_all_expected_ontology_types_are_subclasses_of_OntologyElement(): diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index c69f49f2ca..72dec7d02a 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -181,6 +181,7 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): [ ("html_files/example.html", "unstructured_json_output/example.json"), ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), + ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), ( "html_files/example_with_inline_fields.html", "unstructured_json_output/example_with_inline_fields.json", diff --git a/test_unstructured/documents/unstructured_json_output/three_tables.json b/test_unstructured/documents/unstructured_json_output/three_tables.json new file mode 100644 index 0000000000..9594b451d3 --- /dev/null +++ b/test_unstructured/documents/unstructured_json_output/three_tables.json @@ -0,0 +1,44 @@ +[ + { + "element_id": "2428404551304d4db5925f6afee11ed5", + "metadata": { + "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], + "parent_id": "517f8559ba594270bdd67e1b02bf19a2", + "text_as_html": "
Header 1Header 2
Row 1, Cell 1Row 1, Cell 2
Row 2, Cell 1Row 2, Cell 2
" + }, + "text": "Header 1 Header 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2", + "type": "Table" + }, + { + "element_id": "9f91cae321c74b31bb1c83ac86cd7afb", + "metadata": { + "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], + "parent_id": "517f8559ba594270bdd67e1b02bf19a2", + "text_as_html": "
Big Table Header
Merged Cell 1Cell 2Cell 3
Merged Cell 4 and 5
Cell 6Cell 7Cell 8
Cell 9A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Cell 10Cell 11Cell 12
" + }, + "text": "Big Table Header Merged Cell 1 Cell 2 Cell 3 Merged Cell 4 and 5 Cell 6 Cell 7 Cell 8 Cell 9 A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Cell 10 Cell 11 Cell 12", + "type": "Table" + }, + { + "element_id": "da6c34391e544b3480e45d68f40870fa", + "metadata": { + "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], + "parent_id": "517f8559ba594270bdd67e1b02bf19a2", + "text_as_html": "
ChapterTitlePage
1Introduction1
2Getting Started5
3Basic Concepts12
4Advanced Topics25
5Conclusion40
" + }, + "text": "Chapter Title Page 1 Introduction 1 2 Getting Started 5 3 Basic Concepts 12 4 Advanced Topics 25 5 Conclusion 40", + "type": "Table" + } +] \ No newline at end of file diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index a15e5cb502..8ab31db9cc 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -310,8 +310,7 @@ def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mappi