Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Define default HTML to ontology mapping #3784

Merged
merged 5 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## 0.16.6-dev0
## 0.16.6-dev1

### Enhancements
- **Every <table> tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.
- **Every HTML tag has a default ontology class assigned** When parsing HTML to ontology, each HTML tag defined in the ontology now has a default ontology class assigned. This makes it possible to assign an ontology class instead of UncategorizedText when the HTML tag is predicted correctly but has no class assigned.

### Features

Expand Down
149 changes: 149 additions & 0 deletions test_unstructured/documents/html_files/three_tables.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
<body class="Document" id="517f8559ba594270bdd67e1b02bf19a2">
<table class="Table" id="2428404551304d4db5925f6afee11ed5">
<tr>
<th>
Header 1
</th>
<th>
Header 2
</th>
</tr>
<tr>
<td>
Row 1, Cell 1
</td>
<td>
Row 1, Cell 2
</td>
</tr>
<tr>
<td>
Row 2, Cell 1
</td>
<td>
Row 2, Cell 2
</td>
</tr>
</table>
<table id="9f91cae321c74b31bb1c83ac86cd7afb">
<tr>
<th colspan="3">
Big Table Header
</th>
</tr>
<tr>
<td rowspan="2">
Merged Cell 1
</td>
<td>
Cell 2
</td>
<td>
Cell 3
</td>
</tr>
<tr>
<td colspan="2">
Merged Cell 4 and 5
</td>
</tr>
<tr>
<td>
Cell 6
</td>
<td>
Cell 7
</td>
<td>
Cell 8
</td>
</tr>
<tr>
<td>
Cell 9
</td>
<td colspan="2">
A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
</td>
</tr>
<tr>
<td>
Cell 10
</td>
<td>
Cell 11
</td>
<td>
Cell 12
</td>
</tr>
</table>
<table class="TableOfContents" id="da6c34391e544b3480e45d68f40870fa">
<tr>
<th>
Chapter
</th>
<th>
Title
</th>
<th>
Page
</th>
</tr>
<tr>
<td>
1
</td>
<td>
Introduction
</td>
<td>
1
</td>
</tr>
<tr>
<td>
2
</td>
<td>
Getting Started
</td>
<td>
5
</td>
</tr>
<tr>
<td>
3
</td>
<td>
Basic Concepts
</td>
<td>
12
</td>
</tr>
<tr>
<td>
4
</td>
<td>
Advanced Topics
</td>
<td>
25
</td>
</tr>
<tr>
<td>
5
</td>
<td>
Conclusion
</td>
<td>
40
</td>
</tr>
</table>
</body>
33 changes: 13 additions & 20 deletions test_unstructured/documents/test_mappings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import defaultdict
from typing import Dict, Type
from typing import Type

from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
Expand All @@ -11,27 +11,20 @@
from unstructured.documents.ontology import OntologyElement


def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
"""
Get a mapping of HTML tags to their exclusive OntologyElement types.
"""
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
for tag in element_type().allowed_tags:
html_tag_to_element_type_mappings[tag].append(element_type)
def test_if_all_html_tags_have_default_ontology_type():
html_tag_to_possible_ontology_classes: dict[str, list[Type[ontology.OntologyElement]]] = (
defaultdict(list)
)

return {
tag: element_types[0]
for tag, element_types in html_tag_to_element_type_mappings.items()
if len(element_types) == 1
}
for ontology_class in ALL_ONTOLOGY_ELEMENT_TYPES:
for tag in ontology_class().allowed_tags:
html_tag_to_possible_ontology_classes[tag].append(ontology_class)


def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
exclusive_html_tags = _get_exclusive_html_tags()
for expected_tag, expected_element_type in exclusive_html_tags.items():
assert expected_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[expected_tag] == expected_element_type
for html_tag, possible_ontology_classes in html_tag_to_possible_ontology_classes.items():
assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag] in possible_ontology_classes + [
ontology.UncategorizedText
] # In some cases it is better to use unknown type than assign incorrect type


def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
(
"html_files/example_with_inline_fields.html",
"unstructured_json_output/example_with_inline_fields.json",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
[
{
"element_id": "2428404551304d4db5925f6afee11ed5",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"Table\" id=\"2428404551304d4db5925f6afee11ed5\"> <tr> <th>Header 1</th><th>Header 2</th></tr><tr> <td>Row 1, Cell 1</td><td>Row 1, Cell 2</td></tr><tr> <td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr></table>"
},
"text": "Header 1 Header 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2",
"type": "Table"
},
{
"element_id": "9f91cae321c74b31bb1c83ac86cd7afb",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"Table\" id=\"9f91cae321c74b31bb1c83ac86cd7afb\"> <tr> <th colspan=\"3\">Big Table Header</th></tr><tr> <td rowspan=\"2\">Merged Cell 1</td><td>Cell 2</td><td>Cell 3</td></tr><tr> <td colspan=\"2\">Merged Cell 4 and 5</td></tr><tr> <td>Cell 6</td><td>Cell 7</td><td>Cell 8</td></tr><tr> <td>Cell 9</td><td colspan=\"2\">A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.</td></tr><tr> <td>Cell 10</td><td>Cell 11</td><td>Cell 12</td></tr></table>"
},
"text": "Big Table Header Merged Cell 1 Cell 2 Cell 3 Merged Cell 4 and 5 Cell 6 Cell 7 Cell 8 Cell 9 A cell with a lot of text. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Cell 10 Cell 11 Cell 12",
"type": "Table"
},
{
"element_id": "da6c34391e544b3480e45d68f40870fa",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"parent_id": "517f8559ba594270bdd67e1b02bf19a2",
"text_as_html": "<table class=\"TableOfContents\" id=\"da6c34391e544b3480e45d68f40870fa\"> <tr> <th>Chapter</th><th>Title</th><th>Page</th></tr><tr> <td>1</td><td>Introduction</td><td>1</td></tr><tr> <td>2</td><td>Getting Started</td><td>5</td></tr><tr> <td>3</td><td>Basic Concepts</td><td>12</td></tr><tr> <td>4</td><td>Advanced Topics</td><td>25</td></tr><tr> <td>5</td><td>Conclusion</td><td>40</td></tr></table>"
},
"text": "Chapter Title Page 1 Introduction 1 2 Getting Started 5 3 Basic Concepts 12 4 Advanced Topics 25 5 Conclusion 40",
"type": "Table"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,7 @@ def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mappi
<div class="Page">
<form class="Form">
<label class="FormField" for="option1">
<span class="UncategorizedText" type="radio" name="option1" value="2" checked>
</span>
<input class="Checkbox" type="radio" name="option1" value="2" checked />
<span class="UncategorizedText">
Option 1 (Checked)
</span>
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6-dev0" # pragma: no cover
__version__ = "0.16.6-dev1" # pragma: no cover
62 changes: 40 additions & 22 deletions unstructured/documents/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
ontology.Keywords: elements.Text,
ontology.Abstract: elements.NarrativeText,
ontology.Hyperlink: elements.Text,
ontology.TableOfContents: elements.Text,
ontology.TableOfContents: elements.Table,
ontology.Index: elements.Text,
ontology.Form: elements.Text,
ontology.FormField: elements.Text,
Expand Down Expand Up @@ -140,38 +140,56 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
}

HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
"a": ontology.Hyperlink,
"address": ontology.Address,
"aside": ontology.Sidebar,
"audio": ontology.Audio,
"blockquote": ontology.Quote,
"body": ontology.Document,
"button": ontology.Button,
"cite": ontology.Citation,
"code": ontology.CodeBlock,
"del": ontology.RevisionDeletion,
"div": ontology.UncategorizedText,
"dl": ontology.DefinitionList,
"figcaption": ontology.Caption,
"figure": ontology.Figure,
"footer": ontology.Footer,
"aside": ontology.Sidebar,
"hr": ontology.PageBreak,
"form": ontology.Form,
"h1": ontology.Title,
"h2": ontology.Subtitle,
"h3": ontology.Heading,
"h4": ontology.Heading,
"h5": ontology.Heading,
"h6": ontology.Heading,
"blockquote": ontology.Quote,
"figcaption": ontology.Caption,
"ol": ontology.OrderedList,
"header": ontology.Header,
"hr": ontology.PageBreak,
"img": ontology.Image,
"input": ontology.Checkbox,
"ins": ontology.RevisionInsertion,
"label": ontology.FormField,
"li": ontology.ListItem,
"mark": ontology.Highlight,
"math": ontology.Equation,
"meta": ontology.Keywords,
"nav": ontology.Index,
"ol": ontology.OrderedList,
"p": ontology.Paragraph,
"pre": ontology.CodeBlock,
"section": ontology.Section,
"span": ontology.UncategorizedText,
"sub": ontology.FootnoteReference,
"svg": ontology.Signature,
"table": ontology.Table,
"tbody": ontology.TableBody,
"thead": ontology.TableHeader,
"tr": ontology.TableRow,
"td": ontology.TableCell,
"th": ontology.TableCellHeader,
"figure": ontology.Figure,
"thead": ontology.TableHeader,
"time": ontology.Time,
"tr": ontology.TableRow,
"ul": ontology.UnorderedList,
"video": ontology.Video,
"audio": ontology.Audio,
"pre": ontology.CodeBlock,
"sub": ontology.FootnoteReference,
"cite": ontology.Citation,
"nav": ontology.Index,
"form": ontology.Form,
"label": ontology.FormField,
"button": ontology.Button,
"mark": ontology.Highlight,
"ins": ontology.RevisionInsertion,
"del": ontology.RevisionDeletion,
"address": ontology.Address,
"table": ontology.Table,
}


ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()
Loading
Loading