Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ML-468/add_text_as_ht…
Browse files Browse the repository at this point in the history
…ml_to_orig_elements_chunks
  • Loading branch information
plutasnyy committed Nov 18, 2024
2 parents 6332003 + ca27b8a commit 85dc0fc
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 165 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.16.6-dev0
## 0.16.6-dev1

### Enhancements
- **Every <table> tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.

### Features

Expand Down
53 changes: 53 additions & 0 deletions test_unstructured/documents/test_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from collections import defaultdict
from typing import Dict, Type

from unstructured.documents import elements, ontology
from unstructured.documents.mappings import (
ALL_ONTOLOGY_ELEMENT_TYPES,
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP,
ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
get_all_subclasses,
)
from unstructured.documents.ontology import OntologyElement


def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
    """
    Collect the HTML tags that are allowed by exactly one ontology element type.

    Every type in ALL_ONTOLOGY_ELEMENT_TYPES is instantiated and its
    ``allowed_tags`` are grouped by tag. Tags claimed by more than one type
    are left out of the result.

    Returns:
        dict: tag name -> the single ontology element type that allows it.
    """
    types_by_tag: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
    for ontology_type in ALL_ONTOLOGY_ELEMENT_TYPES:
        for html_tag in ontology_type().allowed_tags:
            types_by_tag[html_tag].append(ontology_type)

    exclusive_tags: dict[str, Type[OntologyElement]] = {}
    for html_tag, candidate_types in types_by_tag.items():
        # A tag is "exclusive" only when a single type declared it.
        if len(candidate_types) != 1:
            continue
        exclusive_tags[html_tag] = candidate_types[0]
    return exclusive_tags


def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements():
    """Each exclusive HTML tag must be present in the default map with its only type."""
    for html_tag, only_type in _get_exclusive_html_tags().items():
        assert html_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP
        mapped_type = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[html_tag]
        assert mapped_type == only_type


def test_all_expected_ontology_types_are_subclasses_of_OntologyElement():
    """Every value of the default HTML tag map must be an OntologyElement subclass."""
    mapped_types = HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values()
    assert all(issubclass(mapped_type, OntologyElement) for mapped_type in mapped_types)


def test_ontology_to_unstructured_mapping_has_valid_types():
    """Keys must be OntologyElement subclasses and values must be Element subclasses."""
    for ontology_cls in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE:
        unstructured_cls = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_cls]
        # The value is checked before the key, same order as the pair of checks always had.
        assert issubclass(unstructured_cls, elements.Element)
        assert issubclass(ontology_cls, ontology.OntologyElement)


def test_all_ontology_elements_are_defined_in_mapping_to_unstructured():
    """Every subclass of OntologyElement must appear as a key of the ontology-to-unstructured map."""
    unmapped = [
        ontology_cls
        for ontology_cls in get_all_subclasses(ontology.OntologyElement)
        if ontology_cls not in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE
    ]
    assert not unmapped
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6-dev0" # pragma: no cover
__version__ = "0.16.6-dev1" # pragma: no cover
214 changes: 116 additions & 98 deletions unstructured/documents/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
of parsed documents
"""

from collections import defaultdict
from typing import Any, Dict, Type

from unstructured.documents.ontology import OntologyElement
from unstructured.documents import elements, ontology
from unstructured.documents.elements import Element


def get_all_subclasses(cls) -> list[Any]:
Expand All @@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]:
return all_subclasses


def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]:
def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]:
"""
Get a mapping of HTML tags to their exclusive OntologyElement types.
"""
html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list)
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES:
for tag in element_type().allowed_tags:
html_tag_to_element_type_mappings[tag].append(element_type)

return {
tag: element_types[0]
for tag, element_types in html_tag_to_element_type_mappings.items()
if len(element_types) == 1
}


def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
"""
Get a mapping of ontology element names to unstructured type names.
Get a mapping of ontology element to unstructured type.
The dictionary here was created based on the ontology mapping JSON.
Can be generated via the following code:
Expand All @@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]:
```
Returns:
dict: A dictionary where keys are ontology element class names
and values are unstructured type names.
dict: A dictionary where keys are ontology element classes
and values are unstructured types.
"""
ontology_to_unstructured_class_mapping = {
"Document": "UncategorizedText",
"Section": "UncategorizedText",
"Page": "UncategorizedText",
"Column": "UncategorizedText",
"Paragraph": "NarrativeText",
"Header": "Header",
"Footer": "Footer",
"Sidebar": "UncategorizedText",
"PageBreak": "PageBreak",
"Title": "Title",
"Subtitle": "Title",
"Heading": "Title",
"NarrativeText": "NarrativeText",
"Quote": "NarrativeText",
"Footnote": "UncategorizedText",
"Caption": "FigureCaption",
"PageNumber": "PageNumber",
"UncategorizedText": "UncategorizedText",
"OrderedList": "UncategorizedText",
"UnorderedList": "UncategorizedText",
"DefinitionList": "UncategorizedText",
"ListItem": "ListItem",
"Table": "Table",
"TableRow": "Table",
"TableCell": "Table",
"TableCellHeader": "Table",
"TableBody": "Table",
"TableHeader": "Table",
"Image": "Image",
"Figure": "Image",
"Video": "UncategorizedText",
"Audio": "UncategorizedText",
"Barcode": "Image",
"QRCode": "Image",
"Logo": "Image",
"CodeBlock": "CodeSnippet",
"InlineCode": "CodeSnippet",
"Formula": "Formula",
"Equation": "Formula",
"FootnoteReference": "UncategorizedText",
"Citation": "UncategorizedText",
"Bibliography": "UncategorizedText",
"Glossary": "UncategorizedText",
"Author": "UncategorizedText",
"MetaDate": "UncategorizedText",
"Keywords": "UncategorizedText",
"Abstract": "NarrativeText",
"Hyperlink": "UncategorizedText",
"TableOfContents": "UncategorizedText",
"Index": "UncategorizedText",
"Form": "UncategorizedText",
"FormField": "UncategorizedText",
"FormFieldValue": "UncategorizedText",
"Checkbox": "UncategorizedText",
"RadioButton": "UncategorizedText",
"Button": "UncategorizedText",
"Comment": "UncategorizedText",
"Highlight": "UncategorizedText",
"RevisionInsertion": "UncategorizedText",
"RevisionDeletion": "UncategorizedText",
"Address": "Address",
"EmailAddress": "EmailAddress",
"PhoneNumber": "UncategorizedText",
"CalendarDate": "UncategorizedText",
"Time": "UncategorizedText",
"Currency": "UncategorizedText",
"Measurement": "UncategorizedText",
"Letterhead": "Header",
"Signature": "UncategorizedText",
"Watermark": "UncategorizedText",
"Stamp": "UncategorizedText",
ontology.Document: elements.Text,
ontology.Section: elements.Text,
ontology.Page: elements.Text,
ontology.Column: elements.Text,
ontology.Paragraph: elements.NarrativeText,
ontology.Header: elements.Header,
ontology.Footer: elements.Footer,
ontology.Sidebar: elements.Text,
ontology.PageBreak: elements.PageBreak,
ontology.Title: elements.Title,
ontology.Subtitle: elements.Title,
ontology.Heading: elements.Title,
ontology.NarrativeText: elements.NarrativeText,
ontology.Quote: elements.NarrativeText,
ontology.Footnote: elements.Text,
ontology.Caption: elements.FigureCaption,
ontology.PageNumber: elements.PageNumber,
ontology.UncategorizedText: elements.Text,
ontology.OrderedList: elements.Text,
ontology.UnorderedList: elements.Text,
ontology.DefinitionList: elements.Text,
ontology.ListItem: elements.ListItem,
ontology.Table: elements.Table,
ontology.TableRow: elements.Table,
ontology.TableCell: elements.Table,
ontology.TableCellHeader: elements.Table,
ontology.TableBody: elements.Table,
ontology.TableHeader: elements.Table,
ontology.Image: elements.Image,
ontology.Figure: elements.Image,
ontology.Video: elements.Text,
ontology.Audio: elements.Text,
ontology.Barcode: elements.Image,
ontology.QRCode: elements.Image,
ontology.Logo: elements.Image,
ontology.CodeBlock: elements.CodeSnippet,
ontology.InlineCode: elements.CodeSnippet,
ontology.Formula: elements.Formula,
ontology.Equation: elements.Formula,
ontology.FootnoteReference: elements.Text,
ontology.Citation: elements.Text,
ontology.Bibliography: elements.Text,
ontology.Glossary: elements.Text,
ontology.Author: elements.Text,
ontology.MetaDate: elements.Text,
ontology.Keywords: elements.Text,
ontology.Abstract: elements.NarrativeText,
ontology.Hyperlink: elements.Text,
ontology.TableOfContents: elements.Text,
ontology.Index: elements.Text,
ontology.Form: elements.Text,
ontology.FormField: elements.Text,
ontology.FormFieldValue: elements.Text,
ontology.Checkbox: elements.Text,
ontology.RadioButton: elements.Text,
ontology.Button: elements.Text,
ontology.Comment: elements.Text,
ontology.Highlight: elements.Text,
ontology.RevisionInsertion: elements.Text,
ontology.RevisionDeletion: elements.Text,
ontology.Address: elements.Address,
ontology.EmailAddress: elements.EmailAddress,
ontology.PhoneNumber: elements.Text,
ontology.CalendarDate: elements.Text,
ontology.Time: elements.Text,
ontology.Currency: elements.Text,
ontology.Measurement: elements.Text,
ontology.Letterhead: elements.Header,
ontology.Signature: elements.Text,
ontology.Watermark: elements.Text,
ontology.Stamp: elements.Text,
}

return ontology_to_unstructured_class_mapping


ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement)
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = {
ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement)
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = {
(tag, element_type().css_class_name): element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = {
CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
element_type().css_class_name: element_type
for element_type in ALL_ONTOLOGY_ELEMENT_TYPES
for tag in element_type().allowed_tags
}

EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags()
ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = get_ontology_to_unstructured_type_mapping()
HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = {
"body": ontology.Document,
"footer": ontology.Footer,
"aside": ontology.Sidebar,
"hr": ontology.PageBreak,
"h3": ontology.Heading,
"h4": ontology.Heading,
"h5": ontology.Heading,
"h6": ontology.Heading,
"blockquote": ontology.Quote,
"figcaption": ontology.Caption,
"ol": ontology.OrderedList,
"li": ontology.ListItem,
"tbody": ontology.TableBody,
"thead": ontology.TableHeader,
"tr": ontology.TableRow,
"td": ontology.TableCell,
"th": ontology.TableCellHeader,
"figure": ontology.Figure,
"video": ontology.Video,
"audio": ontology.Audio,
"pre": ontology.CodeBlock,
"sub": ontology.FootnoteReference,
"cite": ontology.Citation,
"nav": ontology.Index,
"form": ontology.Form,
"label": ontology.FormField,
"button": ontology.Button,
"mark": ontology.Highlight,
"ins": ontology.RevisionInsertion,
"del": ontology.RevisionDeletion,
"address": ontology.Address,
"table": ontology.Table,
}

ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping()
Loading

0 comments on commit 85dc0fc

Please sign in to comment.