From ca27b8aa971663cf67f55ceeb15a2b768b601dee Mon Sep 17 00:00:00 2001 From: Pluto Date: Fri, 15 Nov 2024 15:30:48 +0100 Subject: [PATCH] Set <table> to be ontology.Table not UncategorizedText (#3782) --- CHANGELOG.md | 9 + test_unstructured/documents/test_mappings.py | 53 +++++ unstructured/__version__.py | 2 +- unstructured/documents/mappings.py | 214 ++++++++++-------- .../partition/html/transformations.py | 117 +++++----- 5 files changed, 231 insertions(+), 164 deletions(-) create mode 100644 test_unstructured/documents/test_mappings.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1584267192..a4018c1b25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.16.6-dev0 + +### Enhancements +- **Every `<table>`
tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents. + +### Features + +### Fixes + ## 0.16.5 ### Enhancements diff --git a/test_unstructured/documents/test_mappings.py b/test_unstructured/documents/test_mappings.py new file mode 100644 index 0000000000..029bb544fb --- /dev/null +++ b/test_unstructured/documents/test_mappings.py @@ -0,0 +1,53 @@ +from collections import defaultdict +from typing import Dict, Type + +from unstructured.documents import elements, ontology +from unstructured.documents.mappings import ( + ALL_ONTOLOGY_ELEMENT_TYPES, + HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP, + ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE, + get_all_subclasses, +) +from unstructured.documents.ontology import OntologyElement + + +def _get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]: + """ + Get a mapping of HTML tags to their exclusive OntologyElement types. 
+ """ + html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list) + for element_type in ALL_ONTOLOGY_ELEMENT_TYPES: + for tag in element_type().allowed_tags: + html_tag_to_element_type_mappings[tag].append(element_type) + + return { + tag: element_types[0] + for tag, element_types in html_tag_to_element_type_mappings.items() + if len(element_types) == 1 + } + + +def test_if_all_exclusive_html_tags_are_mapped_to_ontology_elements(): + exclusive_html_tags = _get_exclusive_html_tags() + for expected_tag, expected_element_type in exclusive_html_tags.items(): + assert expected_tag in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP + assert HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[expected_tag] == expected_element_type + + +def test_all_expected_ontology_types_are_subclasses_of_OntologyElement(): + for element_type in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP.values(): + assert issubclass(element_type, OntologyElement) + + +def test_ontology_to_unstructured_mapping_has_valid_types(): + for ( + ontology_element, + unstructured_element, + ) in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE.items(): + assert issubclass(unstructured_element, elements.Element) + assert issubclass(ontology_element, ontology.OntologyElement) + + +def test_all_ontology_elements_are_defined_in_mapping_to_unstructured(): + for ontology_element in get_all_subclasses(ontology.OntologyElement): + assert ontology_element in ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f4096753b3..a03340d1af 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.5" # pragma: no cover +__version__ = "0.16.6-dev0" # pragma: no cover diff --git a/unstructured/documents/mappings.py b/unstructured/documents/mappings.py index 98b9f6b4f8..d6b45114ac 100644 --- a/unstructured/documents/mappings.py +++ b/unstructured/documents/mappings.py @@ -5,10 +5,10 @@ of parsed documents """ -from 
collections import defaultdict from typing import Any, Dict, Type -from unstructured.documents.ontology import OntologyElement +from unstructured.documents import elements, ontology +from unstructured.documents.elements import Element def get_all_subclasses(cls) -> list[Any]: @@ -30,25 +30,9 @@ def get_all_subclasses(cls) -> list[Any]: return all_subclasses -def get_exclusive_html_tags() -> dict[str, Type[OntologyElement]]: +def get_ontology_to_unstructured_type_mapping() -> dict[str, Element]: """ - Get a mapping of HTML tags to their exclusive OntologyElement types. - """ - html_tag_to_element_type_mappings: Dict[str, list[Type[OntologyElement]]] = defaultdict(list) - for element_type in ALL_ONTOLOGY_ELEMENT_TYPES: - for tag in element_type().allowed_tags: - html_tag_to_element_type_mappings[tag].append(element_type) - - return { - tag: element_types[0] - for tag, element_types in html_tag_to_element_type_mappings.items() - if len(element_types) == 1 - } - - -def get_ontology_to_unstructured_type_mapping() -> dict[str, str]: - """ - Get a mapping of ontology element names to unstructured type names. + Get a mapping of ontology element to unstructured type. The dictionary here was created base on ontology mapping json Can be generated via the following code: @@ -63,97 +47,131 @@ def get_ontology_to_unstructured_type_mapping() -> dict[str, str]: ``` Returns: - dict: A dictionary where keys are ontology element class names - and values are unstructured type names. + dict: A dictionary where keys are ontology element classes + and values are unstructured types. 
""" ontology_to_unstructured_class_mapping = { - "Document": "UncategorizedText", - "Section": "UncategorizedText", - "Page": "UncategorizedText", - "Column": "UncategorizedText", - "Paragraph": "NarrativeText", - "Header": "Header", - "Footer": "Footer", - "Sidebar": "UncategorizedText", - "PageBreak": "PageBreak", - "Title": "Title", - "Subtitle": "Title", - "Heading": "Title", - "NarrativeText": "NarrativeText", - "Quote": "NarrativeText", - "Footnote": "UncategorizedText", - "Caption": "FigureCaption", - "PageNumber": "PageNumber", - "UncategorizedText": "UncategorizedText", - "OrderedList": "UncategorizedText", - "UnorderedList": "UncategorizedText", - "DefinitionList": "UncategorizedText", - "ListItem": "ListItem", - "Table": "Table", - "TableRow": "Table", - "TableCell": "Table", - "TableCellHeader": "Table", - "TableBody": "Table", - "TableHeader": "Table", - "Image": "Image", - "Figure": "Image", - "Video": "UncategorizedText", - "Audio": "UncategorizedText", - "Barcode": "Image", - "QRCode": "Image", - "Logo": "Image", - "CodeBlock": "CodeSnippet", - "InlineCode": "CodeSnippet", - "Formula": "Formula", - "Equation": "Formula", - "FootnoteReference": "UncategorizedText", - "Citation": "UncategorizedText", - "Bibliography": "UncategorizedText", - "Glossary": "UncategorizedText", - "Author": "UncategorizedText", - "MetaDate": "UncategorizedText", - "Keywords": "UncategorizedText", - "Abstract": "NarrativeText", - "Hyperlink": "UncategorizedText", - "TableOfContents": "UncategorizedText", - "Index": "UncategorizedText", - "Form": "UncategorizedText", - "FormField": "UncategorizedText", - "FormFieldValue": "UncategorizedText", - "Checkbox": "UncategorizedText", - "RadioButton": "UncategorizedText", - "Button": "UncategorizedText", - "Comment": "UncategorizedText", - "Highlight": "UncategorizedText", - "RevisionInsertion": "UncategorizedText", - "RevisionDeletion": "UncategorizedText", - "Address": "Address", - "EmailAddress": "EmailAddress", - "PhoneNumber": 
"UncategorizedText", - "CalendarDate": "UncategorizedText", - "Time": "UncategorizedText", - "Currency": "UncategorizedText", - "Measurement": "UncategorizedText", - "Letterhead": "Header", - "Signature": "UncategorizedText", - "Watermark": "UncategorizedText", - "Stamp": "UncategorizedText", + ontology.Document: elements.Text, + ontology.Section: elements.Text, + ontology.Page: elements.Text, + ontology.Column: elements.Text, + ontology.Paragraph: elements.NarrativeText, + ontology.Header: elements.Header, + ontology.Footer: elements.Footer, + ontology.Sidebar: elements.Text, + ontology.PageBreak: elements.PageBreak, + ontology.Title: elements.Title, + ontology.Subtitle: elements.Title, + ontology.Heading: elements.Title, + ontology.NarrativeText: elements.NarrativeText, + ontology.Quote: elements.NarrativeText, + ontology.Footnote: elements.Text, + ontology.Caption: elements.FigureCaption, + ontology.PageNumber: elements.PageNumber, + ontology.UncategorizedText: elements.Text, + ontology.OrderedList: elements.Text, + ontology.UnorderedList: elements.Text, + ontology.DefinitionList: elements.Text, + ontology.ListItem: elements.ListItem, + ontology.Table: elements.Table, + ontology.TableRow: elements.Table, + ontology.TableCell: elements.Table, + ontology.TableCellHeader: elements.Table, + ontology.TableBody: elements.Table, + ontology.TableHeader: elements.Table, + ontology.Image: elements.Image, + ontology.Figure: elements.Image, + ontology.Video: elements.Text, + ontology.Audio: elements.Text, + ontology.Barcode: elements.Image, + ontology.QRCode: elements.Image, + ontology.Logo: elements.Image, + ontology.CodeBlock: elements.CodeSnippet, + ontology.InlineCode: elements.CodeSnippet, + ontology.Formula: elements.Formula, + ontology.Equation: elements.Formula, + ontology.FootnoteReference: elements.Text, + ontology.Citation: elements.Text, + ontology.Bibliography: elements.Text, + ontology.Glossary: elements.Text, + ontology.Author: elements.Text, + 
ontology.MetaDate: elements.Text, + ontology.Keywords: elements.Text, + ontology.Abstract: elements.NarrativeText, + ontology.Hyperlink: elements.Text, + ontology.TableOfContents: elements.Text, + ontology.Index: elements.Text, + ontology.Form: elements.Text, + ontology.FormField: elements.Text, + ontology.FormFieldValue: elements.Text, + ontology.Checkbox: elements.Text, + ontology.RadioButton: elements.Text, + ontology.Button: elements.Text, + ontology.Comment: elements.Text, + ontology.Highlight: elements.Text, + ontology.RevisionInsertion: elements.Text, + ontology.RevisionDeletion: elements.Text, + ontology.Address: elements.Address, + ontology.EmailAddress: elements.EmailAddress, + ontology.PhoneNumber: elements.Text, + ontology.CalendarDate: elements.Text, + ontology.Time: elements.Text, + ontology.Currency: elements.Text, + ontology.Measurement: elements.Text, + ontology.Letterhead: elements.Header, + ontology.Signature: elements.Text, + ontology.Watermark: elements.Text, + ontology.Stamp: elements.Text, } return ontology_to_unstructured_class_mapping -ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(OntologyElement) -HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[OntologyElement]] = { +ALL_ONTOLOGY_ELEMENT_TYPES = get_all_subclasses(ontology.OntologyElement) +HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP: Dict[tuple[str, str], Type[ontology.OntologyElement]] = { (tag, element_type().css_class_name): element_type for element_type in ALL_ONTOLOGY_ELEMENT_TYPES for tag in element_type().allowed_tags } -CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = { +CSS_CLASS_TO_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = { element_type().css_class_name: element_type for element_type in ALL_ONTOLOGY_ELEMENT_TYPES for tag in element_type().allowed_tags } -EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: Dict[str, Type[OntologyElement]] = get_exclusive_html_tags() -ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME = 
get_ontology_to_unstructured_type_mapping() +HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: Dict[str, Type[ontology.OntologyElement]] = { + "body": ontology.Document, + "footer": ontology.Footer, + "aside": ontology.Sidebar, + "hr": ontology.PageBreak, + "h3": ontology.Heading, + "h4": ontology.Heading, + "h5": ontology.Heading, + "h6": ontology.Heading, + "blockquote": ontology.Quote, + "figcaption": ontology.Caption, + "ol": ontology.OrderedList, + "li": ontology.ListItem, + "tbody": ontology.TableBody, + "thead": ontology.TableHeader, + "tr": ontology.TableRow, + "td": ontology.TableCell, + "th": ontology.TableCellHeader, + "figure": ontology.Figure, + "video": ontology.Video, + "audio": ontology.Audio, + "pre": ontology.CodeBlock, + "sub": ontology.FootnoteReference, + "cite": ontology.Citation, + "nav": ontology.Index, + "form": ontology.Form, + "label": ontology.FormField, + "button": ontology.Button, + "mark": ontology.Highlight, + "ins": ontology.RevisionInsertion, + "del": ontology.RevisionDeletion, + "address": ontology.Address, + "table": ontology.Table, +} + +ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE = get_ontology_to_unstructured_type_mapping() diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 6054eba670..0f12c967bb 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -7,45 +7,24 @@ from bs4 import BeautifulSoup, Tag -from unstructured.documents.elements import ( - TYPE_TO_TEXT_ELEMENT_MAP, - Element, - ElementMetadata, - Text, -) +from unstructured.documents import elements, ontology from unstructured.documents.mappings import ( CSS_CLASS_TO_ELEMENT_TYPE_MAP, - EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP, - ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME, -) -from unstructured.documents.ontology import ( - Bibliography, - Citation, - Document, - ElementTypeEnum, - Footnote, - 
FootnoteReference, - Glossary, - Hyperlink, - NarrativeText, - OntologyElement, - Page, - Paragraph, - Quote, - UncategorizedText, + HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP, + ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE, ) RECURSION_LIMIT = 50 def ontology_to_unstructured_elements( - ontology_element: OntologyElement, + ontology_element: ontology.OntologyElement, parent_id: str = None, page_number: int = None, depth: int = 0, filename: str | None = None, -) -> list[Element]: +) -> list[elements.Element]: """ Converts an OntologyElement object to a list of unstructured Element objects. @@ -70,18 +49,18 @@ def ontology_to_unstructured_elements( list[Element]: A list of unstructured Element objects. """ elements_to_return = [] - if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT: + if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: - if page_number is None and isinstance(ontology_element, Page): + if page_number is None and isinstance(ontology_element, ontology.Page): page_number = ontology_element.page_number - if not isinstance(ontology_element, Document): + if not isinstance(ontology_element, ontology.Document): elements_to_return += [ - Text( + elements.Text( text="", element_id=ontology_element.id, detection_origin="vlm_partitioner", - metadata=ElementMetadata( + metadata=elements.ElementMetadata( parent_id=parent_id, text_as_html=ontology_element.to_html(add_children=False), page_number=page_number, @@ -96,7 +75,7 @@ def ontology_to_unstructured_elements( child, parent_id=ontology_element.id, page_number=page_number, - depth=0 if isinstance(ontology_element, Document) else depth + 1, + depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, filename=filename, ) children += child @@ -104,10 +83,7 @@ def ontology_to_unstructured_elements( combined_children = combine_inline_elements(children) elements_to_return += combined_children else: - 
unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[ - ontology_element.__class__.__name__ - ] - element_class = TYPE_TO_TEXT_ELEMENT_MAP[unstructured_element_class_name] + element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] html_code_of_ontology_element = ontology_element.to_html() element_text = ontology_element.to_text() @@ -115,7 +91,7 @@ def ontology_to_unstructured_elements( text=element_text, element_id=ontology_element.id, detection_origin="vlm_partitioner", - metadata=ElementMetadata( + metadata=elements.ElementMetadata( parent_id=parent_id, text_as_html=html_code_of_ontology_element, page_number=page_number, @@ -128,7 +104,7 @@ def ontology_to_unstructured_elements( return elements_to_return -def combine_inline_elements(elements: list[Element]) -> list[Element]: +def combine_inline_elements(elements: list[elements.Element]) -> list[elements.Element]: """ Combines consecutive inline elements into a single element. Inline elements can be also combined with text elements. 
@@ -168,7 +144,9 @@ def combine_inline_elements(elements: list[Element]) -> list[Element]: return result_elements -def can_unstructured_elements_be_merged(current_element: Element, next_element: Element) -> bool: +def can_unstructured_elements_be_merged( + current_element: elements.Element, next_element: elements.Element +) -> bool: """ Elements can be merged when: - They are on the same level in the HTML tree @@ -200,20 +178,20 @@ def can_unstructured_elements_be_merged(current_element: Element, next_element: return True -def is_text_element(ontology_element: OntologyElement) -> bool: +def is_text_element(ontology_element: ontology.OntologyElement) -> bool: """Categories or classes that we want to combine with inline text""" text_classes = [ - NarrativeText, - Quote, - Paragraph, - Footnote, - FootnoteReference, - Citation, - Bibliography, - Glossary, + ontology.NarrativeText, + ontology.Quote, + ontology.Paragraph, + ontology.Footnote, + ontology.FootnoteReference, + ontology.Citation, + ontology.Bibliography, + ontology.Glossary, ] - text_categories = [ElementTypeEnum.metadata] + text_categories = [ontology.ElementTypeEnum.metadata] if any(isinstance(ontology_element, class_) for class_ in text_classes): return True @@ -224,11 +202,14 @@ def is_text_element(ontology_element: OntologyElement) -> bool: return False -def is_inline_element(ontology_element: OntologyElement) -> bool: +def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: """Categories or classes that we want to combine with text elements""" - inline_classes = [Hyperlink] - inline_categories = [ElementTypeEnum.specialized_text, ElementTypeEnum.annotation] + inline_classes = [ontology.Hyperlink] + inline_categories = [ + ontology.ElementTypeEnum.specialized_text, + ontology.ElementTypeEnum.annotation, + ] if any(isinstance(ontology_element, class_) for class_ in inline_classes): return True @@ -239,7 +220,9 @@ def is_inline_element(ontology_element: OntologyElement) -> bool: return 
False -def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) -> OntologyElement: +def unstructured_elements_to_ontology( + unstructured_elements: Sequence[elements.Element], +) -> ontology.OntologyElement: """ Converts a sequence of unstructured Element objects to an OntologyElement object. @@ -260,10 +243,10 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) document_element_id = unstructured_elements[0].metadata.parent_id if document_element_id is None: - document_element_id = OntologyElement.generate_unique_id() + document_element_id = ontology.OntologyElement.generate_unique_id() unstructured_elements[0].metadata.parent_id = document_element_id - id_to_element_mapping[document_element_id] = Document( + id_to_element_mapping[document_element_id] = ontology.Document( additional_attributes={"id": document_element_id} ) @@ -288,7 +271,7 @@ def unstructured_elements_to_ontology(unstructured_elements: Sequence[Element]) return root_element -def parse_html_to_ontology(html_code: str) -> OntologyElement: +def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement: """ Parses the given HTML code and converts it into an Element object. @@ -356,7 +339,9 @@ def remove_empty_tags(soup): return str(soup) -def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None: +def parse_html_to_ontology_element( + soup: Tag, recursion_depth: int = 1 +) -> ontology.OntologyElement | None: """ Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive. First tries to recognize a class from Unstructured Ontology, then if class is matched tries @@ -375,7 +360,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol escaped_attrs = get_escaped_attributes(soup) if soup.name == "br": # Note(Pluto) should it be
? - return Paragraph( + return ontology.Paragraph( text="", css_class_name=None, html_tag_name="br", @@ -383,9 +368,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol ) has_children = ( - (ontology_class != UncategorizedText) + (ontology_class != ontology.UncategorizedText) and any(isinstance(content, Tag) for content in soup.contents) - or ontology_class().elementType == ElementTypeEnum.layout + or ontology_class().elementType == ontology.ElementTypeEnum.layout ) should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT @@ -395,7 +380,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol ( parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1) if isinstance(child, Tag) - else Paragraph(text=str(child).strip()) + else ontology.Paragraph(text=str(child).strip()) ) for child in soup.children if str(child).strip() @@ -414,7 +399,9 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol return output_element -def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[OntologyElement]]: +def extract_tag_and_ontology_class_from_tag( + soup: Tag, +) -> tuple[str, Type[ontology.OntologyElement]]: """ Extracts the HTML tag and corresponding ontology class from a BeautifulSoup Tag object. 
The CSS class is prioritized over @@ -445,8 +432,8 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo html_tag = element_class().allowed_tags[0] # Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology - if not element_class and soup.name in EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP: - html_tag, element_class = soup.name, EXCLUSIVE_HTML_TAG_TO_ELEMENT_TYPE_MAP[soup.name] + if not element_class and soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP: + html_tag, element_class = soup.name, HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name] # Scenario 4: CSS class incorrect, HTML tag incorrect # Fallback to default UncategorizedText @@ -455,7 +442,7 @@ def extract_tag_and_ontology_class_from_tag(soup: Tag) -> tuple[str, Type[Ontolo # e.g. parent=FormField soup.name=input -> element=FormFieldInput html_tag = "span" - element_class = UncategorizedText + element_class = ontology.UncategorizedText return html_tag, element_class