Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add text as html to orig elements chunks #3779

Merged
merged 9 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.16.6-dev0
## 0.16.6-dev1

### Enhancements
- **Every <table> tag is considered to be ontology.Table** Added special handling for tables in HTML partitioning. This change is made to improve the accuracy of table extraction from HTML documents.

### Features

### Fixes
- **ElementMetadata consolidation** `text_as_html` metadata is now combined across all elements in a `CompositeElement` when chunking HTML output.

## 0.16.5

Expand Down
90 changes: 90 additions & 0 deletions test_unstructured/chunking/test_html_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from functools import partial

import pytest

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title


@pytest.fixture(
    params=[
        chunk_elements,
        partial(chunk_by_title, combine_text_under_n_chars=0),
    ]
)
def chunking_fn(request):
    """Provide each chunking callable under test, one per parametrized run.

    The parameters are `chunk_elements` and `chunk_by_title` with
    `combine_text_under_n_chars=0` pre-bound, so every test requesting this
    fixture is executed once for each of them.
    """
    chunker = request.param
    return chunker


def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn):
    """A single chunk built from several elements joins their `text_as_html` with spaces."""
    title_html = '<h1 class="Title" id="1">Header </h1>'
    date_html = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>'
    form_html = (
        '<form class="Form" id="3"> '
        '<label class="FormField" for="company-name" id="4">Form field name </label>'
        '<input class="FormFieldValue" id="5" value="Example value" />'
        "</form>"
    )
    # -- (element class, element text, element HTML) in document order --
    specs = [
        (Title, "Header", title_html),
        (Text, "Date: October 30, 2023", date_html),
        (Text, "Form field name Example value", form_html),
    ]
    expected_html = " ".join(html for _, _, html in specs)
    elements = [
        element_cls(text=text, metadata=ElementMetadata(text_as_html=html))
        for element_cls, text, html in specs
    ]

    chunks = chunking_fn(elements)

    assert len(chunks) == 1
    (only_chunk,) = chunks
    assert only_chunk.metadata.text_as_html == expected_html


def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn):
    """Parent HTML stays with the first chunk when its children are split across chunks.

    Illustrated ground-truth structure:

        <Document>
          <Page>
            <Section>
              <p>First</p>
              <p>Second</p>
            </Section>
          </Page>
        </Document>

    Elements: Document, Page, Section, Paragraph, Paragraph
    Chunk 1:  Document, Page, Section, Paragraph
    Chunk 2:  Paragraph

    The elements actually constructed below are the empty-text Section and its two
    Paragraph children (linked to it via `parent_id="1"`).
    """
    section_html = '<div class="Section" id="1" />'
    first_html = '<p class="Paragraph" id="2">First </p>'
    second_html = '<p class="Paragraph" id="3">Second </p>'

    section = Text(text="", metadata=ElementMetadata(text_as_html=section_html))
    first_paragraph = NarrativeText(
        text="First",
        metadata=ElementMetadata(text_as_html=first_html, parent_id="1"),
    )
    second_paragraph = NarrativeText(
        text="Second",
        metadata=ElementMetadata(text_as_html=second_html, parent_id="1"),
    )

    # -- max_characters=6 is the length of "Second"; two chunks are expected below --
    chunks = chunking_fn([section, first_paragraph, second_paragraph], max_characters=6)

    assert len(chunks) == 2
    leading_chunk, trailing_chunk = chunks
    assert leading_chunk.text == "First"
    assert trailing_chunk.text == "Second"

    assert leading_chunk.metadata.text_as_html == f"{section_html} {first_html}"
    assert trailing_chunk.metadata.text_as_html == second_html


def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn):
    """Each chunk of a text-split element carries the element's complete `text_as_html`.

    Intended to mimic the behaviour of non-HTML metadata on elements whose text is split.
    """
    header_html = '<h1 class="Title" id="1">Header </h1>'
    header = Title(text="Header", metadata=ElementMetadata(text_as_html=header_html))

    chunks = chunking_fn([header], max_characters=3)

    assert len(chunks) == 2
    assert [chunk.text for chunk in chunks] == ["Hea", "der"]
    # -- the HTML is not split along with the text; both halves keep the whole tag --
    for chunk in chunks:
        assert chunk.metadata.text_as_html == '<h1 class="Title" id="1">Header </h1>'
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6-dev0" # pragma: no cover
__version__ = "0.16.6-dev1" # pragma: no cover
2 changes: 2 additions & 0 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,8 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:
# -- Python 3.7+ maintains dict insertion order --
ordered_unique_keys = {key: None for val_list in values for key in val_list}
yield field_name, list(ordered_unique_keys.keys())
elif strategy is CS.STRING_CONCATENATE:
yield field_name, " ".join(val.strip() for val in values)
elif strategy is CS.DROP:
continue
else: # pragma: no cover
Expand Down
5 changes: 4 additions & 1 deletion unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum):
FIRST = "first"
"""Use the first value encountered, omit if not present in any elements."""

STRING_CONCATENATE = "string_concatenate"
"""Combine the values of this field across elements. Only suitable for fields of `str` type."""

LIST_CONCATENATE = "LIST_CONCATENATE"
"""Concatenate the list values across elements. Only suitable for fields of `List` type."""

Expand Down Expand Up @@ -507,7 +510,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"sent_to": cls.FIRST,
"signature": cls.FIRST,
"subject": cls.FIRST,
"text_as_html": cls.FIRST, # -- only occurs in Table --
"text_as_html": cls.STRING_CONCATENATE,
"table_as_cells": cls.FIRST, # -- only occurs in Table --
"url": cls.FIRST,
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
Expand Down
Loading