Skip to content

Commit

Permalink
Add extracting alt text from images
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Nov 26, 2024
1 parent 626f73a commit b5e8b5c
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 11 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.16.7-dev0

### Enhancements
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. The parameter can be set to `to_text` to have the v2 parser extract the alt text of `<img>` HTML tags as text, or to `None` to leave the alt text out.

### Features

### Fixes

## 0.16.6

### Enhancements
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
assert len(unstructured_elements) == 2
assert isinstance(unstructured_elements[0], Text)
assert isinstance(unstructured_elements[1], NarrativeText)


def test_alternate_text_from_image_is_passed():
    """The alt text of an <img> nested in a table cell is present in the converted output."""
    # -- the fixture previously carried a stray `add_img_alt_text` token after the closing
    # -- `</div>`; it was an accidental paste and is not part of the document under test
    # language=HTML
    input_html = """
<div class="Page">
<table>
<tr>
<td rowspan="2">Example image nested in the table:</td>
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
</tr>
</table>
</div>
"""
    page = parse_html_to_ontology(input_html)
    unstructured_elements = ontology_to_unstructured_elements(page)

    # -- exactly two elements are expected and the second one must carry the alt text --
    assert len(unstructured_elements) == 2
    assert "ALT TEXT Logo" in unstructured_elements[1].text
47 changes: 47 additions & 0 deletions test_unstructured/partition/html/test_partition_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unstructured.partition.html import partition_html


def test_alternative_image_text_can_be_included():
    """`image_alt_mode` decides whether an image's alt text appears in the v2 parser output."""
    # language=HTML
    html = """
<div class="Page">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</div>
"""

    def second_element(image_alt_mode):
        # -- the two-name unpacking doubles as a check that exactly two elements come back --
        _, second = partition_html(
            text=html, image_alt_mode=image_alt_mode, html_parser_version="v2"
        )
        return second

    # -- "to_text" keeps the alt text, None drops it --
    assert "ALT TEXT Logo" in second_element("to_text").text
    assert "ALT TEXT Logo" not in second_element(None).text


def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """`image_alt_mode` is honored for an image wrapped in a paragraph as well."""
    # language=HTML
    html = """
<div class="Page">
<p class="Paragraph">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</p>
</div>
"""

    def second_element(image_alt_mode):
        # -- the two-name unpacking doubles as a check that exactly two elements come back --
        _, second = partition_html(
            text=html, image_alt_mode=image_alt_mode, html_parser_version="v2"
        )
        return second

    # -- "to_text" keeps the alt text, None drops it --
    assert "ALT TEXT Logo" in second_element("to_text").text
    assert "ALT TEXT Logo" not in second_element(None).text
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6" # pragma: no cover
__version__ = "0.16.7-dev0" # pragma: no cover
26 changes: 21 additions & 5 deletions unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:

return result_html

def to_text(self, add_children=True, add_img_alt_text=True) -> str:
    """Return the text representation of the element.

    Args:
        add_children: If True and the element has children, the result is the
            children's texts joined with single spaces. Otherwise the text is
            extracted from the element's own HTML rendering.
        add_img_alt_text: If True, the `alt` attribute of an `img` element is
            appended to its text.
    """
    if self.children and add_children:
        # -- forward both flags by name so nested elements honor the caller's choice --
        return " ".join(
            child.to_text(
                add_children=add_children, add_img_alt_text=add_img_alt_text
            ).strip()
            for child in self.children
        )

    text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

    if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
        text += f" {self.additional_attributes['alt']}"

    # -- strip again: the extracted text may be empty, leaving a leading space before the alt --
    return text.strip()

def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
Expand Down Expand Up @@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True)

def to_text(self, add_children=True) -> str:
text = super().to_text()
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
text = super().to_text(add_children, add_img_alt_text)
value = self.additional_attributes.get("value", "")
if not value:
return text
Expand Down
21 changes: 20 additions & 1 deletion unstructured/partition/html/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.documents.ontology import Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
Expand All @@ -36,6 +37,7 @@ def partition_html(
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
Expand Down Expand Up @@ -65,6 +67,9 @@ def partition_html(
html_parser_version (Literal['v1', 'v2']):
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
use the ontology schema to parse the HTML document.
image_alt_mode (Optional[Literal['to_text']]):
When set to 'to_text' (the default), the v2 parser will include the alternative text of
images in the output. When None, the alternative text is left out.
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
Expand All @@ -81,6 +86,7 @@ def partition_html(
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
html_parser_version=html_parser_version,
image_alt_mode=image_alt_mode,
)

return list(_HtmlPartitioner.iter_elements(opts))
Expand All @@ -102,6 +108,7 @@ def __init__(
skip_headers_and_footers: bool,
detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
):
self._file_path = file_path
self._file = file
Expand All @@ -113,6 +120,7 @@ def __init__(
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
self._html_parser_version = html_parser_version
self._image_alt_mode = image_alt_mode

@lazyproperty
def detection_origin(self) -> str | None:
Expand Down Expand Up @@ -172,6 +180,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
return self._html_parser_version

@lazyproperty
def add_img_alt_text(self) -> bool:
    """True when `image_alt_mode` is "to_text", i.e. image alt text goes into the output."""
    requested_mode = self._image_alt_mode
    return requested_mode == "to_text"


class _HtmlPartitioner:
"""Partition HTML document into document-elements."""
Expand Down Expand Up @@ -239,5 +252,11 @@ def _from_ontology(self) -> List[Element]:
"""Convert an ontology elements represented in HTML to an ontology element."""
html_text = self._opts.html_text
ontology = parse_html_to_ontology(html_text)
unstructured_elements = ontology_to_unstructured_elements(ontology)
unstructured_elements = ontology_to_unstructured_elements(
ontology, add_img_alt_text=self._opts.add_img_alt_text
)
return unstructured_elements


# NOTE(review): when the module is run as a script this only constructs a `Page` and
# discards it. It looks like leftover debugging code — confirm, then remove it together
# with the `Page` import that appears to exist only for this call.
if __name__ == "__main__":
    Page()
10 changes: 6 additions & 4 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
page_number: int = None,
depth: int = 0,
filename: str | None = None,
add_img_alt_text: bool = True,
) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
Expand All @@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
parent_id (str, optional): The ID of the parent element. Defaults to None.
page_number (int, optional): The page number of the element. Defaults to None.
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
filename (str, optional): The name of the file the element comes from. Defaults to None.
add_img_alt_text (bool): Whether to include the alternative text of images
in the output. Defaults to True.
Returns:
list[Element]: A list of unstructured Element objects.
"""
Expand Down Expand Up @@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
page_number=page_number,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
add_img_alt_text=add_img_alt_text,
)
children += child

Expand All @@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
else:
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text()
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)

unstructured_element = element_class(
text=element_text,
Expand Down Expand Up @@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
Args:
html_code (str): The HTML code to be parsed.
Parsing HTML will start from <div class="Page">.
Returns:
OntologyElement: The parsed Element object.
Expand Down Expand Up @@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
Args:
soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth.
Returns:
OntologyElement: The converted OntologyElement object.
"""
Expand Down

0 comments on commit b5e8b5c

Please sign in to comment.