Skip to content

Commit

Permalink
Fix extracting value from field (#3774)
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy authored Nov 7, 2024
1 parent 66d1e5a commit c2d17b1
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.5-dev0
## 0.16.5-dev1

### Enhancements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_p

for i in range(len(expected_json_elements)):
assert expected_json_elements[i] == predicted_elements[i]
assert (
expected_json_elements[i].metadata.text_as_html
== predicted_elements[i].metadata.text_as_html
)


def test_inline_elements_are_squeezed():
Expand Down
15 changes: 15 additions & 0 deletions test_unstructured/partition/html/test_html_to_ontology_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,21 @@ def test_text_in_form_field_value():
assert form_field_value.to_text() == "Random Input Value"


def test_text_in_form_field_value_with_null_value():
    """Check that an ``<input>`` whose ``value`` attribute is empty yields empty text.

    Despite the "null" in the test name, the attribute is present and set to
    the empty string. The parsed page must contain exactly one child, and both
    its ``text`` attribute and its ``to_text()`` result must be ``""``.
    """
    # The markup lines are kept flush-left so the literal carries no
    # indentation whitespace of its own.
    # language=HTML
    input_html = """
<div class="Page">
<input class="FormFieldValue" value=""/>
</div>
"""
    page = parse_html_to_ontology(input_html)

    # Only the single <input> element is expected under the page.
    assert len(page.children) == 1
    form_field_value = page.children[0]
    assert form_field_value.text == ""
    assert form_field_value.to_text() == ""


def test_to_text_when_form_field():
ontology = Page(
children=[
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.5-dev0" # pragma: no cover
__version__ = "0.16.5-dev1" # pragma: no cover
9 changes: 6 additions & 3 deletions unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def to_text(self, add_children=True) -> str:
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
return children_text
return BeautifulSoup(self.to_html()).get_text().strip()
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
Expand Down Expand Up @@ -466,8 +466,11 @@ class FormFieldValue(OntologyElement):
allowed_tags: List[str] = Field(["input"], frozen=True)

def to_text(self, add_children=True) -> str:
text = super().to_text() + self.additional_attributes.get("value", "")
return text.strip()
text = super().to_text()
value = self.additional_attributes.get("value", "")
if not value:
return text
return f"{text} {value}".strip()


class Checkbox(OntologyElement):
Expand Down

0 comments on commit c2d17b1

Please sign in to comment.