Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into ML-415/merge-inline-elements
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy committed Oct 29, 2024
2 parents 5f36bfe + 340a07f commit 4a18d04
Show file tree
Hide file tree
Showing 10 changed files with 58 additions and 16 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.3-dev2
## 0.16.4-dev0

### Enhancements
* **Elements created from V2 HTML are less granular** Added merging of adjacent text elements and inline HTML tags in the HTML partitioner to reduce the number of elements created from V2 HTML.
Expand All @@ -7,7 +7,17 @@

### Fixes

## 0.16.3

### Enhancements

### Features

### Fixes

* **V2 elements without first parent ID can be parsed**
* **Fix missing elements when layout element parsed in V2 ontology**
* Updated **unstructured-inference** to **0.8.1** in `requirements/extra-pdf-image.in`


## 0.16.2
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ python-magic==0.4.27
# via -r ./base.in
python-oxmsg==0.0.1
# via -r ./base.in
rapidfuzz==3.10.0
rapidfuzz==3.10.1
# via -r ./base.in
regex==2024.9.11
# via nltk
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0
# matplotlib
pyyaml==6.0.2
# via unstructured-paddleocr
rapidfuzz==3.10.0
rapidfuzz==3.10.1
# via
# -c ./base.txt
# unstructured-paddleocr
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ google-cloud-vision
effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.8.0
unstructured-inference==0.8.1
unstructured.pytesseract>=0.3.12
8 changes: 4 additions & 4 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ google-auth==2.35.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==3.7.4
google-cloud-vision==3.8.0
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.65.0
# via
Expand Down Expand Up @@ -167,7 +167,7 @@ pillow==11.0.0
# unstructured-pytesseract
portalocker==2.10.1
# via iopath
proto-plus==1.24.0
proto-plus==1.25.0
# via
# google-api-core
# google-cloud-vision
Expand Down Expand Up @@ -217,7 +217,7 @@ pyyaml==6.0.2
# omegaconf
# timm
# transformers
rapidfuzz==3.10.0
rapidfuzz==3.10.1
# via
# -c ./base.txt
# unstructured-inference
Expand Down Expand Up @@ -282,7 +282,7 @@ typing-extensions==4.12.2
# torch
tzdata==2024.2
# via pandas
unstructured-inference==0.8.0
unstructured-inference==0.8.1
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-xlsx.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile ./extra-xlsx.in
#
et-xmlfile==1.1.0
et-xmlfile==2.0.0
# via openpyxl
networkx==3.2.1
# via -r ./extra-xlsx.in
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ mccabe==0.7.0
# via flake8
multidict==6.1.0
# via yarl
mypy==1.12.1
mypy==1.13.0
# via -r ./test.in
mypy-extensions==1.0.0
# via
Expand Down
35 changes: 33 additions & 2 deletions test_unstructured/partition/html/test_html_to_ontology_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type():
expected_html = _wrap_with_body(
"""
<div class="Page">
<aside class='Sidebar'>Some text</aside>
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
</div>
"""
)
Expand Down Expand Up @@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten():
expected_html = _wrap_with_body(
"""
<div class="Page">
<aside class='Sidebar'>Some text</aside>
<aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
</div>
"""
)
Expand Down Expand Up @@ -535,6 +535,8 @@ def test_malformed_html():
# language=HTML
expected_html = """
<body class="Document">
<p class="Paragraph">
Unclosed comment
<div class="">
<p>
Expand All @@ -554,6 +556,7 @@ def test_malformed_html():
<p>
Paragraph with invalid characters: � � �
</p>
</p>
</body>
"""

Expand All @@ -563,3 +566,31 @@ def test_malformed_html():
parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))

assert parsed_ontology == expected_html


def test_text_is_wrapped_inside_layout_element():
    """Check that text sitting directly in a ``Page`` div is emitted inside a ``Paragraph``.

    The input markup is parsed to an ontology, serialised back to HTML with ids
    removed, and compared against the expected markup; both sides go through
    ``indent_html`` before the comparison.
    """
    # NOTE: the literal bodies below are kept flush-left so their contents stay
    # exactly as written; whitespace inside a triple-quoted string is data.

    # language=HTML
    input_markup = indent_html(
        _wrap_with_body(
            """
<div class="Page">
Text
</div>
"""
        )
    )

    # language=HTML
    wanted_markup = indent_html(
        _wrap_with_body(
            """
<div class="Page">
<p class='Paragraph'>Text</p>
</div>
"""
        )
    )

    root: OntologyElement = parse_html_to_ontology(input_markup)
    serialised = root.to_html()
    actual_markup = indent_html(remove_all_ids(serialised))

    assert actual_markup == wanted_markup
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.3-dev2" # pragma: no cover
__version__ = "0.16.4-dev0" # pragma: no cover
7 changes: 4 additions & 3 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def ontology_to_unstructured_elements(
BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip()
)
# TODO value attribute from form input should be added to the text

unstructured_element = element_class(
text=element_text,
element_id=ontology_element.id,
Expand Down Expand Up @@ -382,8 +381,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None:
additional_attributes=escaped_attrs,
)

has_children = (ontology_class != UncategorizedText) and any(
isinstance(content, Tag) for content in soup.contents
has_children = (
(ontology_class != UncategorizedText)
and any(isinstance(content, Tag) for content in soup.contents)
or ontology_class().elementType == ElementTypeEnum.layout
)

if has_children:
Expand Down

0 comments on commit 4a18d04

Please sign in to comment.