From 5a91f0cda9e70dc89dba9fb5cef3f7641968325f Mon Sep 17 00:00:00 2001 From: Pluto Date: Fri, 25 Oct 2024 16:42:06 +0200 Subject: [PATCH 1/2] Fix layout parsing (#3754) --- CHANGELOG.md | 3 +- .../html/test_html_to_ontology_parsing.py | 35 +++++++++++++++++-- unstructured/__version__.py | 2 +- .../partition/html/transformations.py | 13 ++++--- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9e042effa..3f956444d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.3-dev1 +## 0.16.3-dev2 ### Enhancements @@ -7,6 +7,7 @@ ### Fixes * **V2 elements without first parent ID can be parsed** +* **Fix missing elements when layout element parsed in V2 ontology** ## 0.16.2 diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 9cbdf6805f..102dd4c6e9 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type(): expected_html = _wrap_with_body( """
- +
""" ) @@ -87,7 +87,7 @@ def test_when_class_is_wrong_tag_name_is_overwritten(): expected_html = _wrap_with_body( """
- +
""" ) @@ -535,6 +535,8 @@ def test_malformed_html(): # language=HTML expected_html = """ + +

Unclosed comment

@@ -554,6 +556,7 @@ def test_malformed_html():

Paragraph with invalid characters: � � �

+

""" @@ -563,3 +566,31 @@ def test_malformed_html(): parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html + + +def test_text_is_wrapped_inside_layout_element(): + # language=HTML + base_html = _wrap_with_body( + """ +
+ Text +
+ """ + ) + base_html = indent_html(base_html) + + # language=HTML + expected_html = _wrap_with_body( + """ +
+

Text

+
+ """ + ) + + expected_html = indent_html(expected_html) + + ontology: OntologyElement = parse_html_to_ontology(base_html) + parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) + + assert parsed_ontology == expected_html diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3eb8a850e8..16fec7848d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.3-dev1" # pragma: no cover +__version__ = "0.16.3-dev2" # pragma: no cover diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 421585a2d5..f2b897e513 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -79,15 +79,17 @@ def ontology_to_unstructured_elements( ), ) ] - + childreen = [] for child in ontology_element.children: - elements_to_return += ontology_to_unstructured_elements( + childreen += ontology_to_unstructured_elements( child, parent_id=ontology_element.id, page_number=page_number, depth=0 if isinstance(ontology_element, Document) else depth + 1, filename=filename, ) + + elements_to_return += childreen else: unstructured_element_class_name = ONTOLOGY_CLASS_NAME_TO_UNSTRUCTURED_ELEMENT_TYPE_NAME[ ontology_element.__class__.__name__ @@ -98,7 +100,6 @@ def ontology_to_unstructured_elements( BeautifulSoup(html_code_of_ontology_element, "html.parser").get_text().strip() ) # TODO value attribute from form input should be added to the text - unstructured_element = element_class( text=element_text, element_id=ontology_element.id, @@ -255,8 +256,10 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: additional_attributes=escaped_attrs, ) - has_children = (ontology_class != UncategorizedText) and any( - isinstance(content, Tag) for content in soup.contents + has_children = ( + (ontology_class != UncategorizedText) + and any(isinstance(content, Tag) for content in soup.contents) + or 
ontology_class().elementType == ElementTypeEnum.layout ) if has_children: From 340a07f18b6e4df47fe8365c636e9328657a520d Mon Sep 17 00:00:00 2001 From: Tracy Shen <34946571+tbs17@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:23:41 -0400 Subject: [PATCH 2/2] [Merge] release to 0.16.3 (#3755) - bump version to 0.16.3 based on Pluto's fix on layout parsing - update unstructured-inference version to 0.8.1 in requirements/extra-pdf-image.in --- CHANGELOG.md | 3 ++- requirements/base.txt | 2 +- requirements/extra-paddleocr.txt | 2 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 8 ++++---- requirements/extra-xlsx.txt | 2 +- requirements/test.txt | 2 +- unstructured/__version__.py | 2 +- 8 files changed, 12 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f956444d7..aaaa3f6e1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.3-dev2 +## 0.16.3 ### Enhancements @@ -8,6 +8,7 @@ * **V2 elements without first parent ID can be parsed** * **Fix missing elements when layout element parsed in V2 ontology** +* updated **unstructured-inference** to be **0.8.1** in requirements/extra-pdf-image.in ## 0.16.2 diff --git a/requirements/base.txt b/requirements/base.txt index b49e7ce8bf..7117e30a8a 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -100,7 +100,7 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.1 # via -r ./base.in -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via -r ./base.in regex==2024.9.11 # via nltk diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index c758ad209b..e14c2985ad 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0 # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # -c ./base.txt # unstructured-paddleocr diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 494f6dc4ff..ae3ccdf381 100644 --- 
a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.0 +unstructured-inference==0.8.1 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 5d93ba403b..ff34f2dedc 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -54,7 +54,7 @@ google-auth==2.35.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.7.4 +google-cloud-vision==3.8.0 # via -r ./extra-pdf-image.in googleapis-common-protos==1.65.0 # via @@ -167,7 +167,7 @@ pillow==11.0.0 # unstructured-pytesseract portalocker==2.10.1 # via iopath -proto-plus==1.24.0 +proto-plus==1.25.0 # via # google-api-core # google-cloud-vision @@ -217,7 +217,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # -c ./base.txt # unstructured-inference @@ -282,7 +282,7 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -unstructured-inference==0.8.0 +unstructured-inference==0.8.1 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index ff08577a7f..a4faf2f5f4 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-xlsx.in # -et-xmlfile==1.1.0 +et-xmlfile==2.0.0 # via openpyxl networkx==3.2.1 # via -r ./extra-xlsx.in diff --git a/requirements/test.txt b/requirements/test.txt index d30cc78ac5..241f941d87 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -95,7 +95,7 @@ mccabe==0.7.0 # via flake8 multidict==6.1.0 # via yarl -mypy==1.12.1 +mypy==1.13.0 # via -r ./test.in mypy-extensions==1.0.0 # via diff --git a/unstructured/__version__.py b/unstructured/__version__.py 
index 16fec7848d..283c7426c6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.3-dev2" # pragma: no cover +__version__ = "0.16.3" # pragma: no cover