diff --git a/CHANGELOG.md b/CHANGELOG.md index 043219ef40..54a3cb2c51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.3-dev2 +## 0.16.4-dev0 ### Enhancements * **Elements created from V2 HTML are less granular** Added merging of adjacent text elements and inline html tags in the HTML partitioner to reduce the number of elements created from V2 HTML. @@ -7,7 +7,17 @@ ### Fixes +## 0.16.3 + +### Enhancements + +### Features + +### Fixes + * **V2 elements without first parent ID can be parsed** +* **Fix missing elements when layout element parsed in V2 ontology** +* updated **unstructured-inference** to be **0.8.1** in requirements/extra-pdf-image.in ## 0.16.2 diff --git a/requirements/base.txt b/requirements/base.txt index b49e7ce8bf..7117e30a8a 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -100,7 +100,7 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.1 # via -r ./base.in -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via -r ./base.in regex==2024.9.11 # via nltk diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index c758ad209b..e14c2985ad 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0 # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # -c ./base.txt # unstructured-paddleocr diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 494f6dc4ff..ae3ccdf381 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -11,5 +11,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.8.0 +unstructured-inference==0.8.1 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 5d93ba403b..ff34f2dedc 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -54,7 +54,7 @@ google-auth==2.35.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.7.4 +google-cloud-vision==3.8.0 # via -r ./extra-pdf-image.in googleapis-common-protos==1.65.0 # via @@ -167,7 +167,7 @@ pillow==11.0.0 # unstructured-pytesseract portalocker==2.10.1 # via iopath -proto-plus==1.24.0 +proto-plus==1.25.0 # via # google-api-core # google-cloud-vision @@ -217,7 +217,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # -c ./base.txt # unstructured-inference @@ -282,7 +282,7 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -unstructured-inference==0.8.0 +unstructured-inference==0.8.1 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.13 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index ff08577a7f..a4faf2f5f4 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-xlsx.in # -et-xmlfile==1.1.0 +et-xmlfile==2.0.0 # via openpyxl networkx==3.2.1 # via -r ./extra-xlsx.in diff --git a/requirements/test.txt b/requirements/test.txt index d30cc78ac5..241f941d87 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -95,7 +95,7 @@ mccabe==0.7.0 # via flake8 multidict==6.1.0 # via yarl -mypy==1.12.1 +mypy==1.13.0 # via -r ./test.in mypy-extensions==1.0.0 # via diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 9cbdf6805f..102dd4c6e9 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -59,7 +59,7 @@ def test_when_class_is_missing_it_can_be_inferred_from_type(): expected_html = _wrap_with_body( """
Unclosed comment
@@ -554,6 +556,7 @@ def test_malformed_html():
Paragraph with invalid characters: � � �
+ """ @@ -563,3 +566,31 @@ def test_malformed_html(): parsed_ontology = indent_html(remove_all_ids(ontology.to_html())) assert parsed_ontology == expected_html + + +def test_text_is_wrapped_inside_layout_element(): + # language=HTML + base_html = _wrap_with_body( + """ +Text
+