chore: bump unstructured-inference 0.7.35 (#3205)

### Summary

- bump unstructured-inference to `0.7.35`, which fixes the syntax of generated HTML tables
- update unit tests and ingest test fixtures to reflect changes in the generated HTML tables
- cut a release for `0.14.6`

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
Unstructured-IO · Jun 14, 2024 · 9552fbb · 9552fbb
1 parent a6c09ec
commit 9552fbb
Show file tree

Hide file tree

Showing 18 changed files with 31 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.14.6-dev7
+## 0.14.6
 
 ### Enhancements
 
+* **Bump unstructured-inference==0.7.35** Fix syntax for generated HTML tables.
+
 ### Features
 
 * **tqdm ingest support** add optional flag to ingest flow to print out progress bar of each step in the process.

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -82,7 +82,7 @@ executing==2.0.1
     # via stack-data
 fastjsonschema==2.19.1
     # via nbformat
-filelock==3.14.0
+filelock==3.15.1
     # via virtualenv
 fqdn==1.5.1
     # via jsonschema

diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
@@ -121,7 +121,7 @@ opencv-python==4.8.0.76
     #   -c ././deps/constraints.txt
     #   imgaug
     #   unstructured-paddleocr
-openpyxl==3.1.3
+openpyxl==3.1.4
     # via unstructured-paddleocr
 packaging==23.2
     # via

diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -12,7 +12,7 @@ google-cloud-vision
 effdet
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.33
+unstructured-inference==0.7.35
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -32,7 +32,7 @@ deprecated==1.2.14
     # via pikepdf
 effdet==0.4.1
     # via -r ./extra-pdf-image.in
-filelock==3.14.0
+filelock==3.15.1
     # via
     #   huggingface-hub
     #   torch
@@ -287,7 +287,7 @@ typing-extensions==4.12.2
     #   torch
 tzdata==2024.1
     # via pandas
-unstructured-inference==0.7.33
+unstructured-inference==0.7.35
     # via -r ./extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt
@@ -13,7 +13,7 @@ numpy==1.26.4
     #   -c ././deps/constraints.txt
     #   -c ./base.txt
     #   pandas
-openpyxl==3.1.3
+openpyxl==3.1.4
     # via -r ./extra-xlsx.in
 pandas==2.2.2
     # via -r ./extra-xlsx.in

diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -17,7 +17,7 @@ click==8.1.7
     # via
     #   -c ./base.txt
     #   sacremoses
-filelock==3.14.0
+filelock==3.15.1
     # via
     #   huggingface-hub
     #   torch

diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt
@@ -52,7 +52,7 @@ exceptiongroup==1.2.1
     # via anyio
 fastapi==0.110.3
     # via chromadb
-filelock==3.14.0
+filelock==3.15.1
     # via huggingface-hub
 flatbuffers==24.3.25
     # via onnxruntime

diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt
@@ -15,7 +15,7 @@ charset-normalizer==3.3.2
     #   requests
 clarifai==10.5.0
     # via -r ./ingest/clarifai.in
-clarifai-grpc==10.5.1
+clarifai-grpc==10.5.2
     # via clarifai
 contextlib2==21.6.0
     # via schema

diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
@@ -31,7 +31,7 @@ dataclasses-json==0.6.7
     # via
     #   -c ./ingest/../base.txt
     #   langchain-community
-filelock==3.14.0
+filelock==3.15.1
     # via
     #   huggingface-hub
     #   torch

diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt
@@ -38,7 +38,7 @@ idna==3.7
     #   anyio
     #   httpx
     #   requests
-openai==1.33.0
+openai==1.34.0
     # via -r ./ingest/embed-octoai.in
 pydantic==2.7.4
     # via openai

diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
@@ -98,7 +98,7 @@ numpy==1.26.4
     #   -c ./ingest/../deps/constraints.txt
     #   langchain
     #   langchain-community
-openai==1.33.0
+openai==1.34.0
     # via -r ./ingest/embed-openai.in
 orjson==3.10.4
     # via langsmith

diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py
@@ -155,7 +155,8 @@ def test_partition_image_with_table_extraction(
     )
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert len(table) == 1
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
 
 
 def test_partition_image_with_multipage_tiff(
@@ -180,7 +181,8 @@ def test_partition_image_with_bmp(
     )
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert len(table) == 1
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
 
 
 def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
@@ -657,7 +659,8 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
     )
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert len(table) == 1
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
     assert "Layouts of history Japanese documents" in table[0]
     assert "Layouts of scanned modern magazines and scientific reports" in table[0]
 

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -494,7 +494,8 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert elements[0].metadata.languages == ["kor"]
     assert len(table) == 2
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
     # FIXME(yuming): didn't test full sentence here since unit test and docker test have
     # some differences on spaces between characters
     assert "업" in table[0]
@@ -535,7 +536,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
     )
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert len(table) == 2
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
     assert "Layouts of history Japanese documents" in table[0]
     assert "Layouts of scanned modern magazines and scientific report" in table[0]
     assert "Layouts of scanned US newspapers from the 20th century" in table[0]

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -1272,7 +1272,8 @@ def test_partition_image_with_bmp_with_auto(
     )
     table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
     assert len(table) == 1
-    assert "<table><thead><th>" in table[0]
+    assert "<table><thead><tr>" in table[0]
+    assert "</thead><tbody><tr>" in table[0]
 
 
 def test_auto_partition_eml_add_signature_to_metadata():

diff --git a/.../local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/.../local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json
@@ -48,7 +48,7 @@
     "element_id": "dddac446da6c93dc1449ecb5d997c423",
     "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents",
     "metadata": {
-      "text_as_html": "<table><thead><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></table>",
+      "text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
       "filetype": "image/jpeg",
       "languages": [
         "eng"

diff --git a/...ured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/...ured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -840,7 +840,7 @@
     "element_id": "2a62c55be8401908c18140e858ec3345",
     "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scienti\ufb01c documents Layouts of scanned modern magazines and scienti\ufb01c reports Layouts of scanned US newspapers from the 20th century Table region on modern scienti\ufb01c and business document Layouts of history Japanese documents",
     "metadata": {
-      "text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>",
+      "text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
       "filetype": "application/pdf",
       "languages": [
         "eng"
@@ -1391,7 +1391,7 @@
     "element_id": "64bc79d1132a89c71837f420d6e4e2dc",
     "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
     "metadata": {
-      "text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>",
+      "text_as_html": "<table><thead><tr><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></tr></thead><tbody><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></tbody></table>",
       "filetype": "application/pdf",
       "languages": [
         "eng"

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.6-dev7"  # pragma: no cover
+__version__ = "0.14.6"  # pragma: no cover