rfctr(docx): DOCX emits std minified .text_as_html (#3545)

**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_docx()`. Produce minified `.text_as_html` consistent with that formed by chunking.

**Additional Context**
- Nested tables appear as their extracted text in the parent cell (no nested `<table>` elements in `.text_as_html`).
- DOCX `.text_as_html` is minified (no extra whitespace and no `thead`, `tbody`, or `tfoot` elements).
Unstructured-IO · Aug 21, 2024 · 03e0ed3 · 03e0ed3
1 parent f135344
commit 03e0ed3
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 95 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.8-dev1
+## 0.15.8-dev2
 
 ### Enhancements
 
@@ -7,6 +7,7 @@
 ### Fixes
 
 * **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
+* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
 
 ## 0.15.6
 

diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
@@ -140,13 +140,9 @@ def test_partition_docx_processes_table():
     assert isinstance(elements[0], Table)
     assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example")
     assert elements[0].metadata.text_as_html == (
-        "<table>\n"
-        "<thead>\n"
-        "<tr><th>Header Col 1   </th><th>Header Col 2  </th></tr>\n"
-        "</thead>\n"
-        "<tbody>\n"
-        "<tr><td>Lorem ipsum    </td><td>A Link example</td></tr>\n"
-        "</tbody>\n"
+        "<table>"
+        "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+        "<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
         "</table>"
     )
     assert elements[0].metadata.filename == "fake_table.docx"
@@ -1086,13 +1082,9 @@ def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]):
         table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
 
         assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Header Col 1  </th><th>Header Col 2  </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Lorem ipsum   </td><td>A link example</td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
+            "<tr><td>Lorem ipsum</td><td>A link example</td></tr>"
             "</table>"
         )
 
@@ -1118,25 +1110,13 @@ def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]):
         # -- re.sub() strips out the extra padding inserted by tabulate --
         html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))
 
-        expected_lines = [
-            "<table>",
-            "<thead>",
-            "<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>",
-            "</thead>",
-            "<tbody>",
-            "<tr><td>d</td><td><table>",
-            "<tbody>",
-            "<tr><td>e</td><td>f</td></tr>",
-            "<tr><td>g&amp;t</td><td>h</td></tr>",
-            "</tbody>",
-            "</table></td><td>i</td></tr>",
-            "<tr><td>j</td><td>k</td><td>l</td></tr>",
-            "</tbody>",
-            "</table>",
-        ]
-        actual_lines = html.splitlines()
-        for expected, actual in zip(expected_lines, actual_lines):
-            assert actual == expected, f"\nexpected: {repr(expected)}\nactual:   {repr(actual)}"
+        assert html == (
+            "<table>"
+            "<tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr>"
+            "<tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr>"
+            "<tr><td>j</td><td>k</td><td>l</td></tr>"
+            "</table>"
+        )
 
     def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
         opts = DocxPartitionerOptions(**opts_args)
@@ -1216,10 +1196,7 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "a b c d"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>b  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>c  </td><td>d  </td></tr>\n</tbody>\n"
-            "</table>"
+            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
         )
         # --
         # ┌───┐
@@ -1231,10 +1208,7 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "a b c", f"actual {e.text=}"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>b  </td><td>c </td></tr>\n</tbody>\n"
-            "</table>"
+            "<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>"
         ), f"actual {e.metadata.text_as_html=}"
         # --
         # ┌───────┐
@@ -1246,9 +1220,9 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "a b c d", f"actual {e.text=}"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>a  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>b  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>a</td><td/></tr>"
+            "<tr><td>b</td><td>c</td><td>d</td></tr>"
             "</table>"
         ), f"actual {e.metadata.text_as_html=}"
         # --
@@ -1261,9 +1235,9 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "a b c d", f"actual {e.text=}"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n<tr><th>a  </th><th>b  </th><th>  </th></tr>\n</thead>\n"
-            "<tbody>\n<tr><td>a  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>b</td><td/></tr>"
+            "<tr><td>a</td><td>c</td><td>d</td></tr>"
             "</table>"
         ), f"actual {e.metadata.text_as_html=}"
         # -- late-start, early-end, and >2 rows vertical span --
@@ -1280,14 +1254,11 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "a b c d e f", f"actual {e.text=}"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>a  </th><th>a  </th><th>b  </th><th>c  </th></tr>\n"
-            "</thead>\n<tbody>\n"
-            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
-            "<tr><td>e  </td><td>d  </td><td>d  </td><td>f  </td></tr>\n"
-            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>a</td><td>a</td><td>b</td><td>c</td></tr>"
+            "<tr><td/><td>d</td><td>d</td><td/></tr>"
+            "<tr><td>e</td><td>d</td><td>d</td><td>f</td></tr>"
+            "<tr><td/><td>d</td><td>d</td><td/></tr>"
             "</table>"
         ), f"actual {e.metadata.text_as_html=}"
         # --
@@ -1296,19 +1267,15 @@ def it_can_partition_tables_with_incomplete_rows(self):
         assert type(e).__name__ == "Table"
         assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
         assert e.metadata.text_as_html == (
-            "<table>\n"
-            "<thead>\n"
-            "<tr><th>Data   </th><th>Data   </th><th>      </th></tr>\n"
-            "</thead>\n"
-            "<tbody>\n"
-            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
-            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
-            "<tr><td>       </td><td>More   </td><td>      </td></tr>\n"
-            "<tr><td>Dato   </td><td>       </td><td>      </td></tr>\n"
-            "<tr><td>WTF?   </td><td>WTF?   </td><td>      </td></tr>\n"
-            "<tr><td>Strange</td><td>Strange</td><td>      </td></tr>\n"
-            "<tr><td>       </td><td>Format </td><td>Format</td></tr>\n"
-            "</tbody>\n"
+            "<table>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td>Data</td><td>Data</td><td/></tr>"
+            "<tr><td/><td>More</td><td/></tr>"
+            "<tr><td>Dato</td><td/></tr>"
+            "<tr><td>WTF?</td><td>WTF?</td><td/></tr>"
+            "<tr><td>Strange</td><td>Strange</td><td/></tr>"
+            "<tr><td/><td>Format</td><td>Format</td></tr>"
             "</table>"
         ), f"actual {e.metadata.text_as_html=}"
 

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.8-dev1"  # pragma: no cover
+__version__ = "0.15.8-dev2"  # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import html
 import io
 import itertools
 import os
@@ -23,11 +22,11 @@
 from docx.text.pagebreak import RenderedPageBreak
 from docx.text.paragraph import Paragraph
 from docx.text.run import Run
-from tabulate import tabulate
 from typing_extensions import TypeAlias
 
 from unstructured.chunking import add_chunking_strategy
 from unstructured.cleaners.core import clean_bullets
+from unstructured.common.html_table import htmlify_matrix_of_cell_texts
 from unstructured.documents.elements import (
     Address,
     Element,
@@ -498,7 +497,7 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
         # NOTE(scanny) - if all that fails we give it the default `Text` element-type
         yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
 
-    def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
+    def _convert_table_to_html(self, table: DocxTable) -> str:
         """HTML string version of `table`.
 
         Example:
@@ -520,44 +519,38 @@ def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> s
         def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
             """Generate the text of each paragraph or table in `cell` as a separate string.
 
-            A table nested in `cell` is converted to HTML and emitted as that string.
+            A table nested in `cell` is converted to the normalized text it contains.
             """
             for block_item in cell.iter_inner_content():
-                if isinstance(block_item, Paragraph):
+                if isinstance(paragraph := block_item, Paragraph):
                     # -- all docx content is ultimately in a paragraph; a nested table contributes
                     # -- structure only
-                    yield f"{html.escape(block_item.text)}"
-                elif isinstance(  # pyright: ignore[reportUnnecessaryIsInstance]
-                    block_item, DocxTable
-                ):
-                    yield self._convert_table_to_html(block_item, is_nested=True)
+                    yield paragraph.text
+                elif isinstance(table := block_item, DocxTable):
+                    for row in table.rows:
+                        yield from iter_row_cells_as_text(row)
 
         def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
-            """Generate the text of each cell in `row` as a separate string.
+            """Generate the normalized text of each cell in `row` as a separate string.
 
-            The text of each paragraph within a cell is separated from the next by a newline
-            (`"\n"`). A table nested in a cell is first converted to HTML and then included as a
-            string, also separated by a newline.
+            The text of each paragraph within a cell is joined to the next by a single space and
+            whitespace is normalized. A table nested in a cell is converted to a normalized string
+            of its contents and combined with the text of the cell that contains the table.
             """
-            # -- each omitted cell at the start of the row (pretty rare) gets the empty string --
+            # -- Each omitted cell at the start of the row (pretty rare) gets the empty string.
+            # -- This preserves column alignment when one or more initial cells are omitted.
             for _ in range(row.grid_cols_before):
                 yield ""
 
             for cell in row.cells:
-                yield "\n".join(iter_cell_block_items(cell))
+                cell_text = " ".join(iter_cell_block_items(cell))
+                yield " ".join(cell_text.split())
 
-            # -- each omitted cell at the end of the row (also rare) gets the empty string --
+            # -- Each omitted cell at the end of the row (also rare) gets the empty string. --
             for _ in range(row.grid_cols_after):
                 yield ""
 
-        return tabulate(
-            [list(iter_row_cells_as_text(row)) for row in table.rows],
-            headers=[] if is_nested else "firstrow",
-            # -- tabulate isn't really designed for recursive tables so we have to do any
-            # -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
-            # -- contents.
-            tablefmt="unsafehtml",
-        )
+        return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows])
 
     @lazyproperty
     def _document(self) -> Document:
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.15.8-dev1" # pragma: no cover
		__version__ = "0.15.8-dev2" # pragma: no cover