From 03e0ed3519f4c3a94169fa2bd68a21ad505b71d7 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 21 Aug 2024 11:54:21 -0700 Subject: [PATCH] rfctr(docx): DOCX emits std minified .text_as_html (#3545) **Summary** Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_docx()`. Produce minified `.text_as_html` consistent with that formed by chunking. **Additional Context** - nested tables appear as their extracted text in the parent cell (no nested `<table>` elements in `.text_as_html`). - DOCX `.text_as_html` is minified (no extra whitespace or thead, tbody, tfoot elements). --- CHANGELOG.md | 3 +- test_unstructured/partition/test_docx.py | 103 ++++++++--------------- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 43 ++++------ 4 files changed, 56 insertions(+), 95 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94c90b4d36..d3d2718e48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.8-dev1 +## 0.15.8-dev2 ### Enhancements @@ -7,6 +7,7 @@ ### Fixes * **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data. +* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. 
## 0.15.6 diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 996dbe2190..2e450991a5 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -140,13 +140,9 @@ def test_partition_docx_processes_table(): assert isinstance(elements[0], Table) assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example") assert elements[0].metadata.text_as_html == ( - "
\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" + "
Header Col 1 Header Col 2
Lorem ipsum A Link example
" + "" + "" "
Header Col 1Header Col 2
Lorem ipsumA Link example
" ) assert elements[0].metadata.filename == "fake_table.docx" @@ -1086,13 +1082,9 @@ def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]): table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] assert _DocxPartitioner(opts)._convert_table_to_html(table) == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" + "
Header Col 1 Header Col 2
Lorem ipsum A link example
" + "" + "" "
Header Col 1Header Col 2
Lorem ipsumA link example
" ) @@ -1118,25 +1110,13 @@ def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]): # -- re.sub() strips out the extra padding inserted by tabulate -- html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table)) - expected_lines = [ - "", - "", - "", - "", - "", - "", - "", - "", - "
a>b<c
d", - "", - "", - "", - "", - "
ef
g&th
i
jkl
", - ] - actual_lines = html.splitlines() - for expected, actual in zip(expected_lines, actual_lines): - assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" + assert html == ( + "" + "" + "" + "" + "
a>b<c
de f g&t hi
jkl
" + ) def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]): opts = DocxPartitionerOptions(**opts_args) @@ -1216,10 +1196,7 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "a b c d" assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a b
c d
" + "
ab
cd
" ) # -- # ┌───┐ @@ -1231,10 +1208,7 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "a b c", f"actual {e.text=}" assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" - "
a
b c
" + "
a
bc
" ), f"actual {e.metadata.text_as_html=}" # -- # ┌───────┐ @@ -1246,9 +1220,9 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "a b c d", f"actual {e.text=}" assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" + "
a a
b c d
" + "" + "" "
aa
bcd
" ), f"actual {e.metadata.text_as_html=}" # -- @@ -1261,9 +1235,9 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "a b c d", f"actual {e.text=}" assert e.metadata.text_as_html == ( - "\n" - "\n\n\n" - "\n\n\n" + "
a b
a c d
" + "" + "" "
ab
acd
" ), f"actual {e.metadata.text_as_html=}" # -- late-start, early-end, and >2 rows vertical span -- @@ -1280,14 +1254,11 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "a b c d e f", f"actual {e.text=}" assert e.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n\n" - "\n" - "\n" - "\n" - "\n" + "
a a b c
d d
e d d f
d d
" + "" + "" + "" + "" "
aabc
dd
eddf
dd
" ), f"actual {e.metadata.text_as_html=}" # -- @@ -1296,19 +1267,15 @@ def it_can_partition_tables_with_incomplete_rows(self): assert type(e).__name__ == "Table" assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}" assert e.metadata.text_as_html == ( - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" - "\n" + "
Data Data
Data Data
Data Data
More
Dato
WTF? WTF?
StrangeStrange
Format Format
" + "" + "" + "" + "" + "" + "" + "" + "" "
DataData
DataData
DataData
More
Dato
WTF?WTF?
StrangeStrange
FormatFormat
" ), f"actual {e.metadata.text_as_html=}" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4848da1a0a..0ca79ff041 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.8-dev1" # pragma: no cover +__version__ = "0.15.8-dev2" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 485dec1248..6e0fa1b249 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -2,7 +2,6 @@ from __future__ import annotations -import html import io import itertools import os @@ -23,11 +22,11 @@ from docx.text.pagebreak import RenderedPageBreak from docx.text.paragraph import Paragraph from docx.text.run import Run -from tabulate import tabulate from typing_extensions import TypeAlias from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_bullets +from unstructured.common.html_table import htmlify_matrix_of_cell_texts from unstructured.documents.elements import ( Address, Element, @@ -498,7 +497,7 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme # NOTE(scanny) - if all that fails we give it the default `Text` element-type yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) - def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str: + def _convert_table_to_html(self, table: DocxTable) -> str: """HTML string version of `table`. Example: @@ -520,44 +519,38 @@ def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> s def iter_cell_block_items(cell: _Cell) -> Iterator[str]: """Generate the text of each paragraph or table in `cell` as a separate string. - A table nested in `cell` is converted to HTML and emitted as that string. + A table nested in `cell` is converted to the normalized text it contains. 
""" for block_item in cell.iter_inner_content(): - if isinstance(block_item, Paragraph): + if isinstance(paragraph := block_item, Paragraph): # -- all docx content is ultimately in a paragraph; a nested table contributes # -- structure only - yield f"{html.escape(block_item.text)}" - elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance] - block_item, DocxTable - ): - yield self._convert_table_to_html(block_item, is_nested=True) + yield paragraph.text + elif isinstance(table := block_item, DocxTable): + for row in table.rows: + yield from iter_row_cells_as_text(row) def iter_row_cells_as_text(row: _Row) -> Iterator[str]: - """Generate the text of each cell in `row` as a separate string. + """Generate the normalized text of each cell in `row` as a separate string. - The text of each paragraph within a cell is separated from the next by a newline - (`"\n"`). A table nested in a cell is first converted to HTML and then included as a - string, also separated by a newline. + The text of each paragraph within a cell is not separated. A table nested in a cell is + converted to a normalized string of its contents and combined with the text of the + cell that contains the table. """ - # -- each omitted cell at the start of the row (pretty rare) gets the empty string -- + # -- Each omitted cell at the start of the row (pretty rare) gets the empty string. + # -- This preserves column alignment when one or more initial cells are omitted. for _ in range(row.grid_cols_before): yield "" for cell in row.cells: - yield "\n".join(iter_cell_block_items(cell)) + cell_text = " ".join(iter_cell_block_items(cell)) + yield " ".join(cell_text.split()) - # -- each omitted cell at the end of the row (also rare) gets the empty string -- + # -- Each omitted cell at the end of the row (also rare) gets the empty string. 
-- for _ in range(row.grid_cols_after): yield "" - return tabulate( - [list(iter_row_cells_as_text(row)) for row in table.rows], - headers=[] if is_nested else "firstrow", - # -- tabulate isn't really designed for recursive tables so we have to do any - # -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell - # -- contents. - tablefmt="unsafehtml", - ) + return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows]) @lazyproperty def _document(self) -> Document: