Skip to content

Commit

Permalink
rfctr(docx): DOCX emits std minified .text_as_html (#3545)
Browse files Browse the repository at this point in the history
**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html`
HTML introduced by `partition_docx()`. Produce minified `.text_as_html`
consistent with that formed by chunking.

**Additional Context**
- nested tables appear as their extracted text in the parent cell (no
nested `<table>` elements in `.text_as_html`).
- DOCX `.text_as_html` is minified (no extra whitespace or thead, tbody,
tfoot elements).
  • Loading branch information
scanny authored Aug 21, 2024
1 parent f135344 commit 03e0ed3
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 95 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.8-dev1
## 0.15.8-dev2

### Enhancements

Expand All @@ -7,6 +7,7 @@
### Fixes

* **Fix NLTK data download path to prevent nested directories**. Resolved an issue where a nested "nltk_data" directory was created within the parent "nltk_data" directory when it already existed. This fix prevents errors in checking for existing downloads and loading models from NLTK data.
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.15.6

Expand Down
103 changes: 35 additions & 68 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,9 @@ def test_partition_docx_processes_table():
assert isinstance(elements[0], Table)
assert elements[0].text == ("Header Col 1 Header Col 2 Lorem ipsum A Link example")
assert elements[0].metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><td>Lorem ipsum</td><td>A Link example</td></tr>"
"</table>"
)
assert elements[0].metadata.filename == "fake_table.docx"
Expand Down Expand Up @@ -1086,13 +1082,9 @@ def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]):
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]

assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>A link example</td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>Header Col 1</td><td>Header Col 2</td></tr>"
"<tr><td>Lorem ipsum</td><td>A link example</td></tr>"
"</table>"
)

Expand All @@ -1118,25 +1110,13 @@ def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]):
# -- re.sub() strips out the extra padding inserted by tabulate --
html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))

expected_lines = [
"<table>",
"<thead>",
"<tr><th>a</th><th>&gt;b&lt;</th><th>c</th></tr>",
"</thead>",
"<tbody>",
"<tr><td>d</td><td><table>",
"<tbody>",
"<tr><td>e</td><td>f</td></tr>",
"<tr><td>g&amp;t</td><td>h</td></tr>",
"</tbody>",
"</table></td><td>i</td></tr>",
"<tr><td>j</td><td>k</td><td>l</td></tr>",
"</tbody>",
"</table>",
]
actual_lines = html.splitlines()
for expected, actual in zip(expected_lines, actual_lines):
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
assert html == (
"<table>"
"<tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr>"
"<tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr>"
"<tr><td>j</td><td>k</td><td>l</td></tr>"
"</table>"
)

def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args)
Expand Down Expand Up @@ -1216,10 +1196,7 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "a b c d"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
"</table>"
"<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>"
)
# --
# ┌───┐
Expand All @@ -1231,10 +1208,7 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "a b c", f"actual {e.text=}"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
"</table>"
"<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>"
), f"actual {e.metadata.text_as_html=}"
# --
# ┌───────┐
Expand All @@ -1246,9 +1220,9 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "a b c d", f"actual {e.text=}"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n"
"<table>"
"<tr><td>a</td><td>a</td><td/></tr>"
"<tr><td>b</td><td>c</td><td>d</td></tr>"
"</table>"
), f"actual {e.metadata.text_as_html=}"
# --
Expand All @@ -1261,9 +1235,9 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "a b c d", f"actual {e.text=}"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n"
"<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n"
"<table>"
"<tr><td>a</td><td>b</td><td/></tr>"
"<tr><td>a</td><td>c</td><td>d</td></tr>"
"</table>"
), f"actual {e.metadata.text_as_html=}"
# -- late-start, early-end, and >2 rows vertical span --
Expand All @@ -1280,14 +1254,11 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "a b c d e f", f"actual {e.text=}"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
"</thead>\n<tbody>\n"
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
"<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
"<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>a</td><td>a</td><td>b</td><td>c</td></tr>"
"<tr><td/><td>d</td><td>d</td><td/></tr>"
"<tr><td>e</td><td>d</td><td>d</td><td>f</td></tr>"
"<tr><td/><td>d</td><td>d</td><td/></tr>"
"</table>"
), f"actual {e.metadata.text_as_html=}"
# --
Expand All @@ -1296,19 +1267,15 @@ def it_can_partition_tables_with_incomplete_rows(self):
assert type(e).__name__ == "Table"
assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
assert e.metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
"<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
"<tr><td> </td><td>More </td><td> </td></tr>\n"
"<tr><td>Dato </td><td> </td><td> </td></tr>\n"
"<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
"<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
"<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
"</tbody>\n"
"<table>"
"<tr><td>Data</td><td>Data</td><td/></tr>"
"<tr><td>Data</td><td>Data</td><td/></tr>"
"<tr><td>Data</td><td>Data</td><td/></tr>"
"<tr><td/><td>More</td><td/></tr>"
"<tr><td>Dato</td><td/></tr>"
"<tr><td>WTF?</td><td>WTF?</td><td/></tr>"
"<tr><td>Strange</td><td>Strange</td><td/></tr>"
"<tr><td/><td>Format</td><td>Format</td></tr>"
"</table>"
), f"actual {e.metadata.text_as_html=}"

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.8-dev1" # pragma: no cover
__version__ = "0.15.8-dev2" # pragma: no cover
43 changes: 18 additions & 25 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from __future__ import annotations

import html
import io
import itertools
import os
Expand All @@ -23,11 +22,11 @@
from docx.text.pagebreak import RenderedPageBreak
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tabulate import tabulate
from typing_extensions import TypeAlias

from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import clean_bullets
from unstructured.common.html_table import htmlify_matrix_of_cell_texts
from unstructured.documents.elements import (
Address,
Element,
Expand Down Expand Up @@ -498,7 +497,7 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
# NOTE(scanny) - if all that fails we give it the default `Text` element-type
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)

def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> str:
def _convert_table_to_html(self, table: DocxTable) -> str:
"""HTML string version of `table`.
Example:
Expand All @@ -520,44 +519,38 @@ def _convert_table_to_html(self, table: DocxTable, is_nested: bool = False) -> s
def iter_cell_block_items(cell: _Cell) -> Iterator[str]:
"""Generate the text of each paragraph or table in `cell` as a separate string.
A table nested in `cell` is converted to HTML and emitted as that string.
A table nested in `cell` is converted to the normalized text it contains.
"""
for block_item in cell.iter_inner_content():
if isinstance(block_item, Paragraph):
if isinstance(paragraph := block_item, Paragraph):
# -- all docx content is ultimately in a paragraph; a nested table contributes
# -- structure only
yield f"{html.escape(block_item.text)}"
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
block_item, DocxTable
):
yield self._convert_table_to_html(block_item, is_nested=True)
yield paragraph.text
elif isinstance(table := block_item, DocxTable):
for row in table.rows:
yield from iter_row_cells_as_text(row)

def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
"""Generate the text of each cell in `row` as a separate string.
"""Generate the normalized text of each cell in `row` as a separate string.
The text of each paragraph within a cell is separated from the next by a newline
(`"\n"`). A table nested in a cell is first converted to HTML and then included as a
string, also separated by a newline.
The text of each paragraph within a cell is joined to the next by a single space and
runs of whitespace are collapsed. A table nested in a cell is converted to a normalized
string of its contents and combined with the text of the cell that contains the table.
"""
# -- each omitted cell at the start of the row (pretty rare) gets the empty string --
# -- Each omitted cell at the start of the row (pretty rare) gets the empty string.
# -- This preserves column alignment when one or more initial cells are omitted.
for _ in range(row.grid_cols_before):
yield ""

for cell in row.cells:
yield "\n".join(iter_cell_block_items(cell))
cell_text = " ".join(iter_cell_block_items(cell))
yield " ".join(cell_text.split())

# -- each omitted cell at the end of the row (also rare) gets the empty string --
# -- Each omitted cell at the end of the row (also rare) gets the empty string. --
for _ in range(row.grid_cols_after):
yield ""

return tabulate(
[list(iter_row_cells_as_text(row)) for row in table.rows],
headers=[] if is_nested else "firstrow",
# -- tabulate isn't really designed for recursive tables so we have to do any
# -- HTML-escaping for ourselves. `unsafehtml` disables tabulate html-escaping of cell
# -- contents.
tablefmt="unsafehtml",
)
return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows])

@lazyproperty
def _document(self) -> Document:
Expand Down

0 comments on commit 03e0ed3

Please sign in to comment.