Skip to content

Commit

Permalink
rfctr(docx): organize docx tests (#3070)
Browse files Browse the repository at this point in the history
**Summary**
In preparation for adding DOCX pluggable image extraction, organize a few
of the DOCX tests to be parallel to very similar tests for the DOC and
ODT partitioners.
  • Loading branch information
scanny authored May 21, 2024
1 parent 7832dfc commit 30e5a0c
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 28 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.2-dev0
## 0.14.2-dev1

### Enhancements

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ line-length = 100

[tool.pyright]
pythonPlatform = "Linux"
pythonVersion = "3.8"
pythonVersion = "3.9"
reportUnnecessaryCast = true
reportUnnecessaryTypeIgnoreComment = true
stubPath = "./typings"
Expand Down
54 changes: 29 additions & 25 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ def test_partition_docx_from_filename(
assert {element.metadata.detection_origin for element in elements} == {"docx"}


def test_partition_docx_from_filename_with_metadata_filename(mock_document_file_path: str):
elements = partition_docx(mock_document_file_path, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)


def test_partition_docx_with_spooled_file(
mock_document_file_path: str, expected_elements: list[Text]
):
Expand All @@ -92,16 +87,6 @@ def test_partition_docx_from_file(mock_document_file_path: str, expected_element
assert element.metadata.filename is None


def test_partition_docx_from_file_with_metadata_filename(
mock_document_file_path: str, expected_elements: list[Text]
):
with open(mock_document_file_path, "rb") as f:
elements = partition_docx(file=f, metadata_filename="test")
assert elements == expected_elements
for element in elements:
assert element.metadata.filename == "test"


def test_partition_docx_uses_file_path_when_both_are_specified(
mock_document_file_path: str, expected_elements: list[Text]
):
Expand Down Expand Up @@ -221,21 +206,37 @@ def test_partition_docx_detects_lists():
assert sum(1 for e in elements if isinstance(e, ListItem)) == 10


def test_partition_docx_from_filename_exclude_metadata():
# -- `include_metadata` arg ----------------------------------------------------------------------


def test_partition_docx_from_filename_excludes_metadata_when_so_instructed():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_metadata=False)
assert all(e.metadata.to_dict() == {} for e in elements)

assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None

def test_partition_docx_from_file_excludes_metadata_when_so_instructed():
with open(example_doc_path("simple.docx"), "rb") as f:
assert all(
element.metadata.to_dict() == {}
for element in partition_docx(file=f, include_metadata=False)
)

def test_partition_docx_from_file_exclude_metadata(mock_document_file_path: str):
with open(mock_document_file_path, "rb") as f:
elements = partition_docx(file=f, include_metadata=False)

assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
# -- .metadata.filename --------------------------------------------------------------------------


def test_partition_docx_from_filename_prefers_metadata_filename_when_provided():
elements = partition_docx(example_doc_path("simple.docx"), metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)


def test_partition_docx_from_file_prefers_metadata_filename_when_provided():
with open(example_doc_path("simple.docx"), "rb") as f:
elements = partition_docx(file=f, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)


# -- .metadata.last_modified ---------------------------------------------------------------------


def test_partition_docx_metadata_date(mocker: MockFixture):
Expand Down Expand Up @@ -307,6 +308,9 @@ def test_partition_docx_from_file_without_metadata_date():
assert elements[0].metadata.last_modified is None


# ------------------------------------------------------------------------------------------------


def test_get_emphasized_texts_from_paragraph(
opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.2-dev0" # pragma: no cover
__version__ = "0.14.2-dev1" # pragma: no cover

0 comments on commit 30e5a0c

Please sign in to comment.