Skip to content

Commit

Permalink
rfctr(file): refactor detect_filetype() (#3429)
Browse files Browse the repository at this point in the history
**Summary**
In preparation for fixing a cluster of bugs with automatic file-type
detection and paving the way for some reliability improvements, refactor
`unstructured.file_utils.filetype` module and improve thoroughness of
tests.

**Additional Context**
Factor type-recognition process into three distinct strategies that are
attempted in sequence. Attempted in order of preference,
type-recognition falls to the next strategy when the one before it is
not applicable or cannot determine the file-type. This provides a clear
basis for organizing the code and tests at the top level.

Consolidate the existing tests around these strategies, adding
additional cases to achieve better coverage.

Several bugs were uncovered in the process. Small ones were just fixed,
bigger ones will be remedied in following PRs.
  • Loading branch information
scanny authored Jul 23, 2024
1 parent 441b339 commit 3fe5c09
Show file tree
Hide file tree
Showing 13 changed files with 1,466 additions and 736 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.1-dev1
## 0.15.1-dev2

### Enhancements

Expand All @@ -7,6 +7,10 @@
### Fixes

* **Update import of Pinecone exception** Adds compatibility for pinecone-client>=5.0.0
* **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type specific partitioner when the file cannot be opened.
* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments.
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.

## 0.15.0

Expand Down
Binary file added example-docs/simple.pptx
Binary file not shown.
1,191 changes: 806 additions & 385 deletions test_unstructured/file_utils/test_filetype.py

Large diffs are not rendered by default.

72 changes: 49 additions & 23 deletions test_unstructured/file_utils/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@
class DescribeFileType:
"""Unit-test suite for `unstructured.file_utils.model.Filetype`."""

# -- .__lt__() ----------------------------------------------

def it_is_a_collection_ordered_by_name_and_can_be_sorted(self):
"""FileType is a total order on name, e.g. FileType.A < FileType.B."""
assert FileType.EML < FileType.HTML < FileType.XML

# -- .from_extension() --------------------------------------

@pytest.mark.parametrize(
("ext", "file_type"),
[
Expand All @@ -23,10 +31,12 @@ class DescribeFileType:
def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None):
assert FileType.from_extension(ext) is file_type

@pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
@pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", ".", None])
def but_not_when_that_extension_is_empty_or_None_or_not_registered(self, ext: str | None):
assert FileType.from_extension(ext) is None

# -- .from_mime_type() --------------------------------------

@pytest.mark.parametrize(
("mime_type", "file_type"),
[
Expand All @@ -46,29 +56,13 @@ def it_can_recognize_a_file_type_from_a_mime_type(
):
assert FileType.from_mime_type(mime_type) is file_type

@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
@pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
self, mime_type: str | None
):
assert FileType.from_mime_type(mime_type) is None

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, ("unstructured_inference",)),
(FileType.CSV, ("pandas",)),
(FileType.DOC, ("docx",)),
(FileType.EMPTY, ()),
(FileType.HTML, ()),
(FileType.ODT, ("docx", "pypandoc")),
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
(FileType.UNK, ()),
(FileType.WAV, ()),
(FileType.ZIP, ()),
],
)
def it_knows_which_importable_packages_its_partitioner_depends_on(
self, file_type: FileType, expected_value: tuple[str, ...]
):
assert file_type.importable_package_dependencies == expected_value
# -- .extra_name --------------------------------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
Expand All @@ -91,6 +85,30 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies
):
assert file_type.extra_name == expected_value

# -- .importable_package_dependencies -----------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
(FileType.BMP, ("unstructured_inference",)),
(FileType.CSV, ("pandas",)),
(FileType.DOC, ("docx",)),
(FileType.EMPTY, ()),
(FileType.HTML, ()),
(FileType.ODT, ("docx", "pypandoc")),
(FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
(FileType.UNK, ()),
(FileType.WAV, ()),
(FileType.ZIP, ()),
],
)
def it_knows_which_importable_packages_its_partitioner_depends_on(
self, file_type: FileType, expected_value: tuple[str, ...]
):
assert file_type.importable_package_dependencies == expected_value

# -- .is_partitionable --------------------------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
Expand All @@ -112,6 +130,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable(
):
assert file_type.is_partitionable is expected_value

# -- .mime_type ---------------------------------------------

@pytest.mark.parametrize(
("file_type", "mime_type"),
[
Expand All @@ -131,6 +151,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable(
def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
assert file_type.mime_type == mime_type

# -- .partitioner_function_name -----------------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
Expand All @@ -155,6 +177,8 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_
with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
file_type.partitioner_function_name

# -- .partitioner_module_qname ------------------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
Expand All @@ -181,6 +205,8 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p
with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
file_type.partitioner_module_qname

# -- .partitioner_shortname ---------------------------------

@pytest.mark.parametrize(
("file_type", "expected_value"),
[
Expand Down
23 changes: 14 additions & 9 deletions test_unstructured/metrics/test_element_type.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from __future__ import annotations

import pytest

from test_unstructured.unit_utils import example_doc_path
from unstructured.metrics.element_type import (
FrequencyDict,
calculate_element_type_percent_match,
get_element_type_frequency,
)
Expand All @@ -14,10 +18,9 @@
(
"fake-email.txt",
{
("UncategorizedText", None): 6,
("NarrativeText", None): 1,
("Title", None): 1,
("ListItem", None): 2,
("Title", None): 5,
("NarrativeText", None): 2,
},
),
(
Expand All @@ -34,8 +37,8 @@
),
],
)
def test_get_element_type_frequency(filename, frequency):
elements = partition(filename=f"example-docs/{filename}")
def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, int | None], int]):
elements = partition(example_doc_path(filename))
elements_freq = get_element_type_frequency(elements_to_json(elements))
assert elements_freq == frequency

Expand All @@ -46,11 +49,11 @@ def test_get_element_type_frequency(filename, frequency):
(
"fake-email.txt",
{
("UncategorizedText", None): 14,
("Title", None): 1,
("ListItem", None): 2,
("NarrativeText", None): 2,
},
(0.56, 0.56, 0.56),
(0.8, 0.8, 0.80),
),
(
"sample-presentation.pptx",
Expand Down Expand Up @@ -92,8 +95,10 @@ def test_get_element_type_frequency(filename, frequency):
),
],
)
def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched):
elements = partition(filename=f"example-docs/{filename}")
def test_calculate_element_type_percent_match(
filename: str, expected_frequency: FrequencyDict, percent_matched: tuple[float, float, float]
):
elements = partition(example_doc_path(filename))
elements_frequency = get_element_type_frequency(elements_to_json(elements))
assert (
round(calculate_element_type_percent_match(elements_frequency, expected_frequency), 2)
Expand Down
8 changes: 6 additions & 2 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,11 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)

detect_filetype_.assert_called_once_with(
content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake"
file_path="made-up.fake",
file=None,
encoding=None,
content_type=None,
metadata_file_path=None,
)


Expand Down Expand Up @@ -1305,7 +1309,7 @@ def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_in
)
match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
with pytest.raises(ImportError, match=match):
partition(example_doc_path("layout-parser-paper-fast.pdf"))
partition(example_doc_path("pdf/layout-parser-paper-fast.pdf"))

dependency_exists_.assert_called_once_with("pdf2image")

Expand Down
38 changes: 19 additions & 19 deletions test_unstructured/partition/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import pytest
from pytest_mock import MockFixture

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.elements import CompositeElement
from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.email import partition_email
from unstructured.partition.html import partition_html
Expand Down Expand Up @@ -43,9 +43,9 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified():

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -72,9 +72,9 @@ def test_partition_json_from_filename(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_with_metadata_filename(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -97,9 +97,9 @@ def test_partition_json_from_filename_with_metadata_filename(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -126,9 +126,9 @@ def test_partition_json_from_file(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_with_metadata_filename(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -150,9 +150,9 @@ def test_partition_json_from_file_with_metadata_filename(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand Down Expand Up @@ -192,9 +192,9 @@ def test_partition_json_works_with_empty_list():


def test_partition_json_raises_with_too_many_specified():
path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
path = example_doc_path("fake-text.txt")
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand Down Expand Up @@ -225,9 +225,9 @@ def test_partition_json_raises_with_too_many_specified():

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_filename_exclude_metadata(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -249,9 +249,9 @@ def test_partition_json_from_filename_exclude_metadata(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_file_exclude_metadata(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand All @@ -274,9 +274,9 @@ def test_partition_json_from_file_exclude_metadata(filename: str):

@pytest.mark.parametrize("filename", test_files)
def test_partition_json_from_text_exclude_metadata(filename: str):
path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
path = example_doc_path(filename)
elements = []
filetype = detect_filetype(filename=path)
filetype = FileType.from_extension(os.path.splitext(path)[1])
if filetype == FileType.TXT:
elements = partition_text(filename=path)
if filetype == FileType.HTML:
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.1-dev1" # pragma: no cover
__version__ = "0.15.1-dev2" # pragma: no cover
Loading

0 comments on commit 3fe5c09

Please sign in to comment.