rfctr(file): refactor detect_filetype() (#3429)

**Summary** In preparation for fixing a cluster of bugs with automatic file-type detection and paving the way for some reliability improvements, refactor `unstructured.file_utils.filetype` module and improve thoroughness of tests. **Additional Context** Factor type-recognition process into three distinct strategies that are attempted in sequence. Attempted in order of preference, type-recognition falls to the next strategy when the one before it is not applicable or cannot determine the file-type. This provides a clear basis for organizing the code and tests at the top level. Consolidate the existing tests around these strategies, adding additional cases to achieve better coverage. Several bugs were uncovered in the process. Small ones were just fixed, bigger ones will be remedied in following PRs.
Unstructured-IO · Jul 23, 2024 · 3fe5c09 · 3fe5c09
1 parent 441b339
commit 3fe5c09
Show file tree

Hide file tree

Showing 13 changed files with 1,466 additions and 736 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.15.1-dev1
+## 0.15.1-dev2
 
 ### Enhancements
 
@@ -7,6 +7,10 @@
 ### Fixes
 
 * **Update import of Pinecone exception** Adds compatibility for pinecone-client>=5.0.0
+* **File-type detection catches non-existent file-path.** `detect_filetype()` no longer silently falls back to detecting a file-type based on the extension when no file exists at the path provided. Instead `FileNotFoundError` is raised. This provides consistent user notification of a mis-typed path rather than an unpredictable exception from a file-type specific partitioner when the file cannot be opened.
+* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments.
+* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
+* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
 
 ## 0.15.0
 

diff --git a/example-docs/simple.pptx b/example-docs/simple.pptx
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
diff --git a/test_unstructured/file_utils/test_model.py b/test_unstructured/file_utils/test_model.py
@@ -10,6 +10,14 @@
 class DescribeFileType:
     """Unit-test suite for `unstructured.file_utils.model.Filetype`."""
 
+    # -- .__lt__() ----------------------------------------------
+
+    def it_is_a_collection_ordered_by_name_and_can_be_sorted(self):
+        """FileType is a total order on name, e.g. FileType.A < FileType.B."""
+        assert FileType.EML < FileType.HTML < FileType.XML
+
+    # -- .from_extension() --------------------------------------
+
     @pytest.mark.parametrize(
         ("ext", "file_type"),
         [
@@ -23,10 +31,12 @@ class DescribeFileType:
     def it_can_recognize_a_file_type_from_an_extension(self, ext: str, file_type: FileType | None):
         assert FileType.from_extension(ext) is file_type
 
-    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", "."])
-    def but_not_when_that_extension_is_empty_or_not_registered(self, ext: str):
+    @pytest.mark.parametrize("ext", [".foobar", ".xyz", ".mdx", "", ".", None])
+    def but_not_when_that_extension_is_empty_or_None_or_not_registered(self, ext: str | None):
         assert FileType.from_extension(ext) is None
 
+    # -- .from_mime_type() --------------------------------------
+
     @pytest.mark.parametrize(
         ("mime_type", "file_type"),
         [
@@ -46,29 +56,13 @@ def it_can_recognize_a_file_type_from_a_mime_type(
     ):
         assert FileType.from_mime_type(mime_type) is file_type
 
-    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar"])
-    def but_not_when_that_mime_type_is_not_registered_by_a_file_type(self, mime_type: str):
+    @pytest.mark.parametrize("mime_type", ["text/css", "image/gif", "audio/mpeg", "foo/bar", None])
+    def but_not_when_that_mime_type_is_not_registered_by_a_file_type_or_None(
+        self, mime_type: str | None
+    ):
         assert FileType.from_mime_type(mime_type) is None
 
-    @pytest.mark.parametrize(
-        ("file_type", "expected_value"),
-        [
-            (FileType.BMP, ("unstructured_inference",)),
-            (FileType.CSV, ("pandas",)),
-            (FileType.DOC, ("docx",)),
-            (FileType.EMPTY, ()),
-            (FileType.HTML, ()),
-            (FileType.ODT, ("docx", "pypandoc")),
-            (FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
-            (FileType.UNK, ()),
-            (FileType.WAV, ()),
-            (FileType.ZIP, ()),
-        ],
-    )
-    def it_knows_which_importable_packages_its_partitioner_depends_on(
-        self, file_type: FileType, expected_value: tuple[str, ...]
-    ):
-        assert file_type.importable_package_dependencies == expected_value
+    # -- .extra_name --------------------------------------------
 
     @pytest.mark.parametrize(
         ("file_type", "expected_value"),
@@ -91,6 +85,30 @@ def and_it_knows_which_pip_extra_needs_to_be_installed_to_get_those_dependencies
     ):
         assert file_type.extra_name == expected_value
 
+    # -- .importable_package_dependencies -----------------------
+
+    @pytest.mark.parametrize(
+        ("file_type", "expected_value"),
+        [
+            (FileType.BMP, ("unstructured_inference",)),
+            (FileType.CSV, ("pandas",)),
+            (FileType.DOC, ("docx",)),
+            (FileType.EMPTY, ()),
+            (FileType.HTML, ()),
+            (FileType.ODT, ("docx", "pypandoc")),
+            (FileType.PDF, ("pdf2image", "pdfminer", "PIL")),
+            (FileType.UNK, ()),
+            (FileType.WAV, ()),
+            (FileType.ZIP, ()),
+        ],
+    )
+    def it_knows_which_importable_packages_its_partitioner_depends_on(
+        self, file_type: FileType, expected_value: tuple[str, ...]
+    ):
+        assert file_type.importable_package_dependencies == expected_value
+
+    # -- .is_partitionable --------------------------------------
+
     @pytest.mark.parametrize(
         ("file_type", "expected_value"),
         [
@@ -112,6 +130,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable(
     ):
         assert file_type.is_partitionable is expected_value
 
+    # -- .mime_type ---------------------------------------------
+
     @pytest.mark.parametrize(
         ("file_type", "mime_type"),
         [
@@ -131,6 +151,8 @@ def it_knows_whether_files_of_its_type_are_directly_partitionable(
     def it_knows_its_canonical_MIME_type(self, file_type: FileType, mime_type: str):
         assert file_type.mime_type == mime_type
 
+    # -- .partitioner_function_name -----------------------------
+
     @pytest.mark.parametrize(
         ("file_type", "expected_value"),
         [
@@ -155,6 +177,8 @@ def but_it_raises_on_partitioner_function_name_access_when_the_file_type_is_not_
         with pytest.raises(ValueError, match="`.partitioner_function_name` is undefined because "):
             file_type.partitioner_function_name
 
+    # -- .partitioner_module_qname ------------------------------
+
     @pytest.mark.parametrize(
         ("file_type", "expected_value"),
         [
@@ -181,6 +205,8 @@ def but_it_raises_on_partitioner_module_qname_access_when_the_file_type_is_not_p
         with pytest.raises(ValueError, match="`.partitioner_module_qname` is undefined because "):
             file_type.partitioner_module_qname
 
+    # -- .partitioner_shortname ---------------------------------
+
     @pytest.mark.parametrize(
         ("file_type", "expected_value"),
         [

diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import pytest
 
+from test_unstructured.unit_utils import example_doc_path
 from unstructured.metrics.element_type import (
+    FrequencyDict,
     calculate_element_type_percent_match,
     get_element_type_frequency,
 )
@@ -14,10 +18,9 @@
         (
             "fake-email.txt",
             {
-                ("UncategorizedText", None): 6,
+                ("NarrativeText", None): 1,
+                ("Title", None): 1,
                 ("ListItem", None): 2,
-                ("Title", None): 5,
-                ("NarrativeText", None): 2,
             },
         ),
         (
@@ -34,8 +37,8 @@
         ),
     ],
 )
-def test_get_element_type_frequency(filename, frequency):
-    elements = partition(filename=f"example-docs/{filename}")
+def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, int | None], int]):
+    elements = partition(example_doc_path(filename))
     elements_freq = get_element_type_frequency(elements_to_json(elements))
     assert elements_freq == frequency
 
@@ -46,11 +49,11 @@ def test_get_element_type_frequency(filename, frequency):
         (
             "fake-email.txt",
             {
-                ("UncategorizedText", None): 14,
+                ("Title", None): 1,
                 ("ListItem", None): 2,
                 ("NarrativeText", None): 2,
             },
-            (0.56, 0.56, 0.56),
+            (0.8, 0.8, 0.80),
         ),
         (
             "sample-presentation.pptx",
@@ -92,8 +95,10 @@ def test_get_element_type_frequency(filename, frequency):
         ),
     ],
 )
-def test_calculate_element_type_percent_match(filename, expected_frequency, percent_matched):
-    elements = partition(filename=f"example-docs/{filename}")
+def test_calculate_element_type_percent_match(
+    filename: str, expected_frequency: FrequencyDict, percent_matched: tuple[float, float, float]
+):
+    elements = partition(example_doc_path(filename))
     elements_frequency = get_element_type_frequency(elements_to_json(elements))
     assert (
         round(calculate_element_type_percent_match(elements_frequency, expected_frequency), 2)

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -929,7 +929,11 @@ def test_auto_partition_raises_with_bad_type(request: FixtureRequest):
         partition(filename="made-up.fake", strategy=PartitionStrategy.HI_RES)
 
     detect_filetype_.assert_called_once_with(
-        content_type=None, encoding=None, file=None, file_filename=None, filename="made-up.fake"
+        file_path="made-up.fake",
+        file=None,
+        encoding=None,
+        content_type=None,
+        metadata_file_path=None,
     )
 
 
@@ -1305,7 +1309,7 @@ def test_auto_partition_that_requires_extras_raises_when_dependencies_are_not_in
     )
     match = r"partition_pdf\(\) is not available because one or more dependencies are not installed"
     with pytest.raises(ImportError, match=match):
-        partition(example_doc_path("layout-parser-paper-fast.pdf"))
+        partition(example_doc_path("pdf/layout-parser-paper-fast.pdf"))
 
     dependency_exists_.assert_called_once_with("pdf2image")
 

diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py
@@ -9,8 +9,8 @@
 import pytest
 from pytest_mock import MockFixture
 
+from test_unstructured.unit_utils import example_doc_path
 from unstructured.documents.elements import CompositeElement
-from unstructured.file_utils.filetype import detect_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
@@ -43,9 +43,9 @@ def test_it_chunks_elements_when_a_chunking_strategy_is_specified():
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_filename(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -72,9 +72,9 @@ def test_partition_json_from_filename(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_filename_with_metadata_filename(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -97,9 +97,9 @@ def test_partition_json_from_filename_with_metadata_filename(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_file(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -126,9 +126,9 @@ def test_partition_json_from_file(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_file_with_metadata_filename(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -150,9 +150,9 @@ def test_partition_json_from_file_with_metadata_filename(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_text(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -192,9 +192,9 @@ def test_partition_json_works_with_empty_list():
 
 
 def test_partition_json_raises_with_too_many_specified():
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
+    path = example_doc_path("fake-text.txt")
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -225,9 +225,9 @@ def test_partition_json_raises_with_too_many_specified():
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_filename_exclude_metadata(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -249,9 +249,9 @@ def test_partition_json_from_filename_exclude_metadata(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_file_exclude_metadata(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:
@@ -274,9 +274,9 @@ def test_partition_json_from_file_exclude_metadata(filename: str):
 
 @pytest.mark.parametrize("filename", test_files)
 def test_partition_json_from_text_exclude_metadata(filename: str):
-    path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
+    path = example_doc_path(filename)
     elements = []
-    filetype = detect_filetype(filename=path)
+    filetype = FileType.from_extension(os.path.splitext(path)[1])
     if filetype == FileType.TXT:
         elements = partition_text(filename=path)
     if filetype == FileType.HTML:

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.1-dev1"  # pragma: no cover
+__version__ = "0.15.1-dev2"  # pragma: no cover
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.15.1-dev1" # pragma: no cover
		__version__ = "0.15.1-dev2" # pragma: no cover