Skip to content

Commit

Permalink
fix(file): fix OLE-based file-type auto-detection (#3437)
Browse files Browse the repository at this point in the history
**Summary**
A DOC, PPT, or XLS file sent to `partition()` as a file-like object is
misidentified as an MSG file and raises an exception in python-oxmsg
(which is used to process MSG files).

**Fix**
DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, a.k.a. Compound
File Binary Format (CFBF). These can be reliably distinguished by
inspecting magic bytes in certain locations. `libmagic` is unreliable at
this (or doesn't attempt it), reporting the generic `"application/x-ole-storage"`
MIME-type, which corresponds to the "container" CFBF format (vaguely like a
Microsoft Zip format) that all these document types are stored in.

Unconditionally use `filetype.guess_mime()` provided by the `filetype`
package that is part of the base unstructured install. Unlike
`libmagic`, this package reliably detects the distinguished MIME-type
(e.g. `"application/msword"`) for OLE file subtypes.

Fixes #3364
  • Loading branch information
scanny authored Jul 25, 2024
1 parent 432d209 commit 4e61acc
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 90 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.15.1-dev4
## 0.15.1-dev5

### Enhancements

Expand All @@ -14,6 +14,7 @@
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type.
* **DOC, PPT, XLS, and MSG files are now auto-detected correctly.** Resolves a bug where DOC, PPT, and XLS files were auto-detected as MSG files under certain circumstances.

## 0.15.0

Expand Down
5 changes: 5 additions & 0 deletions example-docs/not-unstructured-payload.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"id": "Sample-1",
"name": "Sample 1",
"description": "This is sample data #1"
}
167 changes: 155 additions & 12 deletions test_unstructured/file_utils/test_filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
)
from unstructured.file_utils.filetype import (
_FileTypeDetectionContext,
_OleFileDifferentiator,
_TextFileDifferentiator,
_ZipFileDifferentiator,
detect_filetype,
is_json_processable,
)
from unstructured.file_utils.model import FileType

Expand Down Expand Up @@ -185,6 +187,46 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c
assert file_type is expected_value


@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.MSG, "fake-email-multiple-attachments.msg"),
    ],
)
@pytest.mark.parametrize(
    "content_type",
    [
        "application/msword",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/vnd.ms-excel",
        "anything/else",
    ],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
    file_name: str,
    content_type: str,
    expected_value: FileType,
    ctx_mime_type_: Mock,
):
    """A DOC, PPT, XLS, or MSG file is identified correctly whatever content-type is asserted.

    Every example file is paired with every asserted content-type, so each file is checked
    against both matching and mismatching assertions (e.g. an XLS asserted as DOC). The asserted
    content-type may be anything except `None`.
    """
    # -- the guessed MIME-type is mocked to be unavailable; the assertion at the end confirms
    # -- detection never consulted it (i.e. never fell back to strategy 2) --
    ctx_mime_type_.return_value = None
    with open(example_doc_path(file_name), "rb") as doc_file:
        file_bytes = doc_file.read()

    # -- an in-memory stream provides no file-name, so extension-based detection (strategy 3)
    # -- has nothing to work with --
    detected_file_type = detect_filetype(file=io.BytesIO(file_bytes), content_type=content_type)

    ctx_mime_type_.assert_not_called()
    assert detected_file_type is expected_value


# ================================================================================================
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
# ================================================================================================
Expand Down Expand Up @@ -264,21 +306,25 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
[
(FileType.BMP, "img/bmp_24.bmp"),
(FileType.CSV, "stanley-cups.csv"),
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EML, "eml/fake-email.eml"),
(FileType.EPUB, "winter-sports.epub"),
(FileType.HEIC, "img/DA-1p.heic"),
(FileType.HTML, "ideas-page.html"),
(FileType.JPG, "img/example.jpg"),
(FileType.JSON, "spring-weather.html.json"),
(FileType.MSG, "fake-email.msg"),
(FileType.ODT, "simple.odt"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.RTF, "fake-doc.rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
(FileType.TXT, "norwich-city.txt"),
(FileType.WAV, "CantinaBand3.wav"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.XML, "factbook.xml"),
(FileType.ZIP, "simple.zip"),
Expand All @@ -290,11 +336,7 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
"""Does not work for all types, in particular:
TODOs:
- DOC is misidentified as MSG, TODO on that below.
- MSG is misidentified as UNK, but only on CI.
- PPT is misidentified as MSG, same fix as DOC.
- TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator`
- XLS is misidentified as MSG, same fix as DOC.
NOCANDOs: w/o an extension I think these are the best we can do.
- MD is identified as TXT
Expand All @@ -309,25 +351,44 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
assert detect_filetype(file=file) is expected_value


# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right
# ("application/msword"). Need a differentiator for "application/x-ole-storage".
@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True)
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
# -- only fails on CI, maybe different libmagic version or "magic-files" --
# (FileType.MSG, "fake-email.msg"),
(FileType.MSG, "fake-email-multiple-attachments.msg"),
],
)
def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type(
file_name: str, expected_value: FileType
@pytest.mark.parametrize(
"guessed_mime_type",
[
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-outlook",
"application/vnd.ms-powerpoint",
"application/x-ole-storage",
"anything/else",
],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
"""Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
PPT, XLS, or MSG file, regardless of guessed MIME-type.
"""
ctx_mime_type_.return_value = guessed_mime_type
# -- disable strategy 3 by not providing a file-name source --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
assert detect_filetype(file=file) is expected_value

# -- disable strategy 1 by not asserting a content-type --
file_type = detect_filetype(file=file)

ctx_mime_type_.assert_called_with()
assert file_type is expected_value


@pytest.mark.parametrize(
Expand Down Expand Up @@ -454,6 +515,7 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
[
(FileType.BMP, "img/bmp_24.bmp", "application/zip"),
(FileType.DOC, "simple.doc", None),
(FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
(FileType.MSG, "fake-email.msg", "application/octet-stream"),
],
)
Expand Down Expand Up @@ -575,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
assert detect_filetype(file=f) == FileType.CSV


# ================================================================================================
# Describe `is_json_processable()`
# ================================================================================================


def it_affirms_JSON_is_array_of_objects_from_a_file_path():
    """`is_json_processable()` returns `True` for `simple.json` specified by path."""
    json_path = example_doc_path("simple.json")
    assert is_json_processable(json_path) is True


def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
    """`is_json_processable()` returns `False` for the single-object payload given by path."""
    json_path = example_doc_path("not-unstructured-payload.json")
    assert is_json_processable(json_path) is False


def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
    """`is_json_processable()` returns `True` for `simple.json` passed as a binary file object."""
    with open(example_doc_path("simple.json"), "rb") as json_file:
        is_processable = is_json_processable(file=json_file)

    assert is_processable is True


def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
    """`is_json_processable()` returns `False` for the single-object payload as a binary file."""
    with open(example_doc_path("not-unstructured-payload.json"), "rb") as json_file:
        is_processable = is_json_processable(file=json_file)

    assert is_processable is False


def it_affirms_JSON_is_array_of_objects_from_text():
    """`is_json_processable()` returns `True` for the text of `simple.json`."""
    # -- decode explicitly as UTF-8 so the test does not depend on the platform's locale
    # -- default encoding --
    with open(example_doc_path("simple.json"), encoding="utf-8") as f:
        text = f.read()

    assert is_json_processable(file_text=text) is True


def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
    """`is_json_processable()` returns `False` for the text of the single-object payload."""
    # -- decode explicitly as UTF-8 so the test does not depend on the platform's locale
    # -- default encoding --
    with open(example_doc_path("not-unstructured-payload.json"), encoding="utf-8") as f:
        text = f.read()

    assert is_json_processable(file_text=text) is False


# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
Expand Down Expand Up @@ -891,6 +988,52 @@ def mime_type_prop_(self, request: FixtureRequest):
return property_mock(request, _FileTypeDetectionContext, "mime_type")


class Describe_OleFileDifferentiator:
    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""

    # -- .applies() ---------------------------------------------

    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
        """The constructor determines whether this differentiator is applicable.

        It returns an instance only when differentiating a CFBF file-type is required, which it
        judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
        """
        ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))

        # -- "foo/bar" is a deliberately meaningless MIME-type; a differentiator is still
        # -- expected for this DOC file --
        differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")

        assert differentiator is not None
        assert isinstance(differentiator, _OleFileDifferentiator)

    def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
        """No differentiator is constructed for an EPUB example file."""
        ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
        assert _OleFileDifferentiator.applies(ctx, "application/epub") is None

    # -- .file_type ---------------------------------------------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("simple.doc", FileType.DOC),
            ("fake-power-point.ppt", FileType.PPT),
            ("tests-example.xls", FileType.XLS),
            ("fake-email.msg", FileType.MSG),
            ("README.org", None),
        ],
    )
    def it_distinguishes_the_file_type_of_applicable_OLE_files(
        self, file_name: str, expected_value: FileType | None
    ):
        """`.file_type` is the specific OLE subtype, or `None` when no subtype is expected.

        The `README.org` case expects `None`; the other four cases expect DOC, PPT, XLS, and MSG
        respectively.
        """
        # -- no file-name available, just to make sure we're not relying on an extension --
        with open(example_doc_path(file_name), "rb") as f:
            file = io.BytesIO(f.read())
        ctx = _FileTypeDetectionContext(file=file)
        differentiator = _OleFileDifferentiator(ctx)

        assert differentiator.file_type is expected_value


class Describe_TextFileDifferentiator:
"""Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""

Expand Down
30 changes: 18 additions & 12 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti


@pytest.mark.parametrize(
"filetype",
"file_type",
[
t
for t in FileType
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
and t.partitioner_function_name != "partition_image"
if t
not in (
FileType.EMPTY,
FileType.JSON,
FileType.UNK,
FileType.WAV,
FileType.XLS,
FileType.ZIP,
)
and t.partitioner_shortname != "image"
],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
extension = filetype.name.lower()
# -- except for two oddballs, the shortname is the extension --
partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
partition_fn_name = f"partition_{partitioner_shortname}"
module = import_module(f"unstructured.partition.{partitioner_shortname}")
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
partition_fn_name = file_type.partitioner_function_name
module = import_module(file_type.partitioner_module_qname)
partition_fn = getattr(module, partition_fn_name)

# -- partition the first example-doc with the extension for this filetype --
elements: list[Element] = []
doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
extensions = file_type._extensions
for file in pathlib.Path(doc_path).iterdir():
if file.is_file() and file.suffix == f".{extension}":
if file.is_file() and file.suffix in extensions:
elements = partition_fn(str(file))
break

assert elements
assert all(
e.metadata.filetype == filetype.mime_type
e.metadata.filetype == file_type.mime_type
for e in elements
if e.metadata.filetype is not None
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.1-dev4" # pragma: no cover
__version__ = "0.15.1-dev5" # pragma: no cover
Loading

0 comments on commit 4e61acc

Please sign in to comment.