diff --git a/CHANGELOG.md b/CHANGELOG.md index 575bda24ad..13bb8014c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.1-dev4 +## 0.15.1-dev5 ### Enhancements @@ -14,6 +14,7 @@ * **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object. * **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server. * **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type. +* **DOC, PPT, XLS, and MSG files are now auto-detected correctly.** Resolves a bug where DOC, PPT, and XLS files were auto-detected as MSG files under certain circumstances. ## 0.15.0 diff --git a/example-docs/not-unstructured-payload.json b/example-docs/not-unstructured-payload.json new file mode 100644 index 0000000000..14a129eede --- /dev/null +++ b/example-docs/not-unstructured-payload.json @@ -0,0 +1,5 @@ +{ + "id": "Sample-1", + "name": "Sample 1", + "description": "This is sample data #1" +} diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index ef66a22970..deed06dff1 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -19,9 +19,11 @@ ) from unstructured.file_utils.filetype import ( _FileTypeDetectionContext, + _OleFileDifferentiator, _TextFileDifferentiator, _ZipFileDifferentiator, detect_filetype, + is_json_processable, ) from unstructured.file_utils.model import FileType @@ -185,6 +187,46 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c assert file_type is expected_value +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOC, "simple.doc"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.XLS, "tests-example.xls"), + (FileType.MSG, "fake-email-multiple-attachments.msg"), + ], +) +@pytest.mark.parametrize( + "content_type", + [ + "application/msword", + "application/vnd.ms-outlook", + "application/vnd.ms-powerpoint", + "application/vnd.ms-excel", + "anything/else", + ], +) +def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock +): + """Fixes wrong XLS asserted as DOC, PPT, etc. + + Asserted content-type can be anything except `None` and differentiator will fix it if the file + is DOC, PPT, XLS, or MSG type. + """ + # -- disable strategies 2 & 3, content-type strategy should get this on its own -- + ctx_mime_type_.return_value = None + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + file_type = detect_filetype(file=file, content_type=content_type) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall-back to strategy 2 for any of these test cases. + ctx_mime_type_.assert_not_called() + assert file_type is expected_value + + # ================================================================================================ # STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC # ================================================================================================ @@ -264,6 +306,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec [ (FileType.BMP, "img/bmp_24.bmp"), (FileType.CSV, "stanley-cups.csv"), + (FileType.DOC, "simple.doc"), (FileType.DOCX, "simple.docx"), (FileType.EML, "eml/fake-email.eml"), (FileType.EPUB, "winter-sports.epub"), @@ -271,14 +314,17 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec (FileType.HTML, "ideas-page.html"), (FileType.JPG, "img/example.jpg"), (FileType.JSON, "spring-weather.html.json"), + (FileType.MSG, "fake-email.msg"), (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), + (FileType.PPT, "fake-power-point.ppt"), (FileType.PPTX, "fake-power-point.pptx"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TXT, "norwich-city.txt"), (FileType.WAV, "CantinaBand3.wav"), + (FileType.XLS, "tests-example.xls"), (FileType.XLSX, "stanley-cups.xlsx"), (FileType.XML, "factbook.xml"), (FileType.ZIP, "simple.zip"), @@ -290,11 +336,7 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ """Does not work for all types, in particular: TODOs: - - DOC is misidentified as MSG, TODO on that below. - - MSG is misidentified as UNK, but only on CI. - - PPT is misidentified as MSG, same fix as DOC. - TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator` - - XLS is misidentified as MSG, same fix as DOC. NOCANDOs: w/o an extension I think these are the best we can do. - MD is identified as TXT @@ -309,25 +351,44 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ assert detect_filetype(file=file) is expected_value -# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right -# ("application/msword"). Need a differentiator for "application/x-ole-storage". -@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True) @pytest.mark.parametrize( ("expected_value", "file_name"), [ (FileType.DOC, "simple.doc"), (FileType.PPT, "fake-power-point.ppt"), (FileType.XLS, "tests-example.xls"), - # -- only fails on CI, maybe different libmagic version or "magic-files" -- - # (FileType.MSG, "fake-email.msg"), + (FileType.MSG, "fake-email-multiple-attachments.msg"), ], ) -def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type( - file_name: str, expected_value: FileType +@pytest.mark.parametrize( + "guessed_mime_type", + [ + "application/msword", + "application/vnd.ms-excel", + "application/vnd.ms-outlook", + "application/vnd.ms-powerpoint", + "application/x-ole-storage", + "anything/else", + ], +) +def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type( + file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock ): + """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc. + + It's better than that actually, the OLE differentiator will get the right file-type for any DOC, + PPT, XLS, or MSG file, regardless of guessed MIME-type. + """ + ctx_mime_type_.return_value = guessed_mime_type + # -- disable strategy 3 by not providing a file-name source -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) - assert detect_filetype(file=file) is expected_value + + # -- disable strategy 1 by not asserting a content-type -- + file_type = detect_filetype(file=file) + + ctx_mime_type_.assert_called_with() + assert file_type is expected_value @pytest.mark.parametrize( @@ -454,6 +515,7 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil [ (FileType.BMP, "img/bmp_24.bmp", "application/zip"), (FileType.DOC, "simple.doc", None), + (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"), (FileType.MSG, "fake-email.msg", "application/octet-stream"), ], ) @@ -575,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas() assert detect_filetype(file=f) == FileType.CSV +# ================================================================================================ +# Describe `is_json_processable()` +# ================================================================================================ + + +def it_affirms_JSON_is_array_of_objects_from_a_file_path(): + assert is_json_processable(example_doc_path("simple.json")) is True + + +def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path(): + assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False + + +def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes(): + with open(example_doc_path("simple.json"), "rb") as f: + assert is_json_processable(file=f) is True + + +def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes(): + with open(example_doc_path("not-unstructured-payload.json"), "rb") as f: + assert is_json_processable(file=f) is False + + +def it_affirms_JSON_is_array_of_objects_from_text(): + with open(example_doc_path("simple.json")) as f: + text = f.read() + assert is_json_processable(file_text=text) is True + + +def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text(): + with open(example_doc_path("not-unstructured-payload.json")) as f: + text = f.read() + assert is_json_processable(file_text=text) is False + + # ================================================================================================ # MODULE-LEVEL FIXTURES # ================================================================================================ @@ -891,6 +988,52 @@ def mime_type_prop_(self, request: FixtureRequest): return property_mock(request, _FileTypeDetectionContext, "mime_type") +class Describe_OleFileDifferentiator: + """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`.""" + + # -- .applies() --------------------------------------------- + + def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): + """The constructor determines whether this differentiator is applicable. + + It returns an instance only when differentiating a CFBF file-type is required, which it + judges by inspecting the initial bytes of the file for the CFBF magic-bytes. + """ + ctx = _FileTypeDetectionContext(example_doc_path("simple.doc")) + + differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar") + + assert differentiator is not None + assert isinstance(differentiator, _OleFileDifferentiator) + + def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self): + ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub")) + assert _OleFileDifferentiator.applies(ctx, "application/epub") is None + + # -- .file_type --------------------------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), + [ + ("simple.doc", FileType.DOC), + ("fake-power-point.ppt", FileType.PPT), + ("tests-example.xls", FileType.XLS), + ("fake-email.msg", FileType.MSG), + ("README.org", None), + ], + ) + def it_distinguishes_the_file_type_of_applicable_zip_files( + self, file_name: str, expected_value: FileType | None + ): + # -- no file-name available, just to make sure we're not relying on an extension -- + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + ctx = _FileTypeDetectionContext(file=file) + differentiator = _OleFileDifferentiator(ctx) + + assert differentiator.file_type is expected_value + + class Describe_TextFileDifferentiator: """Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`.""" diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 3e3d4c6b96..d44a4a65c6 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti @pytest.mark.parametrize( - "filetype", + "file_type", [ t for t in FileType - if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP) - and t.partitioner_function_name != "partition_image" + if t + not in ( + FileType.EMPTY, + FileType.JSON, + FileType.UNK, + FileType.WAV, + FileType.XLS, + FileType.ZIP, + ) + and t.partitioner_shortname != "image" ], ) -def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType): - extension = filetype.name.lower() - # -- except for two oddballs, the shortname is the extension -- - partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension) - partition_fn_name = f"partition_{partitioner_shortname}" - module = import_module(f"unstructured.partition.{partitioner_shortname}") +def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType): + partition_fn_name = file_type.partitioner_function_name + module = import_module(file_type.partitioner_module_qname) partition_fn = getattr(module, partition_fn_name) # -- partition the first example-doc with the extension for this filetype -- elements: list[Element] = [] - doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("") + doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("") + extensions = file_type._extensions for file in pathlib.Path(doc_path).iterdir(): - if file.is_file() and file.suffix == f".{extension}": + if file.is_file() and file.suffix in extensions: elements = partition_fn(str(file)) break assert elements assert all( - e.metadata.filetype == filetype.mime_type + e.metadata.filetype == file_type.mime_type for e in elements if e.metadata.filetype is not None ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a80302f8e7..c3eea2bc92 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.1-dev4" # pragma: no cover +__version__ = "0.15.1-dev5" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 251f333615..c5bd11509e 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -112,12 +112,12 @@ def is_json_processable( file is JSON. """ exactly_one(filename=filename, file=file, file_text=file_text) + if file_text is None: - file_text = _read_file_start_for_type_check( - file=file, - filename=filename, - encoding=encoding, - ) + file_text = _FileTypeDetectionContext.new( + file_path=filename, file=file, encoding=encoding + ).text_head + return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None @@ -161,6 +161,11 @@ def _file_type_from_content_type(self) -> FileType | None: if not content_type: return None + # -- OLE-based file-format content_type values are sometimes unreliable. These are + # -- DOC, PPT, XLS, and MSG. + if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type): + return differentiator.file_type + # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): return differentiator.file_type @@ -185,9 +190,8 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: if mime_type is None: return None - # NOTE(Crag): older magic lib does not differentiate between xls and doc - if mime_type == "application/msword" and extension == ".xls": - return FileType.XLS + if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type): + return differentiator.file_type if mime_type.endswith("xml"): return FileType.HTML if extension in (".html", ".htm") else FileType.XML @@ -248,7 +252,7 @@ def __init__( content_type: str | None = None, metadata_file_path: str | None = None, ): - self._file_path = file_path + self._file_path_arg = file_path self._file_arg = file self._encoding_arg = encoding self._content_type = content_type @@ -261,9 +265,9 @@ def new( file_path: str | None, file: IO[bytes] | None, encoding: str | None, - content_type: str | None, - metadata_file_path: str | None, - ): + content_type: str | None = None, + metadata_file_path: str | None = None, + ) -> _FileTypeDetectionContext: self = cls( file_path=file_path, file=file, @@ -320,7 +324,10 @@ def file_path(self) -> str | None: None when the caller specified the source as a file-like object instead. Useful for user feedback on an error, but users of context should have little use for it otherwise. """ - return self._file_path + if (file_path := self._file_path_arg) is None: + return None + + return os.path.realpath(file_path) if os.path.islink(file_path) else file_path @lazyproperty def is_zipfile(self) -> bool: @@ -351,19 +358,19 @@ def mime_type(self) -> str | None: A `str` return value is always in lower-case. """ + file_path = self.file_path + if LIBMAGIC_AVAILABLE: import magic mime_type = ( - magic.from_file(_resolve_symlink(self._file_path), mime=True) - if self._file_path + magic.from_file(file_path, mime=True) + if file_path else magic.from_buffer(self.file_head, mime=True) ) return mime_type.lower() if mime_type else None - mime_type = ( - ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head) - ) + mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head) if mime_type is None: logger.warning( @@ -387,8 +394,8 @@ def open(self) -> Iterator[IO[bytes]]: File is guaranteed to be at read position 0 when called. """ - if self._file_path: - with open(self._file_path, "rb") as f: + if self.file_path: + with open(self.file_path, "rb") as f: yield f else: file = self._file_arg @@ -416,7 +423,7 @@ def text_head(self) -> str: else content.decode(encoding=self.encoding, errors="ignore") ) - file_path = self._file_path + file_path = self.file_path assert file_path is not None # -- guaranteed by `._validate` -- try: @@ -429,12 +436,61 @@ def text_head(self) -> str: def _validate(self) -> None: """Raise if the context is invalid.""" - if self._file_path and not os.path.isfile(self._file_path): - raise FileNotFoundError(f"no such file {self._file_path}") - if not self._file_path and not self._file_arg: + if self.file_path and not os.path.isfile(self.file_path): + raise FileNotFoundError(f"no such file {self._file_path_arg}") + if not self.file_path and not self._file_arg: raise ValueError("either `file_path` or `file` argument must be provided") +class _OleFileDifferentiator: + """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be. + + Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office + files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as + `"application/x-ole-storage"` which is true but too not specific enough for partitioning + purposes. + """ + + def __init__(self, ctx: _FileTypeDetectionContext): + self._ctx = ctx + + @classmethod + def applies( + cls, ctx: _FileTypeDetectionContext, mime_type: str + ) -> _OleFileDifferentiator | None: + """Constructs an instance, but only if this differentiator applies for `mime_type`.""" + return cls(ctx) if cls._is_ole_file(ctx) else None + + @property + def file_type(self) -> FileType | None: + """Differentiated file-type for Microsoft Compound File Binary Format (CFBF). + + Returns one of: + - `FileType.DOC` + - `FileType.PPT` + - `FileType.XLS` + - `FileType.MSG` + """ + # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return + # -- `None` to trigger fall-back to next strategy. + if not self._is_ole_file(self._ctx): + return None + + # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to + # -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file + # -- that is not a legacy MS-Office type to be a MSG file. + with self._ctx.open() as file: + mime_type = ft.guess_mime(file) + + return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook") + + @staticmethod + def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: + """True when file has CFBF magic first 8 bytes.""" + with ctx.open() as file: + return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" + + class _TextFileDifferentiator: """Refine a textual file-type that may not be as specific as it could be.""" @@ -597,45 +653,6 @@ def file_type(self) -> FileType | None: return FileType.ZIP -def _read_file_start_for_type_check( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - encoding: Optional[str] = "utf-8", -) -> str: - """Reads the start of the file and returns the text content.""" - exactly_one(filename=filename, file=file) - - if file is not None: - file.seek(0) - file_content = file.read(4096) - if isinstance(file_content, str): - file_text = file_content - else: - file_text = file_content.decode(errors="ignore") - file.seek(0) - return file_text - - # -- guaranteed by `exactly_one()` call -- - assert filename is not None - - try: - with open(filename, encoding=encoding) as f: - file_text = f.read(4096) - except UnicodeDecodeError: - formatted_encoding, _ = detect_file_encoding(filename=filename) - with open(filename, encoding=formatted_encoding) as f: - file_text = f.read(4096) - - return file_text - - -def _resolve_symlink(file_path: str) -> str: - """Resolve `file_path` containing symlink to the actual file path.""" - if os.path.islink(file_path): - file_path = os.path.realpath(file_path) - return file_path - - _P = ParamSpec("_P") diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index 0fe0caa63f..e4c567975f 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -286,7 +286,7 @@ def partitioner_shortname(self) -> str | None: "msg", [".msg"], "application/vnd.ms-outlook", - ["application/x-ole-storage"], + cast(list[str], []), ) ODT = ( "odt",