spike: subsume _read_file_start_for_type_check()
Needs tests; consider reusing `_FileTypeDetectionContext.text_head`.
scanny committed Jul 25, 2024
1 parent e597c64 commit 3f87de3
Showing 4 changed files with 83 additions and 72 deletions.
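
In short, the commit folds the old `_read_file_start_for_type_check()` helper into `_FileTypeDetectionContext`, so `is_json_processable()` reads the head of the document through the context's `text_head` property (see the filetype.py diff below). The check itself decides whether that text looks like a JSON array of objects. A rough standalone sketch of that decision, using a hypothetical stand-in pattern (the real code matches `LIST_OF_DICTS_PATTERN`, which is not shown in this diff):

    import re

    # Hypothetical stand-in for LIST_OF_DICTS_PATTERN: text whose head looks like a JSON array of objects.
    LOOKS_LIKE_LIST_OF_DICTS = r"\A\s*\[\s*\{"


    def looks_processable(text_head: str) -> bool:
        """True when the text begins like `[{...`, i.e. an array of objects."""
        return re.match(LOOKS_LIKE_LIST_OF_DICTS, text_head) is not None


    assert looks_processable('[{"type": "Title", "text": "hello"}]')
    assert not looks_processable('{"id": "Sample-1", "name": "Sample 1"}')
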
5 changes: 5 additions & 0 deletions example-docs/not-unstructured-payload.json
@@ -0,0 +1,5 @@
+{
+  "id": "Sample-1",
+  "name": "Sample 1",
+  "description": "This is sample data #1"
+}
36 changes: 36 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
@@ -23,6 +23,7 @@
    _TextFileDifferentiator,
    _ZipFileDifferentiator,
    detect_filetype,
+    is_json_processable,
)
from unstructured.file_utils.model import FileType

@@ -636,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
        assert detect_filetype(file=f) == FileType.CSV


+# ================================================================================================
+# Describe `is_json_processable()`
+# ================================================================================================
+
+
+def it_affirms_JSON_is_array_of_objects_from_a_file_path():
+    assert is_json_processable(example_doc_path("simple.json")) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
+    assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False
+
+
+def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
+    with open(example_doc_path("simple.json"), "rb") as f:
+        assert is_json_processable(file=f) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
+    with open(example_doc_path("not-unstructured-payload.json"), "rb") as f:
+        assert is_json_processable(file=f) is False
+
+
+def it_affirms_JSON_is_array_of_objects_from_text():
+    with open(example_doc_path("simple.json")) as f:
+        text = f.read()
+    assert is_json_processable(file_text=text) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
+    with open(example_doc_path("not-unstructured-payload.json")) as f:
+        text = f.read()
+    assert is_json_processable(file_text=text) is False
+
+
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
30 changes: 18 additions & 12 deletions test_unstructured/partition/test_auto.py
@@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti


@pytest.mark.parametrize(
-    "filetype",
+    "file_type",
    [
        t
        for t in FileType
-        if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
-        and t.partitioner_function_name != "partition_image"
+        if t
+        not in (
+            FileType.EMPTY,
+            FileType.JSON,
+            FileType.UNK,
+            FileType.WAV,
+            FileType.XLS,
+            FileType.ZIP,
+        )
+        and t.partitioner_shortname != "image"
    ],
)
-def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
-    extension = filetype.name.lower()
-    # -- except for two oddballs, the shortname is the extension --
-    partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
-    partition_fn_name = f"partition_{partitioner_shortname}"
-    module = import_module(f"unstructured.partition.{partitioner_shortname}")
+def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
+    partition_fn_name = file_type.partitioner_function_name
+    module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(module, partition_fn_name)

    # -- partition the first example-doc with the extension for this filetype --
    elements: list[Element] = []
-    doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
+    doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
+    extensions = file_type._extensions
    for file in pathlib.Path(doc_path).iterdir():
-        if file.is_file() and file.suffix == f".{extension}":
+        if file.is_file() and file.suffix in extensions:
            elements = partition_fn(str(file))
            break

    assert elements
    assert all(
-        e.metadata.filetype == filetype.mime_type
+        e.metadata.filetype == file_type.mime_type
        for e in elements
        if e.metadata.filetype is not None
    )
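
The rewritten test no longer maps extensions to partitioner names by hand; it resolves each partitioner from metadata carried by `FileType` itself (`partitioner_module_qname`, `partitioner_function_name`, `_extensions`, all used in the diff above). A condensed sketch of that resolution step, assuming those properties behave as the test implies:

    from importlib import import_module

    from unstructured.file_utils.model import FileType

    file_type = FileType.CSV  # any member the parametrization keeps
    module = import_module(file_type.partitioner_module_qname)  # e.g. "unstructured.partition.csv"
    partition_fn = getattr(module, file_type.partitioner_function_name)  # e.g. partition_csv
    print(file_type._extensions)  # suffixes a matching example doc may carry, e.g. (".csv",)
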
84 changes: 24 additions & 60 deletions unstructured/file_utils/filetype.py
@@ -112,12 +112,12 @@ def is_json_processable(
    file is JSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)
+
    if file_text is None:
-        file_text = _read_file_start_for_type_check(
-            file=file,
-            filename=filename,
-            encoding=encoding,
-        )
+        file_text = _FileTypeDetectionContext.new(
+            file_path=filename, file=file, encoding=encoding
+        ).text_head
+
    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None


@@ -252,7 +252,7 @@ def __init__(
        content_type: str | None = None,
        metadata_file_path: str | None = None,
    ):
-        self._file_path = file_path
+        self._file_path_arg = file_path
        self._file_arg = file
        self._encoding_arg = encoding
        self._content_type = content_type
@@ -265,9 +265,9 @@ def new(
        file_path: str | None,
        file: IO[bytes] | None,
        encoding: str | None,
-        content_type: str | None,
-        metadata_file_path: str | None,
-    ):
+        content_type: str | None = None,
+        metadata_file_path: str | None = None,
+    ) -> _FileTypeDetectionContext:
        self = cls(
            file_path=file_path,
            file=file,
@@ -324,7 +324,10 @@ def file_path(self) -> str | None:
        None when the caller specified the source as a file-like object instead. Useful for user
        feedback on an error, but users of context should have little use for it otherwise.
        """
-        return self._file_path
+        if (file_path := self._file_path_arg) is None:
+            return None
+
+        return os.path.realpath(file_path) if os.path.islink(file_path) else file_path

    @lazyproperty
    def is_zipfile(self) -> bool:
@@ -355,19 +358,19 @@ def mime_type(self) -> str | None:
        A `str` return value is always in lower-case.
        """
+        file_path = self.file_path
+
        if LIBMAGIC_AVAILABLE:
            import magic

            mime_type = (
-                magic.from_file(_resolve_symlink(self._file_path), mime=True)
-                if self._file_path
+                magic.from_file(file_path, mime=True)
+                if file_path
                else magic.from_buffer(self.file_head, mime=True)
            )
            return mime_type.lower() if mime_type else None

-        mime_type = (
-            ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head)
-        )
+        mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)

        if mime_type is None:
            logger.warning(
@@ -391,8 +394,8 @@ def open(self) -> Iterator[IO[bytes]]:
        File is guaranteed to be at read position 0 when called.
        """
-        if self._file_path:
-            with open(self._file_path, "rb") as f:
+        if self.file_path:
+            with open(self.file_path, "rb") as f:
                yield f
        else:
            file = self._file_arg
@@ -420,7 +423,7 @@ def text_head(self) -> str:
                else content.decode(encoding=self.encoding, errors="ignore")
            )

-        file_path = self._file_path
+        file_path = self.file_path
        assert file_path is not None  # -- guaranteed by `._validate` --

        try:
@@ -433,9 +436,9 @@

    def _validate(self) -> None:
        """Raise if the context is invalid."""
-        if self._file_path and not os.path.isfile(self._file_path):
-            raise FileNotFoundError(f"no such file {self._file_path}")
-        if not self._file_path and not self._file_arg:
+        if self.file_path and not os.path.isfile(self.file_path):
+            raise FileNotFoundError(f"no such file {self._file_path_arg}")
+        if not self.file_path and not self._file_arg:
            raise ValueError("either `file_path` or `file` argument must be provided")


@@ -650,45 +653,6 @@ def file_type(self) -> FileType | None:
        return FileType.ZIP


-def _read_file_start_for_type_check(
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    encoding: Optional[str] = "utf-8",
-) -> str:
-    """Reads the start of the file and returns the text content."""
-    exactly_one(filename=filename, file=file)
-
-    if file is not None:
-        file.seek(0)
-        file_content = file.read(4096)
-        if isinstance(file_content, str):
-            file_text = file_content
-        else:
-            file_text = file_content.decode(errors="ignore")
-        file.seek(0)
-        return file_text
-
-    # -- guaranteed by `exactly_one()` call --
-    assert filename is not None
-
-    try:
-        with open(filename, encoding=encoding) as f:
-            file_text = f.read(4096)
-    except UnicodeDecodeError:
-        formatted_encoding, _ = detect_file_encoding(filename=filename)
-        with open(filename, encoding=formatted_encoding) as f:
-            file_text = f.read(4096)
-
-    return file_text
-
-
-def _resolve_symlink(file_path: str) -> str:
-    """Resolve `file_path` containing symlink to the actual file path."""
-    if os.path.islink(file_path):
-        file_path = os.path.realpath(file_path)
-    return file_path
-
-
_P = ParamSpec("_P")


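Net effect in this module: the module-level `_read_file_start_for_type_check()` and `_resolve_symlink()` helpers disappear, with head-of-file reading absorbed into the context's `text_head` property and symlink resolution into its `file_path` property. A minimal standalone illustration of that symlink-resolving property pattern (class name hypothetical; plain `@property` standing in for `@lazyproperty`):

    import os


    class _Context:
        """Toy stand-in for _FileTypeDetectionContext, reduced to the file-path behavior."""

        def __init__(self, file_path: str | None):
            self._file_path_arg = file_path

        @property
        def file_path(self) -> str | None:
            if (file_path := self._file_path_arg) is None:
                return None
            # Resolve a symlink to its target so later checks (libmagic, open) see the real file.
            return os.path.realpath(file_path) if os.path.islink(file_path) else file_path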