spike: subsume _read_file_start_for_type_check()
Needs tests; consider reusing `_FileTypeDetectionContext.text_head`.
scanny committed Jul 25, 2024
1 parent e597c64 commit 3f87de3
Showing 4 changed files with 83 additions and 72 deletions.
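
In short, the commit folds the old `_read_file_start_for_type_check()` helper into `_FileTypeDetectionContext`, so `is_json_processable()` reads the head of the document through the context's `text_head` property (see the filetype.py diff below). The check itself decides whether that text looks like a JSON array of objects. A rough standalone sketch of that decision, using a hypothetical stand-in pattern (the real code matches `LIST_OF_DICTS_PATTERN`, which is not shown in this diff):

    import re

    # Hypothetical stand-in for LIST_OF_DICTS_PATTERN: text whose head looks like a JSON array of objects.
    LOOKS_LIKE_LIST_OF_DICTS = r"\A\s*\[\s*\{"


    def looks_processable(text_head: str) -> bool:
        """True when the text begins like `[{...`, i.e. an array of objects."""
        return re.match(LOOKS_LIKE_LIST_OF_DICTS, text_head) is not None


    assert looks_processable('[{"type": "Title", "text": "hello"}]')
    assert not looks_processable('{"id": "Sample-1", "name": "Sample 1"}')
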
5 changes: 5 additions & 0 deletions example-docs/not-unstructured-payload.json
@@ -0,0 +1,5 @@
+{
+  "id": "Sample-1",
+  "name": "Sample 1",
+  "description": "This is sample data #1"
+}
36 changes: 36 additions & 0 deletions test_unstructured/file_utils/test_filetype.py
@@ -23,6 +23,7 @@
    _TextFileDifferentiator,
    _ZipFileDifferentiator,
    detect_filetype,
+    is_json_processable,
)
from unstructured.file_utils.model import FileType

@@ -636,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
        assert detect_filetype(file=f) == FileType.CSV


+# ================================================================================================
+# Describe `is_json_processable()`
+# ================================================================================================
+
+
+def it_affirms_JSON_is_array_of_objects_from_a_file_path():
+    assert is_json_processable(example_doc_path("simple.json")) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
+    assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False
+
+
+def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
+    with open(example_doc_path("simple.json"), "rb") as f:
+        assert is_json_processable(file=f) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
+    with open(example_doc_path("not-unstructured-payload.json"), "rb") as f:
+        assert is_json_processable(file=f) is False
+
+
+def it_affirms_JSON_is_array_of_objects_from_text():
+    with open(example_doc_path("simple.json")) as f:
+        text = f.read()
+    assert is_json_processable(file_text=text) is True
+
+
+def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
+    with open(example_doc_path("not-unstructured-payload.json")) as f:
+        text = f.read()
+    assert is_json_processable(file_text=text) is False
+
+
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
30 changes: 18 additions & 12 deletions test_unstructured/partition/test_auto.py
@@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti


@pytest.mark.parametrize(
-    "filetype",
+    "file_type",
    [
        t
        for t in FileType
-        if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
-        and t.partitioner_function_name != "partition_image"
+        if t
+        not in (
+            FileType.EMPTY,
+            FileType.JSON,
+            FileType.UNK,
+            FileType.WAV,
+            FileType.XLS,
+            FileType.ZIP,
+        )
+        and t.partitioner_shortname != "image"
    ],
)
-def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
-    extension = filetype.name.lower()
-    # -- except for two oddballs, the shortname is the extension --
-    partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
-    partition_fn_name = f"partition_{partitioner_shortname}"
-    module = import_module(f"unstructured.partition.{partitioner_shortname}")
+def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
+    partition_fn_name = file_type.partitioner_function_name
+    module = import_module(file_type.partitioner_module_qname)
    partition_fn = getattr(module, partition_fn_name)

    # -- partition the first example-doc with the extension for this filetype --
    elements: list[Element] = []
-    doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
+    doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
+    extensions = file_type._extensions
    for file in pathlib.Path(doc_path).iterdir():
-        if file.is_file() and file.suffix == f".{extension}":
+        if file.is_file() and file.suffix in extensions:
            elements = partition_fn(str(file))
            break

    assert elements
    assert all(
-        e.metadata.filetype == filetype.mime_type
+        e.metadata.filetype == file_type.mime_type
        for e in elements
        if e.metadata.filetype is not None
    )
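
The rewritten test no longer maps extensions to partitioner names by hand; it resolves each partitioner from metadata carried by `FileType` itself (`partitioner_module_qname`, `partitioner_function_name`, `_extensions`, all used in the diff above). A condensed sketch of that resolution step, assuming those properties behave as the test implies:

    from importlib import import_module

    from unstructured.file_utils.model import FileType

    file_type = FileType.CSV  # any member the parametrization keeps
    module = import_module(file_type.partitioner_module_qname)  # e.g. "unstructured.partition.csv"
    partition_fn = getattr(module, file_type.partitioner_function_name)  # e.g. partition_csv
    print(file_type._extensions)  # suffixes a matching example doc may carry, e.g. (".csv",)
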
84 changes: 24 additions & 60 deletions unstructured/file_utils/filetype.py
@@ -112,12 +112,12 @@ def is_json_processable(
    file is JSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)
+
    if file_text is None:
-        file_text = _read_file_start_for_type_check(
-            file=file,
-            filename=filename,
-            encoding=encoding,
-        )
+        file_text = _FileTypeDetectionContext.new(
+            file_path=filename, file=file, encoding=encoding
+        ).text_head
+
    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None


@@ -252,7 +252,7 @@ def __init__(
        content_type: str | None = None,
        metadata_file_path: str | None = None,
    ):
-        self._file_path = file_path
+        self._file_path_arg = file_path
        self._file_arg = file
        self._encoding_arg = encoding
        self._content_type = content_type
@@ -265,9 +265,9 @@ def new(
        file_path: str | None,
        file: IO[bytes] | None,
        encoding: str | None,
-        content_type: str | None,
-        metadata_file_path: str | None,
-    ):
+        content_type: str | None = None,
+        metadata_file_path: str | None = None,
+    ) -> _FileTypeDetectionContext:
        self = cls(
            file_path=file_path,
            file=file,
@@ -324,7 +324,10 @@ def file_path(self) -> str | None:
        None when the caller specified the source as a file-like object instead. Useful for user
        feedback on an error, but users of context should have little use for it otherwise.
        """
-        return self._file_path
+        if (file_path := self._file_path_arg) is None:
+            return None
+
+        return os.path.realpath(file_path) if os.path.islink(file_path) else file_path

    @lazyproperty
    def is_zipfile(self) -> bool:
@@ -355,19 +358,19 @@ def mime_type(self) -> str | None:
        A `str` return value is always in lower-case.
        """
+        file_path = self.file_path
+
        if LIBMAGIC_AVAILABLE:
            import magic

            mime_type = (
-                magic.from_file(_resolve_symlink(self._file_path), mime=True)
-                if self._file_path
+                magic.from_file(file_path, mime=True)
+                if file_path
                else magic.from_buffer(self.file_head, mime=True)
            )
            return mime_type.lower() if mime_type else None

-        mime_type = (
-            ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head)
-        )
+        mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)

        if mime_type is None:
            logger.warning(
@@ -391,8 +394,8 @@ def open(self) -> Iterator[IO[bytes]]:
        File is guaranteed to be at read position 0 when called.
        """
-        if self._file_path:
-            with open(self._file_path, "rb") as f:
+        if self.file_path:
+            with open(self.file_path, "rb") as f:
                yield f
        else:
            file = self._file_arg
@@ -420,7 +423,7 @@ def text_head(self) -> str:
                else content.decode(encoding=self.encoding, errors="ignore")
            )

-        file_path = self._file_path
+        file_path = self.file_path
        assert file_path is not None  # -- guaranteed by `._validate` --

        try:
@@ -433,9 +436,9 @@

    def _validate(self) -> None:
        """Raise if the context is invalid."""
-        if self._file_path and not os.path.isfile(self._file_path):
-            raise FileNotFoundError(f"no such file {self._file_path}")
-        if not self._file_path and not self._file_arg:
+        if self.file_path and not os.path.isfile(self.file_path):
+            raise FileNotFoundError(f"no such file {self._file_path_arg}")
+        if not self.file_path and not self._file_arg:
            raise ValueError("either `file_path` or `file` argument must be provided")


@@ -650,45 +653,6 @@ def file_type(self) -> FileType | None:
        return FileType.ZIP


-def _read_file_start_for_type_check(
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    encoding: Optional[str] = "utf-8",
-) -> str:
-    """Reads the start of the file and returns the text content."""
-    exactly_one(filename=filename, file=file)
-
-    if file is not None:
-        file.seek(0)
-        file_content = file.read(4096)
-        if isinstance(file_content, str):
-            file_text = file_content
-        else:
-            file_text = file_content.decode(errors="ignore")
-        file.seek(0)
-        return file_text
-
-    # -- guaranteed by `exactly_one()` call --
-    assert filename is not None
-
-    try:
-        with open(filename, encoding=encoding) as f:
-            file_text = f.read(4096)
-    except UnicodeDecodeError:
-        formatted_encoding, _ = detect_file_encoding(filename=filename)
-        with open(filename, encoding=formatted_encoding) as f:
-            file_text = f.read(4096)
-
-    return file_text
-
-
-def _resolve_symlink(file_path: str) -> str:
-    """Resolve `file_path` containing symlink to the actual file path."""
-    if os.path.islink(file_path):
-        file_path = os.path.realpath(file_path)
-    return file_path
-
-
_P = ParamSpec("_P")


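Net effect in this module: the module-level `_read_file_start_for_type_check()` and `_resolve_symlink()` helpers disappear, with head-of-file reading absorbed into the context's `text_head` property and symlink resolution into its `file_path` property. A minimal standalone illustration of that symlink-resolving property pattern (class name hypothetical; plain `@property` standing in for `@lazyproperty`):

    import os


    class _Context:
        """Toy stand-in for _FileTypeDetectionContext, reduced to the file-path behavior."""

        def __init__(self, file_path: str | None):
            self._file_path_arg = file_path

        @property
        def file_path(self) -> str | None:
            if (file_path := self._file_path_arg) is None:
                return None
            # Resolve a symlink to its target so later checks (libmagic, open) see the real file.
            return os.path.realpath(file_path) if os.path.islink(file_path) else file_path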