bug: fix file_conversion disk leak (#3562)

Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFile have been replaced with TemporaryFileDirectory to avoid a known issue: - https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile - #3390 The first 7 commits each address an individual occurrence of the issue if reviewers want to review commit-by-commit.
Unstructured-IO · Aug 27, 2024 · f21c853 · f21c853
1 parent 4194a07
commit f21c853
Show file tree

Hide file tree

Showing 24 changed files with 162 additions and 97 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.15.9-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
+
 ## 0.15.8
 
 ### Enhancements

diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -53,7 +53,7 @@ google-auth==2.34.0
     #   google-cloud-vision
 google-cloud-vision==3.7.4
     # via -r ./extra-pdf-image.in
-googleapis-common-protos==1.64.0
+googleapis-common-protos==1.65.0
     # via
     #   google-api-core
     #   grpcio-status

diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt
@@ -60,7 +60,7 @@ fsspec==2024.6.1
     # via huggingface-hub
 google-auth==2.34.0
     # via kubernetes
-googleapis-common-protos==1.64.0
+googleapis-common-protos==1.65.0
     # via opentelemetry-exporter-otlp-proto-grpc
 grpcio==1.66.0
     # via

diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt
@@ -19,7 +19,7 @@ clarifai-grpc==10.7.1
     # via clarifai
 contextlib2==21.6.0
     # via schema
-googleapis-common-protos==1.64.0
+googleapis-common-protos==1.65.0
     # via clarifai-grpc
 grpcio==1.66.0
     # via

diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
@@ -94,7 +94,7 @@ langchain-core==0.2.35
     #   langchain-text-splitters
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.104
+langsmith==0.1.105
     # via
     #   langchain
     #   langchain-community

diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
@@ -72,7 +72,7 @@ langchain-core==0.2.35
     # via langchain-huggingface
 langchain-huggingface==0.0.3
     # via -r ./ingest/embed-huggingface.in
-langsmith==0.1.104
+langsmith==0.1.105
     # via langchain-core
 markupsafe==2.1.5
     # via jinja2

diff --git a/requirements/ingest/embed-mixedbreadai.txt b/requirements/ingest/embed-mixedbreadai.txt
@@ -28,7 +28,7 @@ httpcore==1.0.5
     # via
     #   -c ./ingest/../base.txt
     #   httpx
-httpx==0.27.0
+httpx==0.27.2
     # via
     #   -c ./ingest/../base.txt
     #   mixedbread-ai

diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
@@ -57,7 +57,7 @@ langchain-core==0.2.35
     # via langchain-openai
 langchain-openai==0.1.22
     # via -r ./ingest/embed-openai.in
-langsmith==0.1.104
+langsmith==0.1.105
     # via langchain-core
 openai==1.42.0
     # via langchain-openai

diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt
@@ -88,7 +88,7 @@ google-resumable-media==2.7.2
     # via
     #   google-cloud-bigquery
     #   google-cloud-storage
-googleapis-common-protos[grpc]==1.64.0
+googleapis-common-protos[grpc]==1.65.0
     # via
     #   google-api-core
     #   grpc-google-iam-v1
@@ -148,7 +148,7 @@ langchain-google-vertexai==1.0.10
     # via -r ./ingest/embed-vertexai.in
 langchain-text-splitters==0.2.2
     # via langchain
-langsmith==0.1.104
+langsmith==0.1.105
     # via
     #   langchain
     #   langchain-community

diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt
@@ -79,7 +79,7 @@ langchain-text-splitters==0.2.2
     # via langchain
 langchain-voyageai==0.1.1
     # via -r ./ingest/embed-voyageai.in
-langsmith==0.1.104
+langsmith==0.1.105
     # via
     #   langchain
     #   langchain-core

diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt
@@ -66,7 +66,7 @@ google-crc32c==1.5.0
     #   google-resumable-media
 google-resumable-media==2.7.2
     # via google-cloud-storage
-googleapis-common-protos==1.64.0
+googleapis-common-protos==1.65.0
     # via google-api-core
 idna==3.8
     # via

diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt
@@ -26,7 +26,7 @@ google-auth==2.34.0
     #   google-auth-httplib2
 google-auth-httplib2==0.2.0
     # via google-api-python-client
-googleapis-common-protos==1.64.0
+googleapis-common-protos==1.65.0
     # via google-api-core
 httplib2==0.22.0
     # via

diff --git a/test_unstructured/file_utils/test_file_conversion.py b/test_unstructured/file_utils/test_file_conversion.py
@@ -1,11 +1,16 @@
 import os
 import pathlib
+import tempfile
 from unittest.mock import patch
 
 import pypandoc
 import pytest
 
-from unstructured.file_utils.file_conversion import convert_file_to_text
+from test_unstructured.unit_utils import FixtureRequest, example_doc_path, stdlib_fn_mock
+from unstructured.file_utils.file_conversion import (
+    convert_file_to_html_text_using_pandoc,
+    convert_file_to_text,
+)
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 
@@ -21,3 +26,29 @@ def test_convert_to_file_raises_if_pandoc_not_available():
     with patch.object(pypandoc, "convert_file", side_effect=FileNotFoundError):
         with pytest.raises(FileNotFoundError):
             convert_file_to_text(filename, source_format="epub", target_format="html")
+
+
+@pytest.mark.parametrize(
+    ("source_format", "filename"),
+    [
+        ("epub", "winter-sports.epub"),
+        ("org", "README.org"),
+        ("rst", "README.rst"),
+        ("rtf", "fake-doc.rtf"),
+    ],
+)
+def test_convert_file_to_html_text_using_pandoc(
+    request: FixtureRequest, tmp_path: pathlib.Path, source_format: str, filename: str
+):
+    # -- Get a real tempdir: `tmp_path`
+    # -- Mock tempfile.TemporaryDirectory() using `stdlib_fn_mock`
+    # -- Set the return value of mock.__enter__ to the real tempdir
+    tempdir_ = stdlib_fn_mock(request, tempfile, "TemporaryDirectory")
+    tempdir_.return_value.__enter__.return_value = tmp_path
+
+    with open(example_doc_path(filename), "rb") as f:
+        html_text = convert_file_to_html_text_using_pandoc(file=f, source_format=source_format)
+
+    assert isinstance(html_text, str)
+    assert len(list(tmp_path.iterdir())) == 1
+    tempdir_.return_value.__exit__.assert_called_once()
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -605,6 +605,7 @@ def test_partition_email_can_process_attachments(tmp_path: pathlib.Path):
     expected_metadata.parent_id = None
     elements[-1].metadata.parent_id = None
 
+    assert [a.name for a in os.scandir(output_dir) if a.is_file()] == ["fake-attachment.txt"]
     assert elements[0].text.startswith("Hello!")
     for element in elements[:-1]:
         assert element.metadata.filename == "fake-email-attachment.eml"

diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py
@@ -3,7 +3,7 @@
 import os
 import pathlib
 import platform
-from tempfile import NamedTemporaryFile
+import tempfile
 
 import pandas as pd
 import pytest
@@ -249,9 +249,12 @@ def test_serialized_deserialize_elements_to_json(tmpdir: str):
 
 def test_read_and_write_json_with_encoding():
     elements = partition_text("example-docs/fake-text-utf-16-be.txt")
-    with NamedTemporaryFile() as tempfile:
-        base.elements_to_json(elements, filename=tempfile.name, encoding="utf-16")
-        new_elements_filename = base.elements_from_json(filename=tempfile.name, encoding="utf-16")
+
+    with tempfile.TemporaryDirectory() as tmp_dir_path:
+        tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
+        base.elements_to_json(elements, filename=tmp_file_path, encoding="utf-16")
+        new_elements_filename = base.elements_from_json(filename=tmp_file_path, encoding="utf-16")
+
     assert elements == new_elements_filename
 
 

diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py
@@ -5,6 +5,7 @@
 import datetime as dt
 import difflib
 import pathlib
+import types
 from typing import Any, List, Optional
 from unittest.mock import (
     ANY,
@@ -207,6 +208,22 @@ def method_mock(
     return _patch.start()
 
 
+def stdlib_fn_mock(
+    request: FixtureRequest,
+    std_mod: types.ModuleType,
+    fn_name: str,
+    autospec: bool = True,
+    **kwargs: Any,
+) -> Mock:
+    """Return mock for function `fn_name` on `std_mod`.
+
+    The patch is reversed after pytest uses it.
+    """
+    _patch = patch.object(std_mod, fn_name, autospec=autospec, **kwargs)
+    request.addfinalizer(_patch.stop)
+    return _patch.start()
+
+
 def open_mock(request: FixtureRequest, module_name: str, **kwargs: Any) -> Mock:
     """Return a mock for the builtin `open()` method in `module_name`."""
     target = "%s.open" % module_name

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.15.8"  # pragma: no cover
+__version__ = "0.15.9-dev0"  # pragma: no cover
diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import tempfile
 from typing import IO
 
@@ -54,11 +55,12 @@ def convert_file_to_html_text_using_pandoc(
     exactly_one(filename=filename, file=file)
 
     if file is not None:
-        with tempfile.NamedTemporaryFile() as tmp:
-            tmp.write(file.read())
-            tmp.flush()
+        with tempfile.TemporaryDirectory() as temp_dir_path:
+            tmp_file_path = os.path.join(temp_dir_path, f"tmp_file.{source_format}")
+            with open(tmp_file_path, "wb") as tmp_file:
+                tmp_file.write(file.read())
             return convert_file_to_text(
-                filename=tmp.name, source_format=source_format, target_format="html"
+                filename=tmp_file_path, source_format=source_format, target_format="html"
             )
 
     assert filename is not None

diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -384,7 +384,7 @@ def convert_office_doc(
     target_filter: Optional[str] = None,
     wait_for_soffice_ready_time_out: int = 10,
 ):
-    """Converts a .doc file to a .docx file using the libreoffice CLI.
+    """Converts a .doc/.ppt file to a .docx/.pptx file using the libreoffice CLI.
 
     Parameters
     ----------

diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -9,7 +9,7 @@
 from email.headerregistry import AddressHeader
 from email.message import EmailMessage
 from functools import partial
-from tempfile import NamedTemporaryFile, TemporaryDirectory
+from tempfile import TemporaryDirectory
 from typing import IO, Any, Callable, Final, Optional, Type, cast
 
 from unstructured.chunking import add_chunking_strategy
@@ -222,21 +222,18 @@ def extract_attachment_info(
             attachment_info["payload"] = part.get_payload(decode=True)
             list_attachments.append(attachment_info)
 
-            for idx, attachment in enumerate(list_attachments):
-                if output_dir:
+            if output_dir:
+                for idx, attachment in enumerate(list_attachments):
                     if "filename" in attachment:
                         filename = output_dir + "/" + attachment["filename"]
                         with open(filename, "wb") as f:
                             # Note(harrell) mypy wants to just us `w` when opening the file but this
                             # causes an error since the payloads are bytes not str
                             f.write(attachment["payload"])
                     else:
-                        with NamedTemporaryFile(
-                            mode="wb",
-                            dir=output_dir,
-                            delete=False,
-                        ) as f:
-                            list_attachments[idx]["filename"] = os.path.basename(f.name)
+                        filename = os.path.join(output_dir, f"attachment_{idx}")
+                        with open(filename, "wb") as f:
+                            list_attachments[idx]["filename"] = os.path.basename(filename)
                             f.write(attachment["payload"])
 
     return list_attachments

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -65,16 +65,15 @@ def process_data_with_ocr(
     Returns:
         DocumentLayout: The merged layout information obtained after OCR processing.
     """
-    file_name = ""
-    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
-        data_bytes = data if isinstance(data, bytes) else data.read()
-        tmp_file.write(data_bytes)
-        tmp_file.flush()
-        file_name = tmp_file.name
+    data_bytes = data if isinstance(data, bytes) else data.read()
+
+    with tempfile.TemporaryDirectory() as tmp_dir_path:
+        tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
+        with open(tmp_file_path, "wb") as tmp_file:
+            tmp_file.write(data_bytes)
 
-    try:
         merged_layouts = process_file_with_ocr(
-            filename=file_name,
+            filename=tmp_file_path,
             out_layout=out_layout,
             extracted_layout=extracted_layout,
             is_image=is_image,
@@ -84,9 +83,6 @@ def process_data_with_ocr(
             pdf_image_dpi=pdf_image_dpi,
             ocr_drawer=ocr_drawer,
         )
-    finally:
-        if os.path.isfile(file_name):
-            os.remove(file_name)
 
     return merged_layouts
 

diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -125,10 +125,10 @@ def save_elements(
     element_category_to_save: str,
     pdf_image_dpi: int,
     filename: str = "",
-    file: Optional[Union[bytes, BinaryIO]] = None,
+    file: bytes | IO[bytes] | None = None,
     is_image: bool = False,
     extract_image_block_to_payload: bool = False,
-    output_dir_path: Optional[str] = None,
+    output_dir_path: str | None = None,
 ):
     """
     Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -154,12 +154,16 @@ def save_elements(
             if file is None:
                 image_paths = [filename]
             else:
-                if hasattr(file, "seek"):
+                if isinstance(file, bytes):
+                    file_data = file
+                else:
                     file.seek(0)
-                temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
-                temp_file.write(file.read() if hasattr(file, "read") else file)
-                temp_file.flush()
-                image_paths = [temp_file.name]
+                    file_data = file.read()
+
+                tmp_file_path = os.path.join(temp_dir, "tmp_file")
+                with open(tmp_file_path, "wb") as tmp_file:
+                    tmp_file.write(file_data)
+                image_paths = [tmp_file_path]
         else:
             _image_paths = convert_pdf_to_image(
                 filename,
@@ -191,6 +195,7 @@ def save_elements(
             # The page number in the metadata may have been offset
             # by starting_page_number. Make sure we use the right
             # value for indexing!
+            assert el.metadata.page_number
             metadata_page_number = el.metadata.page_number
             page_index = metadata_page_number - starting_page_number
 
@@ -208,6 +213,7 @@ def save_elements(
                     el.metadata.image_mime_type = "image/jpeg"
                 else:
                     basename = "table" if el.category == ElementType.TABLE else "figure"
+                    assert output_dir_path
                     output_f_path = os.path.join(
                         output_dir_path,
                         f"{basename}-{metadata_page_number}-{figure_number}.jpg",