Skip to content

Commit

Permalink
bug: fix file_conversion disk leak (#3562)
Browse files Browse the repository at this point in the history
Fix disk space leaks and Windows errors when accessing file.name on a
NamedTemporaryFile

Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of
`file.name` of NamedTemporaryFile have been replaced with
TemporaryFileDirectory to avoid a known issue:
-
https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile
- #3390

The first 7 commits each address an individual occurrence of the issue
if reviewers want to review commit-by-commit.
  • Loading branch information
Coniferish authored Aug 27, 2024
1 parent 4194a07 commit f21c853
Show file tree
Hide file tree
Showing 24 changed files with 162 additions and 97 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## 0.15.9-dev0

### Enhancements

### Features

### Fixes

* **Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile** Uses of `NamedTemporaryFile(..., delete=False)` and/or uses of `file.name` of NamedTemporaryFiles have been replaced with TemporaryFileDirectory to avoid a known issue: https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile

## 0.15.8

### Enhancements
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ google-auth==2.34.0
# google-cloud-vision
google-cloud-vision==3.7.4
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.64.0
googleapis-common-protos==1.65.0
# via
# google-api-core
# grpcio-status
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/chroma.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fsspec==2024.6.1
# via huggingface-hub
google-auth==2.34.0
# via kubernetes
googleapis-common-protos==1.64.0
googleapis-common-protos==1.65.0
# via opentelemetry-exporter-otlp-proto-grpc
grpcio==1.66.0
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/clarifai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ clarifai-grpc==10.7.1
# via clarifai
contextlib2==21.6.0
# via schema
googleapis-common-protos==1.64.0
googleapis-common-protos==1.65.0
# via clarifai-grpc
grpcio==1.66.0
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-aws-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ langchain-core==0.2.35
# langchain-text-splitters
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.104
langsmith==0.1.105
# via
# langchain
# langchain-community
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ langchain-core==0.2.35
# via langchain-huggingface
langchain-huggingface==0.0.3
# via -r ./ingest/embed-huggingface.in
langsmith==0.1.104
langsmith==0.1.105
# via langchain-core
markupsafe==2.1.5
# via jinja2
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-mixedbreadai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ httpcore==1.0.5
# via
# -c ./ingest/../base.txt
# httpx
httpx==0.27.0
httpx==0.27.2
# via
# -c ./ingest/../base.txt
# mixedbread-ai
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ langchain-core==0.2.35
# via langchain-openai
langchain-openai==0.1.22
# via -r ./ingest/embed-openai.in
langsmith==0.1.104
langsmith==0.1.105
# via langchain-core
openai==1.42.0
# via langchain-openai
Expand Down
4 changes: 2 additions & 2 deletions requirements/ingest/embed-vertexai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ google-resumable-media==2.7.2
# via
# google-cloud-bigquery
# google-cloud-storage
googleapis-common-protos[grpc]==1.64.0
googleapis-common-protos[grpc]==1.65.0
# via
# google-api-core
# grpc-google-iam-v1
Expand Down Expand Up @@ -148,7 +148,7 @@ langchain-google-vertexai==1.0.10
# via -r ./ingest/embed-vertexai.in
langchain-text-splitters==0.2.2
# via langchain
langsmith==0.1.104
langsmith==0.1.105
# via
# langchain
# langchain-community
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/embed-voyageai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ langchain-text-splitters==0.2.2
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
langsmith==0.1.104
langsmith==0.1.105
# via
# langchain
# langchain-core
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/gcs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ google-crc32c==1.5.0
# google-resumable-media
google-resumable-media==2.7.2
# via google-cloud-storage
googleapis-common-protos==1.64.0
googleapis-common-protos==1.65.0
# via google-api-core
idna==3.8
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/google-drive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ google-auth==2.34.0
# google-auth-httplib2
google-auth-httplib2==0.2.0
# via google-api-python-client
googleapis-common-protos==1.64.0
googleapis-common-protos==1.65.0
# via google-api-core
httplib2==0.22.0
# via
Expand Down
33 changes: 32 additions & 1 deletion test_unstructured/file_utils/test_file_conversion.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import os
import pathlib
import tempfile
from unittest.mock import patch

import pypandoc
import pytest

from unstructured.file_utils.file_conversion import convert_file_to_text
from test_unstructured.unit_utils import FixtureRequest, example_doc_path, stdlib_fn_mock
from unstructured.file_utils.file_conversion import (
convert_file_to_html_text_using_pandoc,
convert_file_to_text,
)

DIRECTORY = pathlib.Path(__file__).parent.resolve()

Expand All @@ -21,3 +26,29 @@ def test_convert_to_file_raises_if_pandoc_not_available():
with patch.object(pypandoc, "convert_file", side_effect=FileNotFoundError):
with pytest.raises(FileNotFoundError):
convert_file_to_text(filename, source_format="epub", target_format="html")


@pytest.mark.parametrize(
("source_format", "filename"),
[
("epub", "winter-sports.epub"),
("org", "README.org"),
("rst", "README.rst"),
("rtf", "fake-doc.rtf"),
],
)
def test_convert_file_to_html_text_using_pandoc(
request: FixtureRequest, tmp_path: pathlib.Path, source_format: str, filename: str
):
# -- Get a real tempdir: `tmp_path`
# -- Mock tempfile.TemporaryDirectory() using `stdlib_fn_mock`
# -- Set the return value of mock.__enter__ to the real tempdir
tempdir_ = stdlib_fn_mock(request, tempfile, "TemporaryDirectory")
tempdir_.return_value.__enter__.return_value = tmp_path

with open(example_doc_path(filename), "rb") as f:
html_text = convert_file_to_html_text_using_pandoc(file=f, source_format=source_format)

assert isinstance(html_text, str)
assert len(list(tmp_path.iterdir())) == 1
tempdir_.return_value.__exit__.assert_called_once()
1 change: 1 addition & 0 deletions test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,7 @@ def test_partition_email_can_process_attachments(tmp_path: pathlib.Path):
expected_metadata.parent_id = None
elements[-1].metadata.parent_id = None

assert [a.name for a in os.scandir(output_dir) if a.is_file()] == ["fake-attachment.txt"]
assert elements[0].text.startswith("Hello!")
for element in elements[:-1]:
assert element.metadata.filename == "fake-email-attachment.eml"
Expand Down
11 changes: 7 additions & 4 deletions test_unstructured/staging/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import pathlib
import platform
from tempfile import NamedTemporaryFile
import tempfile

import pandas as pd
import pytest
Expand Down Expand Up @@ -249,9 +249,12 @@ def test_serialized_deserialize_elements_to_json(tmpdir: str):

def test_read_and_write_json_with_encoding():
elements = partition_text("example-docs/fake-text-utf-16-be.txt")
with NamedTemporaryFile() as tempfile:
base.elements_to_json(elements, filename=tempfile.name, encoding="utf-16")
new_elements_filename = base.elements_from_json(filename=tempfile.name, encoding="utf-16")

with tempfile.TemporaryDirectory() as tmp_dir_path:
tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
base.elements_to_json(elements, filename=tmp_file_path, encoding="utf-16")
new_elements_filename = base.elements_from_json(filename=tmp_file_path, encoding="utf-16")

assert elements == new_elements_filename


Expand Down
17 changes: 17 additions & 0 deletions test_unstructured/unit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import datetime as dt
import difflib
import pathlib
import types
from typing import Any, List, Optional
from unittest.mock import (
ANY,
Expand Down Expand Up @@ -207,6 +208,22 @@ def method_mock(
return _patch.start()


def stdlib_fn_mock(
request: FixtureRequest,
std_mod: types.ModuleType,
fn_name: str,
autospec: bool = True,
**kwargs: Any,
) -> Mock:
"""Return mock for function `fn_name` on `std_mod`.
The patch is reversed after pytest uses it.
"""
_patch = patch.object(std_mod, fn_name, autospec=autospec, **kwargs)
request.addfinalizer(_patch.stop)
return _patch.start()


def open_mock(request: FixtureRequest, module_name: str, **kwargs: Any) -> Mock:
"""Return a mock for the builtin `open()` method in `module_name`."""
target = "%s.open" % module_name
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.15.8" # pragma: no cover
__version__ = "0.15.9-dev0" # pragma: no cover
10 changes: 6 additions & 4 deletions unstructured/file_utils/file_conversion.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import os
import tempfile
from typing import IO

Expand Down Expand Up @@ -54,11 +55,12 @@ def convert_file_to_html_text_using_pandoc(
exactly_one(filename=filename, file=file)

if file is not None:
with tempfile.NamedTemporaryFile() as tmp:
tmp.write(file.read())
tmp.flush()
with tempfile.TemporaryDirectory() as temp_dir_path:
tmp_file_path = os.path.join(temp_dir_path, f"tmp_file.{source_format}")
with open(tmp_file_path, "wb") as tmp_file:
tmp_file.write(file.read())
return convert_file_to_text(
filename=tmp.name, source_format=source_format, target_format="html"
filename=tmp_file_path, source_format=source_format, target_format="html"
)

assert filename is not None
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def convert_office_doc(
target_filter: Optional[str] = None,
wait_for_soffice_ready_time_out: int = 10,
):
"""Converts a .doc file to a .docx file using the libreoffice CLI.
"""Converts a .doc/.ppt file to a .docx/.pptx file using the libreoffice CLI.
Parameters
----------
Expand Down
15 changes: 6 additions & 9 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from email.headerregistry import AddressHeader
from email.message import EmailMessage
from functools import partial
from tempfile import NamedTemporaryFile, TemporaryDirectory
from tempfile import TemporaryDirectory
from typing import IO, Any, Callable, Final, Optional, Type, cast

from unstructured.chunking import add_chunking_strategy
Expand Down Expand Up @@ -222,21 +222,18 @@ def extract_attachment_info(
attachment_info["payload"] = part.get_payload(decode=True)
list_attachments.append(attachment_info)

for idx, attachment in enumerate(list_attachments):
if output_dir:
if output_dir:
for idx, attachment in enumerate(list_attachments):
if "filename" in attachment:
filename = output_dir + "/" + attachment["filename"]
with open(filename, "wb") as f:
# Note(harrell) mypy wants to just us `w` when opening the file but this
# causes an error since the payloads are bytes not str
f.write(attachment["payload"])
else:
with NamedTemporaryFile(
mode="wb",
dir=output_dir,
delete=False,
) as f:
list_attachments[idx]["filename"] = os.path.basename(f.name)
filename = os.path.join(output_dir, f"attachment_{idx}")
with open(filename, "wb") as f:
list_attachments[idx]["filename"] = os.path.basename(filename)
f.write(attachment["payload"])

return list_attachments
Expand Down
18 changes: 7 additions & 11 deletions unstructured/partition/pdf_image/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,15 @@ def process_data_with_ocr(
Returns:
DocumentLayout: The merged layout information obtained after OCR processing.
"""
file_name = ""
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
data_bytes = data if isinstance(data, bytes) else data.read()
tmp_file.write(data_bytes)
tmp_file.flush()
file_name = tmp_file.name
data_bytes = data if isinstance(data, bytes) else data.read()

with tempfile.TemporaryDirectory() as tmp_dir_path:
tmp_file_path = os.path.join(tmp_dir_path, "tmp_file")
with open(tmp_file_path, "wb") as tmp_file:
tmp_file.write(data_bytes)

try:
merged_layouts = process_file_with_ocr(
filename=file_name,
filename=tmp_file_path,
out_layout=out_layout,
extracted_layout=extracted_layout,
is_image=is_image,
Expand All @@ -84,9 +83,6 @@ def process_data_with_ocr(
pdf_image_dpi=pdf_image_dpi,
ocr_drawer=ocr_drawer,
)
finally:
if os.path.isfile(file_name):
os.remove(file_name)

return merged_layouts

Expand Down
20 changes: 13 additions & 7 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,10 @@ def save_elements(
element_category_to_save: str,
pdf_image_dpi: int,
filename: str = "",
file: Optional[Union[bytes, BinaryIO]] = None,
file: bytes | IO[bytes] | None = None,
is_image: bool = False,
extract_image_block_to_payload: bool = False,
output_dir_path: Optional[str] = None,
output_dir_path: str | None = None,
):
"""
Saves specific elements from a PDF as images either to a directory or embeds them in the
Expand All @@ -154,12 +154,16 @@ def save_elements(
if file is None:
image_paths = [filename]
else:
if hasattr(file, "seek"):
if isinstance(file, bytes):
file_data = file
else:
file.seek(0)
temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
temp_file.write(file.read() if hasattr(file, "read") else file)
temp_file.flush()
image_paths = [temp_file.name]
file_data = file.read()

tmp_file_path = os.path.join(temp_dir, "tmp_file")
with open(tmp_file_path, "wb") as tmp_file:
tmp_file.write(file_data)
image_paths = [tmp_file_path]
else:
_image_paths = convert_pdf_to_image(
filename,
Expand Down Expand Up @@ -191,6 +195,7 @@ def save_elements(
# The page number in the metadata may have been offset
# by starting_page_number. Make sure we use the right
# value for indexing!
assert el.metadata.page_number
metadata_page_number = el.metadata.page_number
page_index = metadata_page_number - starting_page_number

Expand All @@ -208,6 +213,7 @@ def save_elements(
el.metadata.image_mime_type = "image/jpeg"
else:
basename = "table" if el.category == ElementType.TABLE else "figure"
assert output_dir_path
output_f_path = os.path.join(
output_dir_path,
f"{basename}-{metadata_page_number}-{figure_number}.jpg",
Expand Down
Loading

0 comments on commit f21c853

Please sign in to comment.