diff --git a/CHANGELOG.md b/CHANGELOG.md index 1343327697..35b147add4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.8-dev5 +## 0.15.8 ### Enhancements @@ -10,6 +10,7 @@ ### Fixes +* **Replace `pillow-heif` with `pi-heif`**. Replaces `pillow-heif` with `pi-heif` due to more permissive licensing on the wheel for `pi-heif`. * **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. * **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file. diff --git a/requirements/base.txt b/requirements/base.txt index 5aa767b1c4..ff7f516bed 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -41,7 +41,7 @@ h11==0.14.0 # via httpcore httpcore==1.0.5 # via httpx -httpx==0.27.0 +httpx==0.27.2 # via unstructured-client idna==3.8 # via diff --git a/requirements/dev.txt b/requirements/dev.txt index 144e411f1d..4e81e1c2bc 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -354,7 +354,7 @@ wheel==0.44.0 # pip-tools widgetsnbextension==4.0.13 # via ipywidgets -zipp==3.20.0 +zipp==3.20.1 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 7a19cc2b2f..7470dc5d12 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -8,5 +8,5 @@ importlib-metadata==8.4.0 # via markdown markdown==3.7 # via -r ./extra-markdown.in -zipp==3.20.0 +zipp==3.20.1 # via importlib-metadata diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index b6ed058e6b..d2559fad70 100644 --- 
a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -43,7 +43,7 @@ httpcore==1.0.5 # via # -c ./base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./base.txt # paddlepaddle @@ -176,5 +176,5 @@ urllib3==1.26.19 # -c ././deps/constraints.txt # -c ./base.txt # requests -zipp==3.20.0 +zipp==3.20.1 # via importlib-resources diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 5e1e6b2b18..f8a746d687 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,7 +5,7 @@ onnx pdf2image pdfminer.six pikepdf -pillow_heif +pi_heif pypdf google-cloud-vision effdet diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 351b4cf15b..1f0a912556 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -53,7 +53,7 @@ google-auth==2.34.0 # google-cloud-vision google-cloud-vision==3.7.4 # via -r ./extra-pdf-image.in -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.64.0 # via # google-api-core # grpcio-status @@ -147,6 +147,8 @@ pdfminer-six==20231228 # pdfplumber pdfplumber==0.11.4 # via layoutparser +pi-heif==0.18.0 + # via -r ./extra-pdf-image.in pikepdf==9.2.0 # via -r ./extra-pdf-image.in pillow==10.4.0 @@ -155,12 +157,10 @@ pillow==10.4.0 # matplotlib # pdf2image # pdfplumber + # pi-heif # pikepdf - # pillow-heif # torchvision # unstructured-pytesseract -pillow-heif==0.18.0 - # via -r ./extra-pdf-image.in portalocker==2.10.1 # via iopath proto-plus==1.24.0 @@ -293,5 +293,5 @@ wrapt==1.16.0 # -c ././deps/constraints.txt # -c ./base.txt # deprecated -zipp==3.20.0 +zipp==3.20.1 # via importlib-resources diff --git a/requirements/ingest/astradb.txt b/requirements/ingest/astradb.txt index 0819720396..a135e510f0 100644 --- a/requirements/ingest/astradb.txt +++ b/requirements/ingest/astradb.txt @@ -51,7 +51,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx[http2]==0.27.0 +httpx[http2]==0.27.2 # via # -c 
./ingest/../base.txt # astrapy diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index 60ddec9384..790115267b 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -60,7 +60,7 @@ fsspec==2024.6.1 # via huggingface-hub google-auth==2.34.0 # via kubernetes -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.64.0 # via opentelemetry-exporter-otlp-proto-grpc grpcio==1.66.0 # via @@ -245,7 +245,7 @@ wrapt==1.16.0 # -c ./ingest/../deps/constraints.txt # deprecated # opentelemetry-instrumentation -zipp==3.20.0 +zipp==3.20.1 # via # importlib-metadata # importlib-resources diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index b60a8246e7..34f7251d0f 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -19,7 +19,7 @@ clarifai-grpc==10.7.1 # via clarifai contextlib2==21.6.0 # via schema -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.64.0 # via clarifai-grpc grpcio==1.66.0 # via @@ -61,7 +61,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # clarifai-grpc -rich==13.7.1 +rich==13.8.0 # via clarifai schema==0.7.5 # via clarifai diff --git a/requirements/ingest/databricks-volumes.txt b/requirements/ingest/databricks-volumes.txt index 82c9e569b6..c92dca41d1 100644 --- a/requirements/ingest/databricks-volumes.txt +++ b/requirements/ingest/databricks-volumes.txt @@ -15,7 +15,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -databricks-sdk==0.30.0 +databricks-sdk==0.31.0 # via -r ./ingest/databricks-volumes.in google-auth==2.34.0 # via databricks-sdk diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 62c002b236..b3fa7eb586 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -62,7 +62,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c 
./ingest/../base.txt # langsmith diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 6a16f71558..02b2bcca47 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -42,7 +42,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # langsmith diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index f6be9fb428..f35efdd14d 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -36,7 +36,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 05a9c12cc6..eeed8d3c89 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -36,7 +36,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # langsmith diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index f4ed5e7082..d28e64761d 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -88,7 +88,7 @@ google-resumable-media==2.7.2 # via # google-cloud-bigquery # google-cloud-storage -googleapis-common-protos[grpc]==1.63.2 +googleapis-common-protos[grpc]==1.64.0 # via # google-api-core # grpc-google-iam-v1 @@ -112,7 +112,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # langchain-google-vertexai diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 9ac0b49bc0..a576aa7396 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -53,7 +53,7 @@ httpcore==1.0.5 # via # 
-c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # langsmith diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index 2420b99281..253d11aa12 100644 --- a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -66,7 +66,7 @@ google-crc32c==1.5.0 # google-resumable-media google-resumable-media==2.7.2 # via google-cloud-storage -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.64.0 # via google-api-core idna==3.8 # via diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index f7dffc1152..69a57b6fcf 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -26,7 +26,7 @@ google-auth==2.34.0 # google-auth-httplib2 google-auth-httplib2==0.2.0 # via google-api-python-client -googleapis-common-protos==1.63.2 +googleapis-common-protos==1.64.0 # via google-api-core httplib2==0.22.0 # via diff --git a/requirements/ingest/notion.txt b/requirements/ingest/notion.txt index 9af65de90f..5f37a0e6af 100644 --- a/requirements/ingest/notion.txt +++ b/requirements/ingest/notion.txt @@ -28,7 +28,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./ingest/../base.txt # notion-client diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index 4b3fed860e..dbeb86d608 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -39,7 +39,7 @@ httpcore==1.0.5 # via # -c ./ingest/../base.txt # httpx -httpx[http2]==0.27.0 +httpx[http2]==0.27.2 # via # -c ./ingest/../base.txt # qdrant-client diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index 46202853cd..6297d91734 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -56,7 +56,7 @@ wheel==0.44.0 # via # -c ./ingest/../deps/constraints.txt # singlestoredb -zipp==3.20.0 +zipp==3.20.1 # via importlib-metadata # 
The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index e30f2298db..12557b287e 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -4,20 +4,12 @@ # # pip-compile ./ingest/weaviate.in # -annotated-types==0.7.0 - # via pydantic -anyio==4.4.0 - # via - # -c ./ingest/../base.txt - # httpx authlib==1.3.2 # via weaviate-client certifi==2024.7.4 # via # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt - # httpcore - # httpx # requests cffi==1.17.0 # via cryptography @@ -27,64 +19,16 @@ charset-normalizer==3.3.2 # requests cryptography==43.0.0 # via authlib -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -grpcio==1.66.0 - # via - # -c ./ingest/../deps/constraints.txt - # grpcio-health-checking - # grpcio-tools - # weaviate-client -grpcio-health-checking==1.62.3 - # via weaviate-client -grpcio-tools==1.62.3 - # via weaviate-client -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.0 - # via - # -c ./ingest/../base.txt - # weaviate-client idna==3.8 # via # -c ./ingest/../base.txt - # anyio - # httpx # requests -protobuf==4.23.4 - # via - # -c ./ingest/../deps/constraints.txt - # grpcio-health-checking - # grpcio-tools pycparser==2.22 # via cffi -pydantic==2.8.2 - # via weaviate-client -pydantic-core==2.20.1 - # via pydantic requests==2.32.3 # via # -c ./ingest/../base.txt # weaviate-client -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # pydantic - # pydantic-core urllib3==1.26.19 # via # -c ./ingest/../base.txt @@ -92,10 +36,7 @@ urllib3==1.26.19 # requests validators==0.33.0 # via weaviate-client -weaviate-client==4.7.1 +weaviate-client==3.26.7 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/weaviate.in - 
-# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/test.txt b/requirements/test.txt index 5d94ff581c..beac836a90 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -63,7 +63,7 @@ httpcore==1.0.5 # via # -c ./base.txt # httpx -httpx==0.27.0 +httpx==0.27.2 # via # -c ./base.txt # label-studio-sdk diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2b34f40314..a0e162f586 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.8-dev5" # pragma: no cover +__version__ = "0.15.8" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8b733c8e3d..4cfb0b8516 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -15,8 +15,8 @@ from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename +from pi_heif import register_heif_opener from PIL import Image as PILImage -from pillow_heif import register_heif_opener from pypdf import PdfReader from unstructured.chunking import add_chunking_strategy