From 9b778e270dd8547476370a9417520679cd46c802 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Wed, 14 Aug 2024 14:15:40 -0700 Subject: [PATCH] fix: `pytesseract>=0.3.12` installation error while installing `pdf` extra (#3522) Closes #3521. This PR resolves an installation error with `pytesseract>=0.3.12` that occurred during `pip install unstructured[pdf]==0.15.3`. ### Testing **Run the following command on the `main` branch and on this PR** ``` pip uninstall -y pytesseract && pip install ".[pdf]" ``` **Results** - `main` branch ``` INFO: pip is looking at multiple versions of unstructured[pdf] to determine which version is compatible with other requirements. This could take a while. ERROR: Could not find a version that satisfies the requirement pytesseract>=0.3.12; extra == "pdf" (from unstructured[pdf]) (from versions: 0.1, 0.1.3, 0.1.4, 0.1.5, 0.1.6, 0.1.7, 0.1.8, 0.1.9, 0.2.0, 0.2.2, 0.2.4, 0.2.5, 0.2.6, 0.2.7, 0.2.8, 0.2.9, 0.3.0, 0.3.1, 0.3.2, 0.3.3, 0.3.4, 0.3.5, 0.3.6, 0.3.7, 0.3.8, 0.3.9, 0.3.10) ERROR: No matching distribution found for pytesseract>=0.3.12; extra == "pdf" ``` - this PR: `pytesseract-0.3.13` should be installed successfully. 
--- CHANGELOG.md | 10 ++++++++++ requirements/deps/constraints.txt | 3 +-- requirements/extra-pdf-image.in | 4 +++- requirements/extra-pdf-image.txt | 4 +--- unstructured/__version__.py | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ff3e1b496..f8dbd67395 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.15.4 + +### Enhancements + +### Features + +### Fixes + +* **Resolve an installation error with `pytesseract>=0.3.12` that occurred during `pip install unstructured[pdf]==0.15.3`.** + ## 0.15.3 ### Enhancements diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 323ccc16fe..71c5097f66 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -22,8 +22,7 @@ Office365-REST-Python-Client<2.4.3 # unstructured-inference to be upgraded when unstructured library is upgraded # https://github.com/Unstructured-IO/unstructured/issues/1458 # unstructured-inference -# use the known compatible version of weaviate and pytesseract -pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 +# use the known compatible version of weaviate weaviate-client>3.25.0 # TODO: Pinned in transformers package, remove when that gets updated tokenizers>=0.19,<0.20 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 8c68a0fcdc..6fd3ba8703 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -12,4 +12,6 @@ effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. unstructured-inference==0.7.36 -pytesseract>=0.3.12 +# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository. +# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released. 
+pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index e9f36b682b..14fa114004 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -202,9 +202,7 @@ pypdf==4.3.1 pypdfium2==4.30.0 # via pdfplumber pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 - # via - # -c ././deps/constraints.txt - # -r ./extra-pdf-image.in + # via -r ./extra-pdf-image.in python-dateutil==2.9.0.post0 # via # -c ./base.txt diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e8949bf4f6..56b0a82573 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.3" # pragma: no cover +__version__ = "0.15.4" # pragma: no cover