diff --git a/CHANGELOG.md b/CHANGELOG.md index f8dbd67395..2422dc70eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +## 0.15.5-dev0 + +### Enhancements + +### Features + +### Fixes + +* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility. + + ## 0.15.4 ### Enhancements diff --git a/docker/rockylinux-9.2/Dockerfile b/docker/rockylinux-9.2/Dockerfile index 18e9839005..3bce864e37 100644 --- a/docker/rockylinux-9.2/Dockerfile +++ b/docker/rockylinux-9.2/Dockerfile @@ -26,8 +26,7 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \ dnf -y groupremove "Development Tools" && \ dnf clean all -RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ - python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" +RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" FROM deps as code diff --git a/requirements/base.txt b/requirements/base.txt index f21db4fd37..7fa88148cd 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -69,7 +69,7 @@ mypy-extensions==1.0.0 # unstructured-client nest-asyncio==1.6.0 # via unstructured-client -nltk==3.8.2 +nltk==3.8.1 # via -r ./base.in numpy==1.26.4 # via -r ./base.in diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 56b0a82573..5faa0051d8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.4" # pragma: no cover +__version__ = "0.15.5-dev0" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index da68c2540d..edbb276c62 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -16,9 +16,9 @@ CACHE_MAX_SIZE: Final[int] = 128 -NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz" +NLTK_DATA_FILENAME = "nltk_data.tgz" NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}" -NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663" +NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61" # NOTE(robinson) - mimic default dir logic from NLTK @@ -114,10 +114,10 @@ def _download_nltk_packages_if_not_present(): tagger_available = check_for_nltk_package( package_category="taggers", - package_name="averaged_perceptron_tagger_eng", + package_name="averaged_perceptron_tagger", ) tokenizer_available = check_for_nltk_package( - package_category="tokenizers", package_name="punkt_tab" + package_category="tokenizers", package_name="punkt" ) if not (tokenizer_available and tagger_available):