diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30861fd119..43de403b66 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,7 +72,6 @@ jobs: - name: Install all doc and test dependencies run: | make install-ci - make install-paddleocr make install-all-ingest make check-licenses diff --git a/Dockerfile b/Dockerfile index eb7fd5b294..c0a67933a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,6 @@ RUN chown -R notebook-user:notebook-user /app && \ USER notebook-user RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \ - pip3.11 install unstructured.paddlepaddle && \ python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \ python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" diff --git a/Makefile b/Makefile index 64c8233188..6d2105e942 100644 --- a/Makefile +++ b/Makefile @@ -277,10 +277,6 @@ install-local-inference: install install-all-docs install-pandoc: ARCH=${ARCH} ./scripts/install-pandoc.sh -.PHONY: install-paddleocr -install-paddleocr: - ARCH=${ARCH} ./scripts/install-paddleocr.sh - ## pip-compile: compiles all base/dev/test requirements .PHONY: pip-compile pip-compile: diff --git a/requirements/extra-paddleocr.in b/requirements/extra-paddleocr.in index 0abb44b74d..66031f140f 100644 --- a/requirements/extra-paddleocr.in +++ b/requirements/extra-paddleocr.in @@ -1,4 +1,5 @@ -c ./deps/constraints.txt -c base.txt +paddlepaddle==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ unstructured.paddleocr==2.8.0.1 diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 5ba24f7463..ea2bbe0877 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,6 +4,13 @@ # # pip-compile ./extra-paddleocr.in # +anyio==3.7.1 + # via + # -c ././deps/constraints.txt + # -c ./base.txt + # httpx +astor==0.8.1 + # via paddlepaddle attrdict==2.0.1 # via unstructured-paddleocr cachetools==5.4.0 @@ -12,6 +19,8 @@ certifi==2024.7.4 # via # -c ././deps/constraints.txt # -c ./base.txt + # httpcore + # httpx # requests charset-normalizer==3.3.2 # via @@ -27,13 +36,33 @@ cycler==0.12.1 # via matplotlib cython==3.0.11 # via unstructured-paddleocr +decorator==5.1.1 + # via paddlepaddle et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.2.2 + # via + # -c ./base.txt + # anyio fonttools==4.53.1 # via matplotlib +h11==0.14.0 + # via + # -c ./base.txt + # httpcore +httpcore==1.0.5 + # via + # -c ./base.txt + # httpx +httpx==0.27.0 + # via + # -c ./base.txt + # paddlepaddle idna==3.7 # via # -c ./base.txt + # anyio + # httpx # requests imageio==2.34.2 # via @@ -61,7 +90,9 @@ matplotlib==3.7.2 more-itertools==10.4.0 # via cssutils networkx==3.2.1 - # via scikit-image + # via + # paddlepaddle + # scikit-image numpy==1.26.4 # via # -c ./base.txt @@ -71,6 +102,8 @@ numpy==1.26.4 # matplotlib # opencv-contrib-python # opencv-python + # opt-einsum + # paddlepaddle # scikit-image # scipy # shapely @@ -87,6 +120,8 @@ opencv-python==4.8.0.76 # unstructured-paddleocr openpyxl==3.1.5 # via unstructured-paddleocr +opt-einsum==3.3.0 + # via paddlepaddle packaging==23.2 # via # -c ././deps/constraints.txt @@ -94,6 +129,8 @@ packaging==23.2 # lazy-loader # matplotlib # scikit-image +paddlepaddle==3.0.0b1 + # via -r ./extra-paddleocr.in pdf2image==1.17.0 # via unstructured-paddleocr pillow==10.4.0 @@ -101,11 +138,16 @@ pillow==10.4.0 # imageio # imgaug # matplotlib + # paddlepaddle # pdf2image # scikit-image # unstructured-paddleocr premailer==3.10.0 # via unstructured-paddleocr +protobuf==4.23.4 + # via + # -c ././deps/constraints.txt + # paddlepaddle pyclipper==1.3.0.post5 # via unstructured-paddleocr pyparsing==3.0.9 @@ -146,12 +188,21 @@ six==1.16.0 # attrdict # imgaug # python-dateutil +sniffio==1.3.1 + # via + # -c ./base.txt + # anyio + # httpx tifffile==2024.7.24 # via scikit-image tqdm==4.66.5 # via # -c ./base.txt # unstructured-paddleocr +typing-extensions==4.12.2 + # via + # -c ./base.txt + # paddlepaddle unstructured-paddleocr==2.8.0.1 # via -r ./extra-paddleocr.in urllib3==1.26.19 diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh deleted file mode 100755 index 9ed7359a12..0000000000 --- a/scripts/install-paddleocr.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -# aarch64 requires a custom build of paddlepaddle -if [ "${ARCH}" = "aarch64" ]; then - python3 -m pip install unstructured.paddlepaddle -else - python3 -m pip install paddlepaddle -fi -python3 -m pip install unstructured.paddleocr