diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..e1ca7558
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,4 @@
+.DS_Store
+.git
+.vscode
+data/
diff --git a/.env.sample b/.env.sample
new file mode 100644
index 00000000..f177831c
--- /dev/null
+++ b/.env.sample
@@ -0,0 +1,27 @@
+# See marker/settings.py for more options
+# The following are the default values. Uncomment and change as needed.
+
+# Please note the order of precedence for settings:
+# 1. Environment variables
+# 2. local.env file
+# 3. Default values in marker/settings.py
+
+# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support
+
+# TESSDATA_PREFIX setting is set in the Dockerfile
+
+## General settings:
+
+# TORCH_DEVICE=cpu
+
+# How much VRAM each GPU has (in GB).
+# INFERENCE_RAM=12
+
+# How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower.
+# VRAM_PER_TASK=2.5
+
+# Enable debug logging
+# DEBUG=False
+
+# Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
+# DEFAULT_LANG=English
\ No newline at end of file
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 00000000..8b0c2f30
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,34 @@
+name: Publish Docker image
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3.0.0
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build the Docker image (CPU version)
+        run: docker build . --file Dockerfile --tag gardner/marker:cpu-${{ github.sha }} --tag gardner/marker:latest --tag gardner/marker:cpu
+
+      # Push every tag applied by the build step above (cpu-<sha>, latest, cpu), not only the SHA tag.
+      - name: Push the Docker image (CPU version)
+        run: docker push --all-tags gardner/marker
+
+      - name: Build the Docker image (CUDA version)
+        run: docker build . --build-arg BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel --file Dockerfile --tag gardner/marker:cuda-${{ github.sha }} --tag gardner/marker:cuda-latest --tag gardner/marker:cuda
+
+      # Publishes cuda-<sha>, cuda-latest and cuda; the CPU tags pushed earlier are sent again unchanged.
+      - name: Push the Docker image (CUDA version)
+        run: docker push --all-tags gardner/marker
+
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..85c49042
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,73 @@
+ARG BASE_IMAGE=python:3.10-bookworm
+FROM ${BASE_IMAGE}
+
+WORKDIR /app
+
+ARG PIP_VERSION=24.0
+ARG POETRY_VERSION=1.7.1
+ARG GS_VERSION=10.02.1
+ARG TORCH_VERSION=2.1.2
+
+ENV DEBIAN_FRONTEND=noninteractive
+# NOTE: the release tag in this URL (gs10021) is hard-coded and must be changed together with GS_VERSION.
+ENV GS_URL=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs10021/ghostscript-${GS_VERSION}.tar.gz
+
+# Register the notesalexp.org Tesseract 5 apt repository, then install build tools, OCR tooling and language data.
+RUN apt-get update \
+    && apt-get -y install apt-transport-https lsb-release wget gnupg2 \
+    && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
+    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/notesalexp.list \
+    && apt-get update \
+    && apt-get install -y \
+        build-essential \
+        cmake \
+        libmagic1 \
+        libtesseract-dev \
+        ocrmypdf \
+        python3-dev \
+        python3-pip \
+        tesseract-ocr \
+        tesseract-ocr-deu \
+        tesseract-ocr-eng \
+        tesseract-ocr-fra \
+        tesseract-ocr-por \
+        tesseract-ocr-rus \
+        tesseract-ocr-spa \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build Ghostscript from the source tarball, install it, and delete the build tree and archive.
+RUN wget -q ${GS_URL} \
+    && tar -xvf ghostscript-${GS_VERSION}.tar.gz \
+    && cd ghostscript-${GS_VERSION} \
+    && ./configure \
+    && make -j $(nproc) \
+    && make install \
+    && cd .. \
+    && rm -rf ghostscript-${GS_VERSION} ghostscript-${GS_VERSION}.tar.gz
+
+# Pin pip and Poetry, and stop Poetry from creating a virtualenv.
+RUN pip install pip==${PIP_VERSION} \
+    && pip install poetry==${POETRY_VERSION} \
+    && poetry config virtualenvs.create false
+
+# If BASE_IMAGE is pytorch/pytorch:tag then pytorch will be installed with cuda support.
+# If pytorch is not installed, install the cpu version.
+RUN python -c "import torch" \
+    || pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
+        torch==${TORCH_VERSION} \
+        torchvision \
+        torchaudio==${TORCH_VERSION}
+
+# Only the dependency manifests are copied here; the rest of the source tree is copied in the final step.
+COPY ./pyproject.toml ./poetry.lock ./
+
+# --without dev is the current spelling of the deprecated --no-dev flag.
+RUN poetry install --without dev --no-interaction --no-ansi --no-root
+
+ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
+ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}
+
+# Test to make sure the TESSDATA_PREFIX is set correctly
+RUN find / -name tessdata 2> /dev/null | grep "${TESSDATA_PREFIX}"
+
+COPY . .
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..62de548f
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,29 @@
+# This example builds and runs the cuda version
+services:
+  marker:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        - BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel
+    command: python convert_single.py /input/thinkpython.pdf /output/thinkpython.md --parallel_factor 2 --max_pages 10
+    shm_size: '12gb'  # set this to the size of VRAM if possible
+    volumes:
+      - ./input:/input
+      - ./output:/output
+      - xdg_cache:/root/.cache
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TORCH_DEVICE=cuda
+      - INFERENCE_RAM=12
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+volumes:
+  xdg_cache:
\ No newline at end of file