From 10fa67e615990677c53f8a3a7b6dddaecb33fedf Mon Sep 17 00:00:00 2001
From: Gardner <gardner@bickford.nz>
Date: Thu, 7 Mar 2024 19:26:51 +1300
Subject: [PATCH] Add docker config

---
 .dockerignore                |  4 +++
 .env.sample                  | 27 +++++++++++++++
 .github/workflows/docker.yml | 32 +++++++++++++++++
 Dockerfile                   | 67 ++++++++++++++++++++++++++++++++++++
 docker-compose.yml           | 29 ++++++++++++++++
 5 files changed, 159 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .env.sample
 create mode 100644 .github/workflows/docker.yml
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..e1ca7558
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+# Editor, OS and VCS metadata
+.DS_Store
+.git
+.vscode
+data/
+# Host directories bind-mounted by docker-compose.yml at run time; keeping
+# them out of the build context avoids copying documents into the image.
+input/
+output/
diff --git a/.env.sample b/.env.sample
new file mode 100644
index 00000000..f177831c
--- /dev/null
+++ b/.env.sample
@@ -0,0 +1,27 @@
+# See marker/settings.py for more options
+# The following are the default values. Uncomment and change as needed.
+
+# Please note the order of precedence for settings:
+# 1. Environment variables
+# 2. local.env file
+# 3. Default values in marker/settings.py
+
+# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support
+
+# TESSDATA_PREFIX setting is set in the Dockerfile
+
+## General settings:
+
+# TORCH_DEVICE=cpu
+
+# How much VRAM each GPU has (in GB).
+# INFERENCE_RAM=12
+
+# How much VRAM to allocate per task (in GB).  Peak marker VRAM usage is around 3GB, but avg across workers is lower.
+# VRAM_PER_TASK=2.5
+
+# Enable debug logging
+# DEBUG=False
+
+# Default language we assume files to be in; must be one of the keys in TESSERACT_LANGUAGES
+# DEFAULT_LANG=English
\ No newline at end of file
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 00000000..8b0c2f30
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,54 @@
+# Builds the CPU and CUDA variants of the image from the same Dockerfile and
+# publishes them to Docker Hub on every push to master.
+name: Publish Docker image
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+
+    - name: Log in to Docker Hub
+      uses: docker/login-action@v3.0.0
+      with:
+        username: ${{ secrets.DOCKER_USERNAME }}
+        password: ${{ secrets.DOCKER_PASSWORD }}
+
+    # No BASE_IMAGE build-arg is passed, so the Dockerfile's default is used.
+    - name: Build the Docker image (CPU version)
+      run: >-
+        docker build .
+        --file Dockerfile
+        --tag gardner/marker:cpu-${{ github.sha }}
+        --tag gardner/marker:latest
+        --tag gardner/marker:cpu
+
+    # Push every tag applied by the build step; pushing only the SHA tag would
+    # leave the moving tags (latest, cpu) unpublished.
+    - name: Push the Docker image (CPU version)
+      run: |
+        docker push gardner/marker:cpu-${{ github.sha }}
+        docker push gardner/marker:latest
+        docker push gardner/marker:cpu
+
+    # Same Dockerfile, built on top of a CUDA-enabled PyTorch base image.
+    - name: Build the Docker image (CUDA version)
+      run: >-
+        docker build .
+        --build-arg BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel
+        --file Dockerfile
+        --tag gardner/marker:cuda-${{ github.sha }}
+        --tag gardner/marker:cuda-latest
+        --tag gardner/marker:cuda
+
+    - name: Push the Docker image (CUDA version)
+      run: |
+        docker push gardner/marker:cuda-${{ github.sha }}
+        docker push gardner/marker:cuda-latest
+        docker push gardner/marker:cuda
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..85c49042
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,79 @@
+# Image for running marker. BASE_IMAGE selects the variant: the default Python
+# image gets a CPU-only PyTorch installed below, while a pytorch/pytorch CUDA
+# image keeps the PyTorch build it already ships.
+ARG BASE_IMAGE=python:3.10-bookworm
+FROM ${BASE_IMAGE}
+
+WORKDIR /app
+
+# Pinned tool versions; each can be overridden with --build-arg.
+ARG PIP_VERSION=24.0
+ARG POETRY_VERSION=1.7.1
+ARG GS_VERSION=10.02.1
+ARG TORCH_VERSION=2.1.2
+
+ENV DEBIAN_FRONTEND=noninteractive
+# NOTE: the release tag in this URL (gs10021) is hard-coded; change it together
+# with GS_VERSION.
+ENV GS_URL=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs10021/ghostscript-${GS_VERSION}.tar.gz
+
+# Register the notesalexp.org apt repository (Tesseract 5 packages), then
+# install the build toolchain, OCR tools and Tesseract language data.
+RUN apt-get update \
+  && apt-get -y install apt-transport-https lsb-release wget gnupg2 \
+  && wget -O - https://notesalexp.org/debian/alexp_key.asc | apt-key add - \
+  && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/notesalexp.list \
+  && apt-get update \
+  && apt-get install -y \
+    build-essential \
+    cmake \
+    libmagic1 \
+    libtesseract-dev \
+    ocrmypdf \
+    python3-dev \
+    python3-pip \
+    tesseract-ocr \
+    tesseract-ocr-deu \
+    tesseract-ocr-eng \
+    tesseract-ocr-fra \
+    tesseract-ocr-por \
+    tesseract-ocr-rus \
+    tesseract-ocr-spa \
+  && rm -rf /var/lib/apt/lists/*
+
+# Build Ghostscript from source and remove the build tree afterwards.
+RUN wget -q ${GS_URL} \
+  && tar -xzf ghostscript-${GS_VERSION}.tar.gz \
+  && cd ghostscript-${GS_VERSION} \
+  && ./configure \
+  && make -j $(nproc) \
+  && make install \
+  && cd .. \
+  && rm -rf ghostscript-${GS_VERSION} ghostscript-${GS_VERSION}.tar.gz
+
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip install --no-cache-dir pip==${PIP_VERSION} \
+  && pip install --no-cache-dir poetry==${POETRY_VERSION} \
+  && poetry config virtualenvs.create false
+
+# If BASE_IMAGE is pytorch/pytorch:tag then pytorch will be installed with cuda support.
+# If pytorch is not installed, install the cpu version.
+RUN python -c "import torch" \
+  || pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu/ \
+    torch==${TORCH_VERSION} \
+    torchvision \
+    torchaudio==${TORCH_VERSION}
+
+# Copy only the dependency manifests first so the install layer stays cached.
+COPY ./pyproject.toml ./poetry.lock ./
+
+# `--only main` replaces the deprecated `--no-dev` flag (Poetry >= 1.2).
+RUN poetry install --only main --no-interaction --no-ansi --no-root
+
+ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
+ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}
+
+# Fail the build if no tessdata directory exists at TESSDATA_PREFIX.
+RUN find / -name tessdata 2> /dev/null | grep "${TESSDATA_PREFIX}"
+
+COPY . .
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..62de548f
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,35 @@
+# Example service: builds the CUDA variant of the image and runs
+# convert_single.py on a sample PDF.
+services:
+  marker:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        BASE_IMAGE: "pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel"
+    command: >-
+      python convert_single.py
+      /input/thinkpython.pdf /output/thinkpython.md
+      --parallel_factor 2 --max_pages 10
+    shm_size: '12gb'  # set this to the size of VRAM if possible
+    volumes:
+      - ./input:/input
+      - ./output:/output
+      - xdg_cache:/root/.cache
+    environment:
+      NVIDIA_VISIBLE_DEVICES: "all"
+      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
+      TORCH_DEVICE: "cuda"
+      INFERENCE_RAM: "12"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities:
+                - gpu
+
+# Named volume mounted at /root/.cache inside the container.
+volumes:
+  xdg_cache: {}
\ No newline at end of file