Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DRAFT: Add Dockerfile #18

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Paths excluded from the Docker build context.

# Version-control metadata
.git

# OS / editor artifacts
.DS_Store
.vscode

# Contents of the data/ directory
data/
27 changes: 27 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# See marker/settings.py for more options
# The following are the default values. Uncomment and change as needed.

# Please note the order of precedence for settings:
# 1. Environment variables
# 2. local.env file
# 3. Default values in marker/settings.py

# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support

# TESSDATA_PREFIX is set in the Dockerfile.

## General settings:

# TORCH_DEVICE=cpu

# How much VRAM each GPU has (in GB).
# INFERENCE_RAM=12

# How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower.
# VRAM_PER_TASK=2.5

# Enable debug logging
# DEBUG=False

# Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES
# DEFAULT_LANG=English
35 changes: 35 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Builds the CPU variant of the marker image on every push to master and
# publishes it to Docker Hub under the account named by the DOCKER_USERNAME
# secret.
name: Publish Docker image

on:
  push:
    branches:
      - master

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # The image is tagged three ways: an immutable per-commit tag plus the
      # moving `latest` and `cpu` tags.
      - name: Build the Docker image (CPU version)
        run: >-
          docker build .
          --file Dockerfile
          --tag ${{ secrets.DOCKER_USERNAME }}/marker:cpu-${{ github.sha }}
          --tag ${{ secrets.DOCKER_USERNAME }}/marker:latest
          --tag ${{ secrets.DOCKER_USERNAME }}/marker:cpu

      # Push every tag created by the build step. Pushing only the per-commit
      # tag would leave `latest` and `cpu` unpublished.
      - name: Push the Docker image (CPU version)
        run: docker push --all-tags ${{ secrets.DOCKER_USERNAME }}/marker

      # The following steps cause the GitHub Action to fail
      # with an out of disk space error:

      # - name: Build the Docker image (CUDA version)
      #   run: docker build . --build-arg BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel --file Dockerfile --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda-${{ github.sha }} --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda-latest --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda

      # - name: Push the Docker image (CUDA version)
      #   run: docker push ${{ secrets.DOCKER_USERNAME }}/marker:cuda-${{ github.sha }}

67 changes: 67 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Image for running marker.
#
# Build arguments:
#   BASE_IMAGE       base image; pass a pytorch/pytorch:<tag> image to get a
#                    CUDA-enabled torch instead of the CPU wheels
#   PIP_VERSION      pip version to pin
#   POETRY_VERSION   poetry version to pin
#   GS_VERSION       Ghostscript release built from source
#   TORCH_VERSION    torch/torchaudio version installed when the base image
#                    does not already provide torch
#   TESSDATA_PREFIX  location of the tesseract language data
ARG BASE_IMAGE=python:3.10-bookworm
FROM ${BASE_IMAGE}

WORKDIR /app

ARG PIP_VERSION=24.0
ARG POETRY_VERSION=1.7.1
ARG GS_VERSION=10.02.1
ARG TORCH_VERSION=2.1.2

ENV DEBIAN_FRONTEND=noninteractive
# NOTE: the release tag in this URL (gs10021) is hard-coded; when overriding
# GS_VERSION the tag must be updated to match.
ENV GS_URL=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs10021/ghostscript-${GS_VERSION}.tar.gz

# System packages. Tesseract 5 comes from the notesalexp.org repository; its
# signing key is stored in a dedicated keyring and referenced with signed-by
# instead of the deprecated `apt-key add`.
RUN apt-get update \
    && apt-get -y install apt-transport-https lsb-release wget gnupg2 \
    && wget -qO /usr/share/keyrings/notesalexp.asc https://notesalexp.org/debian/alexp_key.asc \
    && echo "deb [signed-by=/usr/share/keyrings/notesalexp.asc] https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/notesalexp.list \
    && apt-get update \
    && apt-get install -y \
        build-essential \
        cmake \
        libmagic1 \
        libtesseract-dev \
        ocrmypdf \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
    && rm -rf /var/lib/apt/lists/*

# Build and install Ghostscript from source, then remove the build tree so it
# does not remain in the layer.
RUN wget -q ${GS_URL} \
    && tar -xf ghostscript-${GS_VERSION}.tar.gz \
    && cd ghostscript-${GS_VERSION} \
    && ./configure \
    && make -j $(nproc) \
    && make install \
    && cd .. \
    && rm -rf ghostscript-${GS_VERSION} ghostscript-${GS_VERSION}.tar.gz

# Pin pip and poetry; install project dependencies into the system
# interpreter rather than a virtualenv.
RUN pip install --no-cache-dir pip==${PIP_VERSION} \
    && pip install --no-cache-dir poetry==${POETRY_VERSION} \
    && poetry config virtualenvs.create false

# If BASE_IMAGE is pytorch/pytorch:tag then pytorch will be installed with cuda support.
# If pytorch is not installed, install the cpu version.
RUN python -c "import torch" \
    || pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu/ \
        torch==${TORCH_VERSION} \
        torchvision \
        torchaudio==${TORCH_VERSION}

# Copy only the dependency manifests first so the dependency layer is reused
# when just the application source changes.
COPY ./pyproject.toml ./poetry.lock ./

# `--without dev` replaces the deprecated `--no-dev` flag.
RUN poetry install --without dev --no-interaction --no-ansi --no-root

ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}

# Test to make sure the TESSDATA_PREFIX is set correctly
RUN find / -name tessdata 2> /dev/null | grep "${TESSDATA_PREFIX}"

COPY . .
29 changes: 29 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Example Compose setup: builds the image on a CUDA-enabled PyTorch base and
# runs a single conversion with all NVIDIA GPUs reserved for the container.
services:
  marker:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        BASE_IMAGE: pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel
    # Folded scalar: the lines below are joined with single spaces into one
    # command line.
    command: >-
      python convert_single.py
      /input/thinkpython.pdf
      /output/thinkpython.md
      --parallel_factor 2
      --max_pages 10
    shm_size: "12gb"  # set this to the size of VRAM if possible
    volumes:
      # Host directories for the source PDF and the generated markdown.
      - ./input:/input
      - ./output:/output
      # Named volume so /root/.cache persists between runs.
      - xdg_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      TORCH_DEVICE: cuda
      INFERENCE_RAM: "12"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

volumes:
  xdg_cache: {}