-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
162 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Paths excluded by this ignore file.
# NOTE(review): presumably this is the .dockerignore for the Docker build context - confirm.

# Version-control metadata
.git

# OS and editor artifacts
.DS_Store
.vscode

# Local data directory
data/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# See marker/settings.py for more options | ||
# The following are the default values. Uncomment and change as needed. | ||
|
||
# Please note the order of precedence for settings: | ||
# 1. Environment variables | ||
# 2. local.env file | ||
# 3. Default values in marker/settings.py | ||
|
||
# See https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support | ||
|
||
# TESSDATA_PREFIX setting is set in the Dockerfile | ||
|
||
## General settings: | ||
|
||
# TORCH_DEVICE=cpu | ||
|
||
# How much VRAM each GPU has (in GB). | ||
# INFERENCE_RAM=12 | ||
|
||
# How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower. | ||
# VRAM_PER_TASK=2.5 | ||
|
||
# Enable debug logging | ||
# DEBUG=False | ||
|
||
# Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES | ||
# DEFAULT_LANG=English |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Builds the CPU Docker image on every push to master and publishes it to Docker Hub.
# Requires the repository secrets DOCKER_USERNAME and DOCKER_PASSWORD.
name: Publish Docker image

on:
  push:
    branches:
      - master

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      # NOTE(review): the action reference was unreadable in the reviewed source
      # (scrape artifact). docker/login-action@v3 is assumed - confirm the intended version.
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Build the Docker image (CPU version)
        run: docker build . --file Dockerfile --tag ${{ secrets.DOCKER_USERNAME }}/marker:cpu-${{ github.sha }} --tag ${{ secrets.DOCKER_USERNAME }}/marker:latest --tag ${{ secrets.DOCKER_USERNAME }}/marker:cpu

      # Push every tag created by the build step. Pushing only the SHA tag would
      # leave the "latest" and "cpu" tags unpublished.
      - name: Push the Docker image (CPU version)
        run: |
          docker push ${{ secrets.DOCKER_USERNAME }}/marker:cpu-${{ github.sha }}
          docker push ${{ secrets.DOCKER_USERNAME }}/marker:latest
          docker push ${{ secrets.DOCKER_USERNAME }}/marker:cpu

      # The following steps cause the GitHub Action to fail
      # with an out of disk space error:

      # - name: Build the Docker image (CUDA version)
      #   run: docker build . --build-arg BASE_IMAGE=pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel --file Dockerfile --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda-${{ github.sha }} --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda-latest --tag ${{ secrets.DOCKER_USERNAME }}/marker:cuda

      # - name: Push the Docker image (CUDA version)
      #   run: |
      #     docker push ${{ secrets.DOCKER_USERNAME }}/marker:cuda-${{ github.sha }}
      #     docker push ${{ secrets.DOCKER_USERNAME }}/marker:cuda-latest
      #     docker push ${{ secrets.DOCKER_USERNAME }}/marker:cuda
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Base image is overridable at build time (--build-arg BASE_IMAGE=...).
ARG BASE_IMAGE=python:3.10-bookworm
FROM ${BASE_IMAGE}

WORKDIR /app

# Pinned tool versions; override with --build-arg as needed.
ARG PIP_VERSION=24.0
ARG POETRY_VERSION=1.7.1
ARG GS_VERSION=10.02.1
# Release tag of the ghostpdl-downloads release that hosts the GS_VERSION tarball.
# Keep it in sync with GS_VERSION when bumping Ghostscript.
ARG GS_TAG=gs10021
ARG TORCH_VERSION=2.1.2

# Declared as ARG rather than ENV so it applies to the RUN steps of this build
# without being baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV GS_URL=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/${GS_TAG}/ghostscript-${GS_VERSION}.tar.gz
|
||
# Register the notesalexp.org Tesseract 5 apt repository and install the OS-level
# build and OCR packages. The repository signing key is stored as a dedicated
# keyring file referenced through "signed-by" instead of the deprecated `apt-key add`;
# this also removes the `wget | apt-key` pipe, which had no pipefail protection.
# The apt package lists are removed in the same layer to keep the image smaller.
RUN apt-get update \
    && apt-get -y install apt-transport-https lsb-release wget gnupg2 \
    && wget -qO /usr/share/keyrings/notesalexp.asc https://notesalexp.org/debian/alexp_key.asc \
    && echo "deb [signed-by=/usr/share/keyrings/notesalexp.asc] https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" > /etc/apt/sources.list.d/notesalexp.list \
    && apt-get update \
    && apt-get install -y \
        build-essential \
        cmake \
        libmagic1 \
        libtesseract-dev \
        ocrmypdf \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-por \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
    && rm -rf /var/lib/apt/lists/*
|
||
# Download the Ghostscript source tarball from GS_URL, build and install it, then
# delete the sources and the archive in the same layer so they do not stay in the image.
# The build runs in a subshell so the working directory of the outer chain is unchanged;
# expansions are quoted to avoid word splitting.
RUN wget -q "${GS_URL}" \
    && tar -xf "ghostscript-${GS_VERSION}.tar.gz" \
    && ( \
        cd "ghostscript-${GS_VERSION}" \
        && ./configure \
        && make -j "$(nproc)" \
        && make install \
    ) \
    && rm -rf "ghostscript-${GS_VERSION}" "ghostscript-${GS_VERSION}.tar.gz"
|
||
# Pin pip and Poetry to the versions given by the build args, and make Poetry install
# into the current interpreter instead of creating a virtualenv.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir pip==${PIP_VERSION} \
    && pip install --no-cache-dir poetry==${POETRY_VERSION} \
    && poetry config virtualenvs.create false
|
||
# If BASE_IMAGE is pytorch/pytorch:tag then pytorch will be installed with cuda support.
# If pytorch is not installed, install the cpu version.
# --no-cache-dir keeps the downloaded wheels out of the image layer.
RUN python -c "import torch" \
    || pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu/ \
        torch==${TORCH_VERSION} \
        torchvision \
        torchaudio==${TORCH_VERSION}
|
||
# Copy only the dependency manifests first so the dependency layer below stays cached
# until pyproject.toml or poetry.lock change.
COPY ./pyproject.toml ./poetry.lock ./

# Install runtime dependencies only, without installing the project itself (--no-root).
# `--only main` replaces the deprecated `--no-dev` flag.
RUN poetry install --only main --no-interaction --no-ansi --no-root
|
||
# Location of the Tesseract language data; overridable at build time and exported
# to the runtime environment.
ARG TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
ENV TESSDATA_PREFIX=${TESSDATA_PREFIX}

# Fail the build early if TESSDATA_PREFIX does not point to an existing directory.
# A direct directory test avoids scanning the whole filesystem and cannot pass on a
# mere substring match of the path.
RUN test -d "${TESSDATA_PREFIX}" \
    || { echo "TESSDATA_PREFIX is not an existing directory: ${TESSDATA_PREFIX}" >&2; exit 1; }

# Copy the application source last so source changes do not invalidate the layers above.
COPY . .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Example Compose setup that builds the image on the CUDA base image and runs a
# single conversion with GPU access.
services:
  marker:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        BASE_IMAGE: pytorch/pytorch:2.1.1-cuda12.1-cudnn8-devel
    command: python convert_single.py /input/thinkpython.pdf /output/thinkpython.md --parallel_factor 2 --max_pages 10
    # Set this to the size of VRAM if possible.
    shm_size: '12gb'
    volumes:
      - ./input:/input
      - ./output:/output
      # Named volume mounted at /root/.cache so its contents persist between runs.
      - xdg_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      NVIDIA_DRIVER_CAPABILITIES: compute,utility
      TORCH_DEVICE: cuda
      INFERENCE_RAM: "12"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu

volumes:
  xdg_cache: