
Commit

build torch and apex in a base image
francoishernandez committed Jun 4, 2024
1 parent 8917d33 commit 66d1c21
Showing 7 changed files with 138 additions and 53 deletions.
26 changes: 16 additions & 10 deletions .github/workflows/build.yml
@@ -7,22 +7,28 @@ on:
description: "EOLE version"
required: true
type: string
# to facilitate initial tests in PR
push:
branches:
- "docker"
torch_version:
description: "PyTorch version"
required: true
type: string
cuda_version:
description: "CUDA version"
required: true
type: string

run-name: ${{ github.workflow }} -- ${{ inputs.eole_version || 'test' }}
run-name: ${{ github.workflow }} -- ${{ inputs.eole_version }} torch:${{ inputs.torch_version }}, cuda:${{ inputs.cuda_version }}

env:
EOLE_VERSION: ${{ inputs.eole_version || 'test' }}
EOLE_VERSION: ${{ inputs.eole_version }}
TORCH_VERSION: ${{ inputs.torch_version }}
CUDA_VERSION: ${{ inputs.cuda_version }}

jobs:
build:
runs-on: ubuntu-22.04
strategy:
matrix:
cuda_version: [11.8.0, 12.1.0]
# strategy:
# matrix:
# cuda_version: [11.8.0, 12.1.0]
permissions: write-all
steps:
- name: Checkout repo
@@ -35,4 +41,4 @@ jobs:
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build
run: |
docker/build.sh ${{ env.EOLE_VERSION }} ${{ matrix.cuda_version}}
docker/build.sh ${{ env.EOLE_VERSION }} ${{ env.TORCH_VERSION }} ${{ env.CUDA_VERSION }}
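
For illustration, the updated workflow could be dispatched from the command line roughly as follows; the version values are placeholders and the gh invocation is a sketch, not part of this commit:

# Sketch: trigger the build workflow with its three required inputs
gh workflow run build.yml -f eole_version=0.1.0 -f torch_version=2.3.0 -f cuda_version=12.1.0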
39 changes: 39 additions & 0 deletions .github/workflows/build_base.yml
@@ -0,0 +1,39 @@
name: Build Base Docker Image

on:
workflow_dispatch:
inputs:
torch_version:
description: "PyTorch version"
required: true
type: string
cuda_version:
description: "CUDA version"
required: true
type: string

run-name: ${{ github.workflow }} -- torch:${{ inputs.torch_version }} -- cuda:${{ inputs.cuda_version }}

env:
TORCH_VERSION: ${{ inputs.torch_version }}
CUDA_VERSION: ${{ inputs.cuda_version }}

jobs:
build:
runs-on: ubuntu-22.04
# strategy:
# matrix:
# cuda_version: [11.8.0, 12.1.0]
permissions: write-all
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Login to ghcr
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build
run: |
docker/build_base.sh ${{ env.TORCH_VERSION }} ${{ env.CUDA_VERSION }}
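
Since docker/Dockerfile now starts FROM the eole-base image, this workflow has to run and push its tag before the main build workflow for a given torch/CUDA pair. A sketch of a manual dispatch, again with illustrative versions:

# Sketch: publish the base image first; the eole image build depends on it
gh workflow run build_base.yml -f torch_version=2.3.0 -f cuda_version=12.1.0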
37 changes: 2 additions & 35 deletions docker/Dockerfile
@@ -1,40 +1,7 @@
ARG TORCH_VERSION=2.3.0
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04

RUN apt-get update && apt-get install -y locales gcc g++ python3-dev
RUN apt-get update && apt-get install -y \
git \
python3-pip \
python3-dev \
libprotobuf-dev \
libprotobuf-c-dev

RUN pip3 install --upgrade pip
RUN pip3 install packaging

# Install torch
RUN CU=$(echo "${CUDA_VERSION%.*}" | sed 's/\.//g'); pip3 install torch --index-url "https://download.pytorch.org/whl/cu$CU"

# Install apex
RUN mkdir /setup
WORKDIR /setup
RUN git clone https://github.com/nvidia/apex
WORKDIR /setup/apex
RUN pip3 install ninja
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.6"
RUN pip3 install -v --no-build-isolation \
--config-settings --global-option="--cpp_ext" \
--config-settings --global-option="--cuda_ext" \
--config-settings --global-option="--deprecated_fused_adam" \
--global-option="--xentropy" \
--global-option="--fast_multihead_attn" \
./

# Install flash-attention
RUN pip install flash-attn --no-build-isolation

# Install AutoAWQ
RUN pip install autoawq
FROM ghcr.io/eole-nlp/eole-base:torch$TORCH_VERSION-ubuntu22.04-cuda$CUDA_VERSION

COPY . /eole
WORKDIR /eole
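For reference, with the default build args above (TORCH_VERSION=2.3.0, CUDA_VERSION=12.1.0) the new FROM line resolves to the tag pushed by docker/build_base.sh; pulling it directly would look like this sketch:

# Sketch: the base reference the Dockerfile resolves to with the default build args
docker pull ghcr.io/eole-nlp/eole-base:torch2.3.0-ubuntu22.04-cuda12.1.0
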
43 changes: 43 additions & 0 deletions docker/Dockerfile-base
@@ -0,0 +1,43 @@
# base image with torch and apex
# relatively lighter than the full-fledged ngc pytorch images
# ARG TORCH_VERSION=2.3.0
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04

ARG TORCH_VERSION=2.3.0


RUN apt-get update && apt-get install -y \
libprotobuf-dev \
libprotobuf-c-dev \
g++ \
gcc \
git \
locales \
python3-dev \
python3-pip


RUN pip3 install --upgrade pip
RUN pip3 install packaging

# Install torch
RUN CU=$(echo "${CUDA_VERSION%.*}" | sed 's/\.//g'); pip3 install torch==$TORCH_VERSION --index-url "https://download.pytorch.org/whl/cu$CU"

# Install apex
RUN mkdir /setup
WORKDIR /setup
RUN git clone https://github.com/nvidia/apex
WORKDIR /setup/apex
RUN pip3 install ninja
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.6;8.7;9.0+PTX"
RUN export MAX_JOBS=$(nproc); pip3 install -v --no-build-isolation \
--config-settings --global-option="--cpp_ext" \
--config-settings --global-option="--cuda_ext" \
--config-settings --global-option="--deprecated_fused_adam" \
--global-option="--xentropy" \
--global-option="--fast_multihead_attn" \
./

# Install flash-attention
RUN pip install flash-attn --no-build-isolation
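
A quick sanity check of the resulting base image might look like the sketch below; it assumes a local NVIDIA container runtime and the tag produced by docker/build_base.sh:

# Sketch: confirm torch, apex and flash-attn import and that CUDA is visible
docker run --rm --gpus all ghcr.io/eole-nlp/eole-base:torch2.3.0-ubuntu22.04-cuda12.1.0 \
  python3 -c "import torch, apex, flash_attn; print(torch.__version__, torch.cuda.is_available())"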
15 changes: 8 additions & 7 deletions docker/build.sh
@@ -1,7 +1,7 @@
#!/bin/bash
#
# Build and push version X of OpenNMT-py with CUDA Y:
# ./build.sh X Y
# Build and push version X of EOLE with torch Y and CUDA Z:
# ./build.sh X Y Z

set -e

@@ -15,14 +15,15 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
ROOT_DIR=$DIR/..
cd $ROOT_DIR


EOLE_VERSION="$1"
CUDA_VERSION="$2"
[ -z "$CUDA_VERSION" ] && CUDA_VERSION="12.1.0"
TORCH_VERSION="$2"
CUDA_VERSION="$3"

IMAGE="ghcr.io/eole-nlp/eole"
TAG="$EOLE_VERSION-ubuntu22.04-cuda${CUDA_VERSION%.*}"
TAG="$EOLE_VERSION-torch$TORCH_VERSION-ubuntu22.04-cuda${CUDA_VERSION%.*}"

echo "Building $IMAGE:$TAG with CUDA_VERSION=$CUDA_VERSION"
echo "Building $IMAGE:$TAG with TORCH_VERSION=$TORCH_VERSION,CUDA_VERSION=$CUDA_VERSION"

docker build -t $IMAGE:$TAG --progress=plain -f docker/Dockerfile --build-arg CUDA_VERSION=$CUDA_VERSION --no-cache .
docker build -t $IMAGE:$TAG --progress=plain -f docker/Dockerfile --build-arg TORCH_VERSION=$TORCH_VERSION --build-arg CUDA_VERSION=$CUDA_VERSION --no-cache .
docker push $IMAGE:$TAG
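
An example invocation under the new three-argument signature (the version numbers here are illustrative):

# EOLE version, torch version, CUDA version
docker/build.sh 0.1.0 2.3.0 12.1.0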
28 changes: 28 additions & 0 deletions docker/build_base.sh
@@ -0,0 +1,28 @@
#!/bin/bash
#
# Build and push base image with torch version X and CUDA version Y:
# ./build_base.sh X Y

set -e

# allow user to run this script from anywhere
# from https://stackoverflow.com/a/246128
# one-liner which will give you the full directory name
# of the script no matter where it is being called from
unset CDPATH
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

ROOT_DIR=$DIR/..
cd $ROOT_DIR

TORCH_VERSION="$1"
CUDA_VERSION="$2"
[ -z "$CUDA_VERSION" ] && CUDA_VERSION="12.1.0"

IMAGE="ghcr.io/eole-nlp/eole-base"
TAG="torch$TORCH_VERSION-ubuntu22.04-cuda$CUDA_VERSION"

echo "Building $IMAGE:$TAG with TORCH_VERSION=$TORCH_VERSION,CUDA_VERSION=$CUDA_VERSION"

docker build -t $IMAGE:$TAG --progress=plain -f docker/Dockerfile-base --build-arg TORCH_VERSION=$TORCH_VERSION --build-arg CUDA_VERSION=$CUDA_VERSION --no-cache .
docker push $IMAGE:$TAG
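
The matching base-image build, which needs to be pushed before docker/build.sh can succeed (versions again illustrative; CUDA defaults to 12.1.0 when omitted):

# torch version, CUDA version
docker/build_base.sh 2.3.0 12.1.0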
3 changes: 2 additions & 1 deletion requirements.opt.txt
@@ -3,4 +3,5 @@ rapidfuzz
scipy
bitsandbytes>=0.41.2
spacy
gradio
gradio
autoawq
