Combining Inference and PEFT Tokens in a Batch #1135
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: "docker-build" | |
on: | |
pull_request: | |
paths: | |
- "docker/**" | |
- "!docker/README.md" | |
- ".github/workflows/docker-build.yml" | |
push: | |
branches: | |
- "inference" | |
- "master" | |
schedule: | |
# At 00:00 on day-of-month 1, 14, and 28. | |
- cron: "0 0 1,14,28 * *" | |
workflow_dispatch: | |
# Cancel outdated workflows if they are still running | |
concurrency: | |
group: docker-build-${{ github.head_ref || github.run_id }} | |
cancel-in-progress: true | |
jobs: | |
rocm-builder-start: | |
name: Start an AWS instance to build the ROCM Docker images | |
runs-on: ubuntu-latest | |
if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} | |
env: | |
ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} | |
steps: | |
- name: Configure AWS credentials | |
uses: aws-actions/configure-aws-credentials@v1 | |
with: | |
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
aws-region: us-east-2 | |
- name: Start EC2 instance | |
run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID | |
docker-build-rocm: | |
name: Build and Install FlexFlow in a Docker Container (ROCm backend) | |
runs-on: ubuntu-20.04 | |
if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }} | |
env: | |
FF_GPU_BACKEND: "hip_rocm" | |
hip_version: 5.6 | |
steps: | |
- name: Checkout Git Repository | |
uses: actions/checkout@v3 | |
with: | |
submodules: recursive | |
- name: Free additional space on runner | |
run: .github/workflows/helpers/free_space_on_runner.sh | |
- name: Build Docker container | |
run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow | |
- name: Check availability of flexflow modules in Python | |
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" | |
keep-runner-registered: | |
name: Keep runner alive | |
if: ${{ github.event_name == 'schedule' }} | |
runs-on: [self-hosted, rocm_builder] | |
defaults: | |
run: | |
shell: bash -l {0} # required to use an activated conda environment | |
env: | |
CONDA: "3" | |
needs: rocm-builder-start | |
steps: | |
- name: Keep alive | |
run: | | |
echo "Keep self-hosted runner registered with Github" | |
sleep 10m | |
docker-build-and-publish-rocm: | |
name: Build and Deploy FlexFlow Docker Containers (ROCm backend) | |
needs: rocm-builder-start | |
runs-on: [self-hosted, rocm_builder] | |
if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} | |
strategy: | |
matrix: | |
hip_version: ["5.3", "5.4", "5.5", "5.6"] | |
fail-fast: false | |
env: | |
FF_GPU_BACKEND: "hip_rocm" | |
hip_version: ${{ matrix.hip_version }} | |
steps: | |
- name: Checkout Git Repository | |
uses: actions/checkout@v3 | |
with: | |
submodules: recursive | |
- name: Build Docker container | |
# On push to inference, build for all compatible architectures, so that we can publish | |
# a pre-built general-purpose image. On all other cases, only build for one architecture | |
# to save time. | |
run: FF_HIP_ARCH=all ./docker/build.sh flexflow | |
- name: Check availability of flexflow modules in Python | |
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" | |
- name: Publish Docker environment image (on push to inference) | |
env: | |
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} | |
run: | | |
./docker/publish.sh flexflow-environment | |
./docker/publish.sh flexflow | |
docker-build-cuda: | |
name: Build and Install FlexFlow in a Docker Container (CUDA backend) | |
runs-on: ubuntu-20.04 | |
strategy: | |
matrix: | |
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"] | |
fail-fast: false | |
env: | |
FF_GPU_BACKEND: "cuda" | |
cuda_version: ${{ matrix.cuda_version }} | |
steps: | |
- name: Checkout Git Repository | |
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} | |
uses: actions/checkout@v3 | |
with: | |
submodules: recursive | |
- name: Free additional space on runner | |
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} | |
run: .github/workflows/helpers/free_space_on_runner.sh | |
- name: Build Docker container | |
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} | |
env: | |
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} | |
build_needed: ${{ matrix.cuda_version == '12.0' }} | |
run: | | |
# On push to inference, build for all compatible architectures, so that we can publish | |
# a pre-built general-purpose image. On all other cases, only build for one architecture | |
# to save time. | |
if [[ $deploy_needed == "true" ]] ; then | |
export FF_CUDA_ARCH=all | |
./docker/build.sh flexflow | |
elif [[ $build_needed == "true" ]]; then | |
export FF_CUDA_ARCH=86 | |
./docker/build.sh flexflow | |
fi | |
- name: Check availability of flexflow modules in Python | |
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }} | |
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'" | |
- name: Publish Docker environment image (on push to inference) | |
if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} | |
env: | |
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }} | |
run: | | |
./docker/publish.sh flexflow-environment | |
./docker/publish.sh flexflow | |
rocm-builder-stop: | |
needs: [docker-build-and-publish-rocm, keep-runner-registered] | |
if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }} | |
runs-on: ubuntu-latest | |
name: Stop the AWS instance we used to build the ROCM Docker images | |
env: | |
ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }} | |
steps: | |
- name: Configure AWS credentials | |
uses: aws-actions/configure-aws-credentials@v1 | |
with: | |
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} | |
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} | |
aws-region: us-east-2 | |
- name: Start EC2 instance | |
run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID | |
notify-slack: | |
name: Notify Slack in case of failure | |
runs-on: ubuntu-20.04 | |
needs: [docker-build-cuda, docker-build-and-publish-rocm] | |
if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }} | |
steps: | |
- name: Send Slack message | |
env: | |
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} | |
run: | | |
curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"Weekly FlexFlow Docker images build failed! <https://github.com/flexflow/FlexFlow/actions/runs/$GITHUB_RUN_ID|(See here).> :x: \"}" $SLACK_WEBHOOK |