Skip to content

SageMaker PythonSDK Integration Tests #109

SageMaker PythonSDK Integration Tests

SageMaker PythonSDK Integration Tests #109

name: SageMaker PythonSDK Integration Tests
on:
workflow_dispatch:
inputs:
mode:
description: "release/nightly containers to test. Default is nightly"
required: false
default: 'nightly'
sagemaker-repository:
description: 'Link to Github repository for SageMaker Python SDK. Can be a personal fork.'
required: false
default: ''
repository-branch:
description: 'The branch from the SagMaker Python SDK fork to use for testing'
required: false
default: ''
run_benchmark:
description: 'Runs benchmark and upload to cloud watch metrics'
required: false
default: 'true'
schedule:
- cron: '0 4 * * *'
jobs:
create-runners:
runs-on: [self-hosted, scheduler]
steps:
- name: Create new CPU instance
id: create_cpu1
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
- name: Create new CPU instance
id: create_cpu2
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_cpu $token djl-serving
outputs:
cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}
# These tests are SLOW, and we only have 2 ml.g5.12xlarge instances available for testing
# We parallelize the tests into two separate tracks, each using 1 instance for a few tests.
# There's probably a better way to do this, but for now this works.
# If you add a test, please try to keep the groups balanced
endpoint-tests-group-1:
runs-on: [ self-hosted, cpu ]
timeout-minutes: 120
needs: create-runners
env:
run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
image_type: ${{ github.event.inputs.mode || 'nightly' }}
steps:
- uses: actions/checkout@v3
- name: Set up Python3
uses: actions/setup-python@v4
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install -U boto3 awscli
- name: Install SageMaker Python SDK
working-directory: tests/integration
run: |
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v2
with:
aws-region: us-west-2
- name: MME Test
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py deepspeed-mme djl_mme ${image_type} ${run_benchmark}
- name: Test gpt2xl
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test stable diffusion
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py stable-diffusion-2-1-base djl ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test opt-1.3b
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
endpoint-tests-group-2:
runs-on: [ self-hosted, cpu ]
timeout-minutes: 120
needs: create-runners
env:
run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
image_type: ${{ github.event.inputs.mode || 'nightly' }}
steps:
- uses: actions/checkout@v3
- name: Set up Python3
uses: actions/setup-python@v4
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install -U boto3 awscli
- name: Install SageMaker Python SDK
working-directory: tests/integration
run: |
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v2
with:
aws-region: us-west-2
- name: Test gpt-j-6b
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test gpt-neo-2.7b no code DeepSpeed
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py gpt-neo-2-7-b no_code ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test DeepSpeed pythia-12b
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py pythia-12b djl ${image_type} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, endpoint-tests-group-1, endpoint-tests-group-2 ]
steps:
- name: Cleanup dangling SageMaker resources
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
- name: Stop all instances
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
./stop_instance.sh $instance_id
instance_id=${{ needs.create-runners.outputs.cpu_instance_id2 }}
./stop_instance.sh $instance_id