forked from deepjavalibrary/djl-serving
-
Notifications
You must be signed in to change notification settings - Fork 0
161 lines (156 loc) · 6.47 KB
/
sagemaker-integration.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
name: SageMaker PythonSDK Integration Tests

# Triggers: manual dispatch (with optional overrides) and a cron schedule.
# NOTE(review): on scheduled runs `github.event.inputs.*` is empty, so the
# `default` values below only apply to manual (workflow_dispatch) runs.
on:
  workflow_dispatch:
    inputs:
      # Selects which container set the endpoint tests run against.
      mode:
        description: "release/nightly containers to test. Default is nightly"
        required: false
        default: 'nightly'
      # Optional repository/branch pair passed to install_sagemaker_pysdk.sh.
      sagemaker-repository:
        description: 'Link to GitHub repository for SageMaker Python SDK. Can be a personal fork.'
        required: false
        default: ''
      repository-branch:
        description: 'The branch from the SageMaker Python SDK fork to use for testing'
        required: false
        default: ''
      # Exported to the test jobs as the `run_benchmark` environment variable.
      run_benchmark:
        description: 'Runs benchmark and uploads to CloudWatch metrics if set to true.'
        required: false
        default: true
  schedule:
    # Every day at 04:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 4 * * *'
jobs:
  # Starts two self-hosted CPU runner instances and exposes their ids as job
  # outputs so that `stop-runners` can shut them down afterwards.
  create-runners:
    runs-on: [self-hosted, scheduler]
    steps:
      - name: Create new CPU instance
        id: create_cpu1
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # Fetch a runner registration token from the GitHub REST API.
          # NOTE(review): the pipeline's exit status is that of `tr`, so a curl
          # failure (--fail) does not abort this step by itself.
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
            --fail \
            | jq '.token' | tr -d '"' )
          ./start_instance.sh action_cpu $token djl-serving
      - name: Create new CPU instance
        id: create_cpu2
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          # Same token request as above, for the second instance.
          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
            --fail \
            | jq '.token' | tr -d '"' )
          ./start_instance.sh action_cpu $token djl-serving
    # Presumably `action_cpu_instance_id` is written to the step outputs by
    # start_instance.sh -- verify against that script.
    outputs:
      cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
      cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}

  # These tests are SLOW, and we only have 2 ml.g5.12xlarge instances available for testing
  # We parallelize the tests into two separate tracks, each using 1 instance for a few tests.
  # There's probably a better way to do this, but for now this works.
  # If you add a test, please try to keep the groups balanced
  endpoint-tests-group-1:
    runs-on: [self-hosted, cpu]
    timeout-minutes: 120
    needs: create-runners
    env:
      # NOTE(review): empty on scheduled runs, where no dispatch inputs exist.
      run_benchmark: ${{ github.event.inputs.run_benchmark }}
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python3
        uses: actions/setup-python@v4
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install -U boto3 awscli
      - name: Install SageMaker Python SDK
        working-directory: tests/integration
        # Both arguments expand to nothing when the inputs are empty/unset.
        run: |
          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-west-2
      # `mode` falls back to 'nightly' when the input is empty (scheduled runs).
      - name: MME Test
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py deepspeed-mme djl_mme ${{ github.event.inputs.mode || 'nightly' }}
      - name: Test gpt2xl
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30
      # `success() || failure()` keeps the remaining tests running after an
      # earlier test step failed, but still skips them if the run is cancelled.
      - name: Test stable diffusion
        if: success() || failure()
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py stable-diffusion-2-1-base djl ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30
      - name: Test opt-1.3b
        if: success() || failure()
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30

  # Second test track; runs in parallel with endpoint-tests-group-1.
  endpoint-tests-group-2:
    runs-on: [self-hosted, cpu]
    timeout-minutes: 120
    needs: create-runners
    env:
      # NOTE(review): empty on scheduled runs, where no dispatch inputs exist.
      run_benchmark: ${{ github.event.inputs.run_benchmark }}
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python3
        uses: actions/setup-python@v4
        with:
          python-version: '3.10.x'
      - name: Install pip dependencies
        run: pip3 install -U boto3 awscli
      - name: Install SageMaker Python SDK
        working-directory: tests/integration
        # Both arguments expand to nothing when the inputs are empty/unset.
        run: |
          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: us-west-2
      - name: Test gpt-j-6b
        if: success() || failure()
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30
      - name: Test gpt-neo-2.7b no code DeepSpeed
        if: success() || failure()
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py gpt-neo-2-7-b no_code ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30
      - name: Test DeepSpeed pythia-12b
        if: success() || failure()
        working-directory: tests/integration
        run: |
          python3 llm/sagemaker-endpoint-tests.py pythia-12b djl ${{ github.event.inputs.mode || 'nightly' }}
          echo "sleep 30 seconds to allow endpoint deletion"
          sleep 30

  # Always runs (even after failures or cancellation) so that the runner
  # instances and any leftover SageMaker resources are cleaned up.
  stop-runners:
    if: always()
    runs-on: [self-hosted, scheduler]
    needs: [create-runners, endpoint-tests-group-1, endpoint-tests-group-2]
    steps:
      - name: Cleanup dangling SageMaker resources
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          ./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
      - name: Stop all instances
        run: |
          cd /home/ubuntu/djl_benchmark_script/scripts
          instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
          ./stop_instance.sh $instance_id
          instance_id=${{ needs.create-runners.outputs.cpu_instance_id2 }}
          ./stop_instance.sh $instance_id