SageMaker PythonSDK Integration Tests #109

Workflow file for this run

.github/workflows/sagemaker-integration.yml at dc52d18

	# Workflow identity and triggers.
	# Runs either on demand (workflow_dispatch, with the optional inputs below)
	# or on the cron schedule at the bottom of this block.
	name: SageMaker PythonSDK Integration Tests

	on:
	  workflow_dispatch:
	    inputs:
	      # Selects which container images are exercised; jobs fall back to
	      # 'nightly' when this input is absent (e.g. on scheduled runs).
	      mode:
	        description: "release/nightly containers to test. Default is nightly"
	        required: false
	        default: 'nightly'
	      # Both SDK inputs are forwarded to install_sagemaker_pysdk.sh by the jobs.
	      sagemaker-repository:
	        description: 'Link to Github repository for SageMaker Python SDK. Can be a personal fork.'
	        required: false
	        default: ''
	      repository-branch:
	        description: 'The branch from the SageMaker Python SDK fork to use for testing'
	        required: false
	        default: ''
	      # Kept as a quoted string: it is passed verbatim as a CLI argument.
	      run_benchmark:
	        description: 'Runs benchmark and upload to cloud watch metrics'
	        required: false
	        default: 'true'
	  schedule:
	    # Daily at 04:00 (GitHub evaluates cron schedules in UTC).
	    - cron: '0 4 * * *'

	jobs:
	  # Starts two CPU instances that register themselves as self-hosted runners
	  # for this repository; their ids are exported so stop-runners can stop them.
	  create-runners:
	    runs-on: [self-hosted, scheduler]
	    steps:
	      - name: Create new CPU instance
	        id: create_cpu1
	        run: |
	          # The default `bash -e` shell does not enable pipefail, so without
	          # this a failed `curl --fail` would be hidden by jq's exit status and
	          # start_instance.sh would be invoked with an empty token.
	          set -o pipefail
	          cd /home/ubuntu/djl_benchmark_script/scripts
	          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	            --fail \
	            | jq -r '.token' )
	          ./start_instance.sh action_cpu "$token" djl-serving
	      - name: Create new CPU instance
	        id: create_cpu2
	        run: |
	          # See create_cpu1: pipefail makes a failed token request abort the step.
	          set -o pipefail
	          cd /home/ubuntu/djl_benchmark_script/scripts
	          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	            https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	            --fail \
	            | jq -r '.token' )
	          ./start_instance.sh action_cpu "$token" djl-serving
	    # NOTE(review): action_cpu_instance_id is presumably written to the step
	    # outputs by start_instance.sh - confirm against that script.
	    outputs:
	      cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
	      cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}

	  # These tests are SLOW, and we only have 2 ml.g5.12xlarge instances available for testing
	  # We parallelize the tests into two separate tracks, each using 1 instance for a few tests.
	  # There's probably a better way to do this, but for now this works.
	  # If you add a test, please try to keep the groups balanced
	  endpoint-tests-group-1:
	    runs-on: [self-hosted, cpu]
	    timeout-minutes: 120
	    needs: create-runners
	    env:
	      # Scheduled runs carry no inputs, hence the `||` fallbacks.
	      run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
	      image_type: ${{ github.event.inputs.mode || 'nightly' }}
	    steps:
	      - uses: actions/checkout@v3
	      - name: Set up Python3
	        uses: actions/setup-python@v4
	        with:
	          python-version: '3.10.x'
	      - name: Install pip dependencies
	        run: pip3 install -U boto3 awscli
	      - name: Install SageMaker Python SDK
	        working-directory: tests/integration
	        # Both arguments are real `${{ }}` expressions (no space after `$`).
	        # They are left unquoted so that empty inputs expand to no argument;
	        # NOTE(review): confirm the script's defaults when called without args.
	        run: |
	          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
	      - name: Configure AWS Credentials
	        uses: aws-actions/configure-aws-credentials@v2
	        with:
	          aws-region: us-west-2
	      - name: MME Test
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py deepspeed-mme djl_mme ${image_type} ${run_benchmark}
	      - name: Test gpt2xl
	        # Run even when an earlier test step failed, like every other model
	        # test in this workflow, so one failure does not hide later results.
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30
	      - name: Test stable diffusion
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py stable-diffusion-2-1-base djl ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30
	      - name: Test opt-1.3b
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30

	  # Second test track; same setup as group 1, different models.
	  endpoint-tests-group-2:
	    runs-on: [self-hosted, cpu]
	    timeout-minutes: 120
	    needs: create-runners
	    env:
	      # Scheduled runs carry no inputs, hence the `||` fallbacks.
	      run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
	      image_type: ${{ github.event.inputs.mode || 'nightly' }}
	    steps:
	      - uses: actions/checkout@v3
	      - name: Set up Python3
	        uses: actions/setup-python@v4
	        with:
	          python-version: '3.10.x'
	      - name: Install pip dependencies
	        run: pip3 install -U boto3 awscli
	      - name: Install SageMaker Python SDK
	        working-directory: tests/integration
	        # Both arguments are real `${{ }}` expressions (no space after `$`).
	        # They are left unquoted so that empty inputs expand to no argument;
	        # NOTE(review): confirm the script's defaults when called without args.
	        run: |
	          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
	      - name: Configure AWS Credentials
	        uses: aws-actions/configure-aws-credentials@v2
	        with:
	          aws-region: us-west-2
	      - name: Test gpt-j-6b
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30
	      - name: Test gpt-neo-2.7b no code DeepSpeed
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py gpt-neo-2-7-b no_code ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30
	      - name: Test DeepSpeed pythia-12b
	        if: success() || failure()
	        working-directory: tests/integration
	        run: |
	          python3 llm/sagemaker-endpoint-tests.py pythia-12b djl ${image_type} ${run_benchmark}
	          echo "sleep 30 seconds to allow endpoint deletion"
	          sleep 30

	  # Teardown: always runs, whatever happened to the jobs it depends on.
	  stop-runners:
	    if: always()
	    runs-on: [self-hosted, scheduler]
	    needs: [create-runners, endpoint-tests-group-1, endpoint-tests-group-2]
	    steps:
	      - name: Cleanup dangling SageMaker resources
	        run: |
	          cd /home/ubuntu/djl_benchmark_script/scripts
	          ./cleanup_sagemaker_resources.sh sm-integration-test us-west-2
	      - name: Stop all instances
	        # Must run even if the cleanup step above failed, otherwise the
	        # runner instances would be left running.
	        if: always()
	        run: |
	          cd /home/ubuntu/djl_benchmark_script/scripts
	          # Attempt both stops before reporting failure so an error on the
	          # first instance cannot leave the second one running.
	          status=0
	          instance_id=${{ needs.create-runners.outputs.cpu_instance_id1 }}
	          ./stop_instance.sh $instance_id || status=$?
	          instance_id=${{ needs.create-runners.outputs.cpu_instance_id2 }}
	          ./stop_instance.sh $instance_id || status=$?
	          exit "$status"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SageMaker PythonSDK Integration Tests #109

Workflow file

SageMaker PythonSDK Integration Tests #109

Jobs

Run details

Workflow file for this run