Soak Testing #56
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pre-requisites:
# - AWS Account with correct permissions
# - `gh-pages` branch
# - AWS Account needs to add the LOG_GROUP_NAME defined below
name: Soak Testing

# Triggers: manual runs against a caller-chosen commit, plus a daily schedule.
on:
  workflow_dispatch:
    inputs:
      target_commit_sha:
        description: 'The commit SHA on this repo to use for the Soak Tests.'
        required: true
      test_duration_minutes:
        description: 'The duration of the Soak Tests in minutes.'
        required: true
        # Quoted: workflow_dispatch inputs are always delivered as strings.
        default: '300'
  schedule:
    # Every day at 15:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 15 * * *'

# Workflow-wide settings; exported as environment variables to every step.
env:
  # NOTE: The configuration of `APP_PROCESS_EXECUTABLE_NAME` is repo dependent
  APP_PROCESS_EXECUTABLE_NAME: java
  AWS_DEFAULT_REGION: us-east-1
  # Fallback used when no `test_duration_minutes` input is present
  # (e.g. on scheduled runs).
  DEFAULT_TEST_DURATION_MINUTES: 300
  HOSTMETRICS_INTERVAL_SECS: 600
  CPU_LOAD_THRESHOLD: 55
  TOTAL_MEMORY_THRESHOLD: 4294967296  # 4 GiB
  MAX_BENCHMARKS_TO_KEEP: 100
  # TODO: We might be able to adapt the "Soak Tests" to be "Overhead Tests".
  # This means monitoring the Sample App's performance using high levels of TPS
  # for the Load Generator over a shorter period of testing time. For example:
  # https://github.com/aws-observability/aws-otel-collector/blob/main/docs/performance_model.md
  # THROUGHPUT_PER_SECOND: TBD?
jobs:
  # Runs the soak test once per (app-platform, instrumentation-type) pair,
  # publishes metric snapshots and benchmark graphs to `gh-pages`, and files an
  # issue when a scheduled run fails.
  test_apps_and_publish_results:
    name: Soak Performance Test - (${{ matrix.app-platform }}, ${{ matrix.instrumentation-type }})
    runs-on: ubuntu-latest
    permissions:
      # Needed by the steps below that push to the `gh-pages` branch.
      contents: write
      # Presumably for OIDC-based role assumption in "Configure AWS
      # Credentials" (no static keys are passed) - TODO confirm.
      id-token: write
      # Needed by the "Publish Issue ..." steps.
      issues: write
    strategy:
      # Let the remaining matrix combinations finish even if one fails.
      fail-fast: false
      matrix:
        app-platform: [spark, spark-awssdkv1, springboot]
        instrumentation-type: [auto]
    env:
      # NOTE: The configuration of `APP_PATH` is repo dependent
      APP_PATH: sample-apps/${{ matrix.app-platform }}
      LOGS_NAMESPACE: ${{ github.repository }}/soak-tests-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
    steps:
      # MARK: - GitHub Workflow Event Type Specific Values
      - name: Use INPUT as commit SHA
        if: ${{ github.event_name == 'workflow_dispatch' }}
        env:
          # The user-supplied input is handed to the script through the
          # environment instead of being interpolated into the script text,
          # so shell metacharacters in the input cannot be executed.
          REQUESTED_TARGET_SHA: ${{ github.event.inputs.target_commit_sha }}
        run: |
          echo "TARGET_SHA=${REQUESTED_TARGET_SHA}" | tee --append "$GITHUB_ENV";
      - name: Use LATEST as commit SHA
        if: ${{ github.event_name != 'workflow_dispatch' }}
        run: |
          echo "TARGET_SHA=${{ github.sha }}" | tee --append "$GITHUB_ENV";
      - name: Configure Performance Test Duration
        env:
          # Input value when present, otherwise the workflow-level default.
          # Passed via the environment for the same reason as the commit SHA.
          REQUESTED_TEST_DURATION_MINUTES: ${{ github.event.inputs.test_duration_minutes || env.DEFAULT_TEST_DURATION_MINUTES }}
        run: |
          echo "TEST_DURATION_MINUTES=${REQUESTED_TEST_DURATION_MINUTES}" | tee --append "$GITHUB_ENV";
      - name: Clone This Repo @ ${{ env.TARGET_SHA }}
        uses: actions/checkout@v4
        with:
          ref: ${{ env.TARGET_SHA }}
      # MARK: - App-Platform Specific Values
      # NOTE: The configuration of `APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE` is
      # repo dependent
      - name: Configure Spark & Spark-awssdkv1 Specific Values
        if: ${{ matrix.app-platform == 'spark' ||
          matrix.app-platform == 'spark-awssdkv1' }}
        # The `NAME<<EOF ... EOF` form is the GITHUB_ENV delimiter syntax; it
        # stores the command line verbatim as the variable's value.
        run: |
          echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> "$GITHUB_ENV"
          echo 'java -cp /app/resources:/app/classes:/app/libs/* com.amazon.sampleapp.App' |
            tee --append "$GITHUB_ENV";
          echo 'EOF' >> "$GITHUB_ENV"
          echo 'LISTEN_ADDRESS_PORT=8080' | tee --append "$GITHUB_ENV";
      - name: Configure Springboot Specific Values
        if: ${{ matrix.app-platform == 'springboot' }}
        run: |
          echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> "$GITHUB_ENV"
          echo 'java -cp /app/resources:/app/classes:/app/libs/* com.amazon.sampleapp.DemoApplication' |
            tee --append "$GITHUB_ENV";
          echo 'EOF' >> "$GITHUB_ENV"
          echo 'LISTEN_ADDRESS_PORT=4567' | tee --append "$GITHUB_ENV";
      # MARK: - Uniquely identify this Sample App environment
      - name: Create unique combination using matrix + commit parameters
        # TARGET_SHA is read as a shell variable (it was exported through
        # GITHUB_ENV above) because it may originate from user input.
        run: |
          echo "MATRIX_COMMIT_COMBO=${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}-${TARGET_SHA}" | tee --append "$GITHUB_ENV";
      # MARK: - Run Performance Tests
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
          role-duration-seconds: 21600  # 6 Hours
          aws-region: ${{ env.AWS_DEFAULT_REGION }}
      # NOTE: We only login to prevent getting throttled for too many docker
      # pulls. We do not publish anything to ECR.
      - name: Login to ECR
        run: |
          aws ecr-public get-login-password |
            docker login --username AWS --password-stdin public.ecr.aws
      - name: Build Sample App locally directly to the Docker daemon
        uses: burrunan/gradle-cache-action@v1
        with:
          arguments: jibDockerBuild
        env:
          # Use the commit that was actually checked out. APP_IMAGE below is
          # tagged with TARGET_SHA, so a manual run against a commit other
          # than `github.sha` would otherwise not pick up this local build.
          # NOTE(review): assumes the Gradle jib config tags the image with
          # COMMIT_HASH - confirm against the build scripts.
          COMMIT_HASH: ${{ env.TARGET_SHA }}
      - name: Configure Performance Test environment variables
        run: |
          echo "NUM_OF_CPUS=$(nproc --all)" | tee --append "$GITHUB_ENV";
      - name: Run All Docker Containers - Sample App + OTel Collector + Load Generator + Alarm Poller
        id: check-failure-during-performance-tests
        # A failure here is recorded in the step outcome and reported by the
        # later "Publish Issue" / "Check for Performance Degradation" steps.
        continue-on-error: true
        working-directory: .github/docker-performance-tests
        env:
          APP_IMAGE: public.ecr.aws/aws-otel-test/aws-otel-java-${{ matrix.app-platform }}:${{ env.TARGET_SHA }}
          INSTANCE_ID: ${{ github.run_id }}-${{ github.run_number }}
          LISTEN_ADDRESS_PORT: ${{ env.LISTEN_ADDRESS_PORT }}
          LOG_GROUP_NAME: otel-sdk-performance-tests
          # Also uses:
          # AWS_ACCESS_KEY_ID
          # AWS_SECRET_ACCESS_KEY
          # AWS_SESSION_TOKEN
          # TARGET_SHA
          # LOGS_NAMESPACE
          # APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE
          # APP_PROCESS_EXECUTABLE_NAME
          # HOSTMETRICS_INTERVAL_SECS
          # TEST_DURATION_MINUTES
          # NUM_OF_CPUS
          # CPU_LOAD_THRESHOLD
          # TOTAL_MEMORY_THRESHOLD
          # AWS_DEFAULT_REGION
          # MATRIX_COMMIT_COMBO
          # GITHUB_RUN_ID
        # The step's exit code is the exit code of the alarms-poller
        # container. The name filter accepts both `_` (Compose v1 naming) and
        # `-` (Compose v2 naming) between project and service name.
        run: |-
          docker compose up --build;
          RUN_TESTS_EXIT_CODE=$(
            docker inspect $(
              docker ps --quiet --all --filter "name=docker-performance-tests[-_]alarms-poller"
            ) --format="{{.State.ExitCode}}"
          );
          echo "RUN_TESTS_EXIT_CODE=$RUN_TESTS_EXIT_CODE" | tee --append "$GITHUB_ENV";
          exit $RUN_TESTS_EXIT_CODE;
      - name: Fail early if Soak Tests failed to start
        # Empty means the exit code was never recorded by the previous step.
        if: ${{ env.RUN_TESTS_EXIT_CODE == '' || env.RUN_TESTS_EXIT_CODE == '1' }}
        run: exit 1;
      # MARK: - Report on Performance Test Results
      - name: Install script dependencies
        run: pip install boto3
      # NOTE(review): every matrix job pushes to `gh-pages`; if two jobs reach
      # `git push` at the same time the later push can be rejected.
      # NOTE(review): the final `git checkout main` means later steps run the
      # scripts from `main`, not from TARGET_SHA - confirm this is intended.
      - name: Get a snapshot of metrics and commit them to the repository
        run: |
          python3 .github/scripts/performance-tests/produce_metric_widget_images.py \
            --logs-namespace ${{ env.LOGS_NAMESPACE }} \
            --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }} \
            --num-of-cpus ${{ env.NUM_OF_CPUS }} \
            --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}" \
            --target-sha "${TARGET_SHA}" \
            --github-run-id ${GITHUB_RUN_ID} \
            --test-duration-minutes ${{ env.TEST_DURATION_MINUTES }} \
            --cpu-load-threshold ${{ env.CPU_LOAD_THRESHOLD }} \
            --total-memory-threshold ${{ env.TOTAL_MEMORY_THRESHOLD }} \
            --app-platform ${{ matrix.app-platform }} \
            --instrumentation-type ${{ matrix.instrumentation-type }} \
            --max-benchmarks-to-keep ${{ env.MAX_BENCHMARKS_TO_KEEP }} \
            --github-repository ${GITHUB_REPOSITORY}
          echo "::warning::Checkout Snapshots at this link: https://github.com/${GITHUB_REPOSITORY}/blob/gh-pages/soak-tests/snapshots/commits/${TARGET_SHA}/runs/${GITHUB_RUN_ID}/${{ matrix.app-platform }}";
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com";
          git config user.name "GitHub Actions";
          git fetch;
          git checkout gh-pages;
          git add soak-tests/snapshots/commits;
          git commit -m "Soak Test Snapshots from ${TARGET_SHA} - ${GITHUB_RUN_ID}";
          git push;
          git checkout main;
      - name: Prepare Performance Test results as JSON output
        # Folded scalar: the lines below are joined into one command line.
        run: >-
          python3 .github/scripts/performance-tests/get-metric-data/produce_performance_test_results.py
          --logs-namespace ${{ env.LOGS_NAMESPACE }}
          --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }}
          --num-of-cpus ${{ env.NUM_OF_CPUS }}
          --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}"
          --target-sha "${TARGET_SHA}"
          --github-run-id ${GITHUB_RUN_ID}
          --test-duration-minutes ${{ env.TEST_DURATION_MINUTES }}
      - name: Do we already have Performance Test graph results for this commit?
        # Succeeds when an entry for this commit already exists and fails
        # otherwise; the outcome gates `auto-push` in the next step.
        continue-on-error: true
        id: check-already-have-performance-results
        run: |
          git checkout gh-pages;
          HAS_RESULTS_ALREADY=$(
            sed 's/window.BENCHMARK_DATA = //' soak-tests/per-commit-overall-results/data.js |
            jq "
              .entries |
              .\"Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}\" |
              any(.commit.id == \"${TARGET_SHA}\")
            " || echo false
          );
          git checkout main;
          [[ $HAS_RESULTS_ALREADY == true ]]
      - name: Graph and Report Performance Test Averages result
        uses: benchmark-action/github-action-benchmark@v1
        continue-on-error: true
        id: check-failure-after-performance-tests
        with:
          name: Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
          tool: customSmallerIsBetter
          output-file-path: output.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          max-items-in-chart: ${{ env.MAX_BENCHMARKS_TO_KEEP }}
          alert-threshold: '175%'
          # Does not work as expected, see:
          # https://github.com/open-telemetry/opentelemetry-python/pull/1478
          # comment-always: true
          fail-on-alert: true
          # Only publish for scheduled runs on `main` that do not yet have
          # results recorded for this commit.
          auto-push: ${{ github.event_name == 'schedule' &&
            steps.check-already-have-performance-results.outcome == 'failure' &&
            github.ref == 'refs/heads/main' }}
          gh-pages-branch: gh-pages
          benchmark-data-dir-path: soak-tests/per-commit-overall-results
      - name: Publish Issue if failed DURING Performance Tests
        uses: JasonEtco/create-an-issue@v2
        if: ${{ github.event_name == 'schedule' &&
          steps.check-failure-during-performance-tests.outcome == 'failure' }}
        env:
          APP_PLATFORM: ${{ matrix.app-platform }}
          INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/auto-issue-templates/failure-during-soak_tests.md
          update_existing: true
      - name: Publish Issue if failed AFTER Performance Tests
        uses: JasonEtco/create-an-issue@v2
        if: ${{ github.event_name == 'schedule' &&
          steps.check-failure-after-performance-tests.outcome == 'failure' }}
        env:
          APP_PLATFORM: ${{ matrix.app-platform }}
          INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/auto-issue-templates/failure-after-soak_tests.md
          update_existing: true
      # Both test steps use `continue-on-error`, so this step is what finally
      # marks the job as failed when either of them failed.
      - name: Check for Performance Degradation either DURING or AFTER Performance Tests
        if: ${{ steps.check-failure-during-performance-tests.outcome == 'failure' ||
          steps.check-failure-after-performance-tests.outcome == 'failure' }}
        run: >-
          echo 'Performance Tests failed, see the logs above for details';
          exit 1;