# Prerequisites:
# - An AWS account whose assumable role (secrets.AWS_ASSUME_ROLE_ARN) has the
#   required CloudWatch and ECR Public permissions
# - A `gh-pages` branch
# - The AWS account must contain the CloudWatch log group named by
#   LOG_GROUP_NAME below
name: Soak Testing
on:
workflow_dispatch:
inputs:
target_commit_sha:
description: 'The commit SHA on this repo to use for the Soak Tests.'
required: true
test_duration_minutes:
description: 'The duration of the Soak Tests in minutes.'
required: true
        default: '300'
schedule:
- cron: '0 15 * * *'
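    # NOTE: Scheduled runs fire once a day at 15:00 UTC.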
env:
# NOTE: The configuration of `APP_PROCESS_EXECUTABLE_NAME` is repo dependent
APP_PROCESS_EXECUTABLE_NAME: java
AWS_DEFAULT_REGION: us-east-1
DEFAULT_TEST_DURATION_MINUTES: 300
HOSTMETRICS_INTERVAL_SECS: 600
CPU_LOAD_THRESHOLD: 55
TOTAL_MEMORY_THRESHOLD: 4294967296 # 4 GiB
MAX_BENCHMARKS_TO_KEEP: 100
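# NOTE: CPU_LOAD_THRESHOLD is a percentage of total CPU and
# TOTAL_MEMORY_THRESHOLD is in bytes; both presumably back the alarms that
# the poller container watches while the test runs.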
# TODO: We might be able to adapt these "Soak Tests" into "Overhead Tests":
# monitor the Sample App's performance while the Load Generator drives a high
# TPS over a much shorter test window. For example:
# https://github.com/aws-observability/aws-otel-collector/blob/main/docs/performance_model.md
# THROUGHPUT_PER_SECOND: TBD?
jobs:
test_apps_and_publish_results:
name: Soak Performance Test - (${{ matrix.app-platform }}, ${{ matrix.instrumentation-type }})
runs-on: ubuntu-latest
permissions:
contents: write
id-token: write
issues: write
strategy:
fail-fast: false
matrix:
app-platform: [ spark, spark-awssdkv1, springboot ]
instrumentation-type: [ auto ]
env:
# NOTE: The configuration of `APP_PATH` is repo dependent
APP_PATH: sample-apps/${{ matrix.app-platform }}
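      # Each matrix combination publishes under its own namespace so parallel
      # jobs' metrics and logs do not mix.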
LOGS_NAMESPACE: ${{ github.repository }}/soak-tests-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
steps:
# MARK: - GitHub Workflow Event Type Specific Values
- name: Use INPUT as commit SHA
if: ${{ github.event_name == 'workflow_dispatch' }}
run: |
echo "TARGET_SHA=${{ github.event.inputs.target_commit_sha }}" | tee --append $GITHUB_ENV;
- name: Use LATEST as commit SHA
if: ${{ github.event_name != 'workflow_dispatch' }}
run: |
echo "TARGET_SHA=${{ github.sha }}" | tee --append $GITHUB_ENV;
- name: Configure Performance Test Duration
run: |
echo "TEST_DURATION_MINUTES=${{ github.event.inputs.test_duration_minutes || env.DEFAULT_TEST_DURATION_MINUTES }}" | tee --append $GITHUB_ENV;
- name: Clone This Repo @ ${{ env.TARGET_SHA }}
uses: actions/checkout@v4
with:
ref: ${{ env.TARGET_SHA }}
# MARK: - App-Platform Specific Values
# NOTE: The configuration of `APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE` is
# repo dependent
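      # NOTE: The `name<<EOF ... EOF` lines below use GitHub's heredoc-style
      # syntax for writing a multi-line value into $GITHUB_ENV.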
- name: Configure Spark & Spark-awssdkv1 Specific Values
if: ${{ matrix.app-platform == 'spark' ||
matrix.app-platform == 'spark-awssdkv1' }}
run: |
echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> $GITHUB_ENV
echo 'java -cp /app/resources:/app/classes:/app/libs/* com.amazon.sampleapp.App' |
tee --append $GITHUB_ENV;
echo 'EOF' >> $GITHUB_ENV
echo 'LISTEN_ADDRESS_PORT=8080' | tee --append $GITHUB_ENV;
- name: Configure Springboot Specific Values
if: ${{ matrix.app-platform == 'springboot' }}
run: |
echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> $GITHUB_ENV
echo 'java -cp /app/resources:/app/classes:/app/libs/* com.amazon.sampleapp.DemoApplication' |
tee --append $GITHUB_ENV;
echo 'EOF' >> $GITHUB_ENV
echo 'LISTEN_ADDRESS_PORT=4567' | tee --append $GITHUB_ENV;
# MARK: - Uniquely identify this Sample App environment
- name: Create unique combination using matrix + commit parameters
run: |
echo "MATRIX_COMMIT_COMBO=${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}-${{ env.TARGET_SHA }}" | tee --append $GITHUB_ENV;
# MARK: - Run Performance Tests
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
role-duration-seconds: 21600 # 6 Hours
aws-region: ${{ env.AWS_DEFAULT_REGION }}
      # NOTE: We log in only to avoid Docker pull-rate throttling; nothing is
      # published to ECR.
- name: Login to ECR
run: |
aws ecr-public get-login-password |
docker login --username AWS --password-stdin public.ecr.aws
- name: Build Sample App locally directly to the Docker daemon
uses: burrunan/gradle-cache-action@v1
with:
arguments: jibDockerBuild
env:
COMMIT_HASH: ${{ github.sha }}
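      # NOTE: `jibDockerBuild` is Jib's Gradle task that builds the image
      # directly into the local Docker daemon, with no Dockerfile or registry
      # push involved.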
- name: Configure Performance Test environment variables
run: |
echo "NUM_OF_CPUS=$(nproc --all)" | tee --append $GITHUB_ENV;
- name: Run All Docker Containers - Sample App + OTel Collector + Load Generator + Alarm Poller
id: check-failure-during-performance-tests
continue-on-error: true
working-directory: .github/docker-performance-tests
env:
APP_IMAGE: public.ecr.aws/aws-otel-test/aws-otel-java-${{ matrix.app-platform }}:${{ env.TARGET_SHA }}
INSTANCE_ID: ${{ github.run_id }}-${{ github.run_number }}
LISTEN_ADDRESS_PORT: ${{ env.LISTEN_ADDRESS_PORT }}
LOG_GROUP_NAME: otel-sdk-performance-tests
# Also uses:
# AWS_ACCESS_KEY_ID
# AWS_SECRET_ACCESS_KEY
# AWS_SESSION_TOKEN
# TARGET_SHA
# LOGS_NAMESPACE
# APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE
# APP_PROCESS_EXECUTABLE_NAME
# HOSTMETRICS_INTERVAL_SECS
# TEST_DURATION_MINUTES
# NUM_OF_CPUS
# CPU_LOAD_THRESHOLD
# TOTAL_MEMORY_THRESHOLD
# AWS_DEFAULT_REGION
# MATRIX_COMMIT_COMBO
# GITHUB_RUN_ID
run: |-
docker compose up --build;
RUN_TESTS_EXIT_CODE=$(
docker inspect $(
docker ps --quiet --all --filter "name=docker-performance-tests_alarms-poller"
) --format="{{.State.ExitCode}}"
);
echo "RUN_TESTS_EXIT_CODE=$RUN_TESTS_EXIT_CODE" | tee --append $GITHUB_ENV;
exit $RUN_TESTS_EXIT_CODE;
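      # NOTE: Without `--exit-code-from`, `docker compose up` exits 0 even if a
      # container fails, so the step above reads the alarm poller's exit code
      # back via `docker inspect` and re-raises it; `continue-on-error: true`
      # lets the steps below report the outcome instead of aborting the job.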
- name: Fail early if Soak Tests failed to start
if: ${{ env.RUN_TESTS_EXIT_CODE == '' || env.RUN_TESTS_EXIT_CODE == 1 }}
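        # An empty RUN_TESTS_EXIT_CODE means the compose step died before the
        # exit code could be read; 1 is treated as "the tests never started".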
run: exit 1;
# MARK: - Report on Performance Test Results
- name: Install script dependencies
run: pip install boto3
- name: Get a snapshot of metrics and commit them to the repository
run: |
python3 .github/scripts/performance-tests/produce_metric_widget_images.py \
--logs-namespace ${{ env.LOGS_NAMESPACE }} \
--metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }} \
--num-of-cpus ${{ env.NUM_OF_CPUS }} \
--app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}" \
--target-sha ${{ env.TARGET_SHA }} \
--github-run-id ${GITHUB_RUN_ID} \
--test-duration-minutes ${{ env.TEST_DURATION_MINUTES }} \
--cpu-load-threshold ${{ env.CPU_LOAD_THRESHOLD }} \
--total-memory-threshold ${{ env.TOTAL_MEMORY_THRESHOLD }} \
--app-platform ${{ matrix.app-platform }} \
--instrumentation-type ${{ matrix.instrumentation-type }} \
--max-benchmarks-to-keep ${{ env.MAX_BENCHMARKS_TO_KEEP }} \
--github-repository ${GITHUB_REPOSITORY}
echo "::warning::Checkout Snapshots at this link: https://github.com/${GITHUB_REPOSITORY}/blob/gh-pages/soak-tests/snapshots/commits/${{ env.TARGET_SHA }}/runs/${GITHUB_RUN_ID}/${{ matrix.app-platform }}";
git config user.email "41898282+github-actions[bot]@users.noreply.github.com";
git config user.name "GitHub Actions";
git fetch;
git checkout gh-pages;
git add soak-tests/snapshots/commits;
git commit -m "Soak Test Snapshots from ${{ env.TARGET_SHA }} - ${GITHUB_RUN_ID}";
git push;
git checkout main;
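      # NOTE: The metric widget images are committed to gh-pages so they stay
      # browsable at the URL printed in the warning above.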
- name: Prepare Performance Test results as JSON output
run: python3 .github/scripts/performance-tests/get-metric-data/produce_performance_test_results.py
--logs-namespace ${{ env.LOGS_NAMESPACE }}
--metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }}
--num-of-cpus ${{ env.NUM_OF_CPUS }}
--app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}"
--target-sha ${{ env.TARGET_SHA }}
--github-run-id ${GITHUB_RUN_ID}
--test-duration-minutes ${{ env.TEST_DURATION_MINUTES }}
- name: Do we already have Performance Test graph results for this commit?
continue-on-error: true
id: check-already-have-performance-results
run: |
git checkout gh-pages;
HAS_RESULTS_ALREADY=$(
sed 's/window.BENCHMARK_DATA = //' soak-tests/per-commit-overall-results/data.js |
jq "
.entries |
.\"Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}\" |
any(.commit.id == \"${{ env.TARGET_SHA }}\")
" || echo false
);
git checkout main;
[[ $HAS_RESULTS_ALREADY == true ]]
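      # NOTE: data.js is the file github-action-benchmark publishes as
      # `window.BENCHMARK_DATA = {...}`; sed strips the assignment so jq can
      # check whether this commit already has an entry. The final `[[ ... ]]`
      # makes the step "fail" exactly when no results exist yet.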
- name: Graph and Report Performance Test Averages result
uses: benchmark-action/github-action-benchmark@v1
continue-on-error: true
id: check-failure-after-performance-tests
with:
name: Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
tool: customSmallerIsBetter
output-file-path: output.json
github-token: ${{ secrets.GITHUB_TOKEN }}
max-items-in-chart: ${{ env.MAX_BENCHMARKS_TO_KEEP }}
alert-threshold: 175%
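          # An alert fires (and, with fail-on-alert, the step fails) when a
          # metric is more than 1.75x worse than the previous recorded run.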
# Does not work as expected, see:
# https://github.com/open-telemetry/opentelemetry-python/pull/1478
# comment-always: true
fail-on-alert: true
auto-push: ${{ github.event_name == 'schedule' &&
steps.check-already-have-performance-results.outcome == 'failure' &&
github.ref == 'refs/heads/main' }}
gh-pages-branch: gh-pages
benchmark-data-dir-path: soak-tests/per-commit-overall-results
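      # NOTE: Results are auto-pushed to gh-pages only for scheduled runs on
      # main whose commit has no results yet, i.e. when the lookup step above
      # has outcome == 'failure'.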
- name: Publish Issue if failed DURING Performance Tests
uses: JasonEtco/create-an-issue@v2
if: ${{ github.event_name == 'schedule' &&
steps.check-failure-during-performance-tests.outcome == 'failure' }}
env:
APP_PLATFORM: ${{ matrix.app-platform }}
INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/auto-issue-templates/failure-during-soak_tests.md
update_existing: true
- name: Publish Issue if failed AFTER Performance Tests
uses: JasonEtco/create-an-issue@v2
if: ${{ github.event_name == 'schedule' &&
steps.check-failure-after-performance-tests.outcome == 'failure' }}
env:
APP_PLATFORM: ${{ matrix.app-platform }}
INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/auto-issue-templates/failure-after-soak_tests.md
update_existing: true
- name: Check for Performance Degradation either DURING or AFTER Performance Tests
if: ${{ steps.check-failure-during-performance-tests.outcome == 'failure' ||
steps.check-failure-after-performance-tests.outcome == 'failure' }}
run: >-
echo 'Performance Tests failed, see the logs above for details';
exit 1;