From fb9332b3671c250552cf7fa7cbd0a2572ab72b79 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Mon, 7 Oct 2024 17:07:42 +0000 Subject: [PATCH 1/3] Add region set options to more tests. Give test_live_repair, test_replace_special, and test_replay an option to indicate how many region sets should be created for the test. This allows us to use the same script to test single and multiple sub-volumes. Updated a few of the tests to no longer run a smaller test in a loop, but instead just run a longer test. This makes better sense when testing many sub-volumes as a smaller test might not cover everything. It has the side effect of not printing regular status messages, but if we find that is important we can come up with another way of displaying test progress. --- tools/test_live_repair.sh | 120 +++++++++++++++++----------------- tools/test_replace_special.sh | 102 ++++++++++++----------------- tools/test_replay.sh | 20 ++++-- 3 files changed, 115 insertions(+), 127 deletions(-) diff --git a/tools/test_live_repair.sh b/tools/test_live_repair.sh index 468cad9d6..10ce4a330 100755 --- a/tools/test_live_repair.sh +++ b/tools/test_live_repair.sh @@ -18,6 +18,7 @@ trap ctrl_c INT function ctrl_c() { echo "Stopping at your request" ${dsc} cmd shutdown + exit 1 } REGION_ROOT=${REGION_ROOT:-/var/tmp/test_live_repair} @@ -44,16 +45,20 @@ if [[ ! -f "$crucible_test" ]] || [[ ! -f "$dsc" ]] || [[ ! 
-f "$downstairs" ]]; fi loops=5 +region_sets=1 usage () { - echo "Usage: $0 [-l #]]" >&2 - echo " -l loops Number of test loops to perform (default 5)" >&2 + echo "Usage: $0 [-l #] [-r #]" >&2 + echo " -l loops Number of replacement loops to perform (default 5)" >&2 + echo " -r region_sets Number of region sets to create (default 1)" >&2 } -while getopts 'l:' opt; do +while getopts 'l:r:' opt; do case "$opt" in l) loops=$OPTARG ;; + r) region_sets=$OPTARG + ;; *) echo "Invalid option" usage exit 1 @@ -61,28 +66,40 @@ while getopts 'l:' opt; do esac done +((region_count=region_sets*3)) +((region_count+=1)) echo "" > "$loop_log" echo "" > "$test_log" echo "starting $(date)" | tee "$loop_log" echo "Tail $test_log for test output" -# NOTE: we are creating a single region set here plus one more region to be -# used by the replacement, and with the assumption that # the default ports -# will be used (8810, 8820, 8830). The test relies on that because we use the -# fourth region-dir for our "replacement". If you change # the number of -# regions, you must also adjust the replacement below. +# No real data was used to come up with these numbers. If you have some data +# then feel free to change things. +if [[ $region_sets -eq 1 ]]; then + extent_size=3000 +elif [[ $region_sets -eq 2 ]]; then + extent_size=1500 +elif [[ $region_sets -eq 3 ]]; then + extent_size=750 +else + extent_size=500 +fi + +# NOTE: we create the requested number of regions here plus one more region to +# be used by the replace test. We can use dsc to determine what the port will +# be for the final region. if ! 
${dsc} create --cleanup \ --region-dir "$REGION_ROOT" \ - --region-count 4 \ + --region-count "$region_count" \ --ds-bin "$downstairs" \ - --extent-size 4000 \ + --extent-size "$extent_size" \ --extent-count 200 >> "$test_log"; then echo "Failed to create downstairs regions" exit 1 fi ${dsc} start --ds-bin "$downstairs" \ --region-dir "$REGION_ROOT" \ - --region-count 4 >> "$test_log" 2>&1 & + --region-count "$region_count" >> "$test_log" 2>&1 & dsc_pid=$! sleep 5 if ! ps -p $dsc_pid > /dev/null; then @@ -91,63 +108,44 @@ if ! ps -p $dsc_pid > /dev/null; then fi gen=1 -# Initial seed for verify file +# Seed the initial volume +echo "$(date) Begin pretest initial fill" | tee -a "$test_log" if ! "$crucible_test" fill --dsc 127.0.0.1:9998 -q -g "$gen"\ - --verify-out "$verify_log" >> "$test_log" 2>&1 ; then - echo Failed on initial verify seed, check "$test_log" + --skip-verify >> "$test_log" 2>&1 ; then + echo Failed on initial fill, check "$test_log" ${dsc} cmd shutdown exit 1 fi (( gen += 1 )) -# Now run the crutest replace test in a loop -count=1 -while [[ $count -le $loops ]]; do - SECONDS=0 - cp "$test_log" "$test_log".last - echo "" > "$test_log" - echo "New loop, $count starts now $(date)" >> "$test_log" - "$crucible_test" replace -c 5 \ - --dsc 127.0.0.1:9998 \ - --replacement 127.0.0.1:8840 \ - --stable -g "$gen" --verify-out "$verify_log" \ - --verify-at-start \ - --verify-in "$verify_log" >> "$test_log" 2>&1 - result=$? 
- if [[ $result -ne 0 ]]; then - touch /var/tmp/ds_test/up 2> /dev/null - (( err += 1 )) - duration=$SECONDS - printf "[%03d] Error $result after %d:%02d\n" "$count" \ - $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" - mv "$test_log" "$test_log".lastfail - break - fi - duration=$SECONDS - (( gen += 1 )) - (( pass_total += 1 )) - (( total += duration )) - ave=$(( total / pass_total )) - printf "[%03d/%03d] %d:%02d ave:%d:%02d total:%d:%02d errors:%d \ -last_run_seconds:%d\n" \ - "$count" "$loops" \ - $((duration / 60)) $((duration % 60)) \ - $((ave / 60)) $((ave % 60)) \ - $((total / 60)) $((total % 60)) \ - "$err" $duration | tee -a "$loop_log" - (( count += 1 )) +# Figure out the port of the last dsc client, this is what we will use for the +# replacement address. +((last_client=region_count - 1)) +replacement_port=$(${dsc} cmd port -c $last_client) + +# Now run the crutest replace test +SECONDS=0 +cp "$test_log" "$test_log".last +echo "" > "$test_log" +echo "$(date) Replacement test starts now" | tee -a "$test_log" +"$crucible_test" replace -c "$loops" \ + --dsc 127.0.0.1:9998 \ + --replacement 127.0.0.1:"$replacement_port" \ + --stable -g "$gen" >> "$test_log" 2>&1 +result=$? 
+duration=$SECONDS +if [[ $result -ne 0 ]]; then + printf "Error $result after %d:%02d\n" \ + $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" + cp "$test_log" "$test_log".lastfail + echo "See ${test_log}.lastfail for more info" +else + printf "Test took: %d:%02d\n" \ + $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" +fi -done ${dsc} cmd shutdown wait "$dsc_pid" -sleep 4 -echo "Final results:" | tee -a "$loop_log" -printf "[%03d] %d:%02d ave:%d:%02d total:%d:%02d errors:%d last_run_seconds:%d\n" \ - "$count" \ - $((duration / 60)) $((duration % 60)) \ - $((ave / 60)) $((ave % 60)) \ - $((total / 60)) $((total % 60)) \ - "$err" $duration | tee -a "$loop_log" -echo "$(date) Test ends with $err" >> "$test_log" 2>&1 -exit "$err" +echo "$(date) Test ends with $result" | tee -a "$test_log" +exit $result diff --git a/tools/test_replace_special.sh b/tools/test_replace_special.sh index 128ef7712..cd9bd7b5d 100755 --- a/tools/test_replace_special.sh +++ b/tools/test_replace_special.sh @@ -14,6 +14,7 @@ trap ctrl_c INT function ctrl_c() { echo "Stopping at your request" ${dsc} cmd shutdown + exit 1 } REGION_ROOT=${REGION_ROOT:-/var/tmp/test_replace_special} @@ -41,16 +42,20 @@ if [[ ! -f "$crucible_test" ]] || [[ ! -f "$dsc" ]] || [[ ! 
-f "$downstairs" ]]; fi loops=5 +region_sets=1 usage () { - echo "Usage: $0 [-l #]]" >&2 - echo " -l loops Number of test loops to perform (default 5)" >&2 + echo "Usage: $0 [-l #] [-r #]" >&2 + echo " -l loops Number of test loops to perform (default 5)" >&2 + echo " -r region_sets Number of region sets to create (default 1)" >&2 } -while getopts 'l:' opt; do +while getopts 'l:r:' opt; do case "$opt" in l) loops=$OPTARG ;; + r) region_sets=$OPTARG + ;; *) echo "Invalid option" usage exit 1 @@ -58,19 +63,19 @@ while getopts 'l:' opt; do esac done +((region_count=region_sets*3)) +((region_count+=1)) echo "" > "$loop_log" echo "" > "$test_log" echo "starting $(date)" | tee "$loop_log" echo "Tail $test_log for test output" -# NOTE: we are creating a single region set here plus one more region to be -# used by the replacement, and with the assumption that # the default ports -# will be used (8810, 8820, 8830). The test relies on that # because we use -# the fourth region-dir for our "replacement". If you change # the number of -# regions, you must also adjust the replacement below. +# NOTE: We create the requested number of regions here plus one more region +# to be used for replacement. We can use dsc to determine what the port will +# be for the final region. if ! ${dsc} create --cleanup \ --region-dir "$REGION_ROOT" \ - --region-count 4 \ + --region-count "$region_count" \ --ds-bin "$downstairs" \ --extent-count 400 \ --block-size 4096 >> "$test_log"; then @@ -79,7 +84,7 @@ if ! ${dsc} create --cleanup \ fi ${dsc} start --ds-bin "$downstairs" \ --region-dir "$REGION_ROOT" \ - --region-count 4 >> "$test_log" 2>&1 & + --region-count "$region_count" >> "$test_log" 2>&1 & dsc_pid=$! sleep 5 if ! ps -p $dsc_pid > /dev/null; then @@ -90,62 +95,39 @@ fi gen=1 # Initial seed for verify file if ! 
"$crucible_test" fill --dsc 127.0.0.1:9998 -q -g "$gen"\ - --verify-out "$verify_log" >> "$test_log" 2>&1 ; then - echo Failed on initial verify seed, check "$test_log" + --skip-verify >> "$test_log" 2>&1 ; then + echo Failed on initial fill, check "$test_log" ${dsc} cmd shutdown exit 1 fi (( gen += 1 )) -# Now run the crutest replace-reconcole test in a loop -count=1 -while [[ $count -le $loops ]]; do - SECONDS=0 - cp "$test_log" "$test_log".last - echo "" > "$test_log" - echo "New loop, $count starts now $(date)" >> "$test_log" - "$crucible_test" replace-reconcile -c 5 \ - --dsc 127.0.0.1:9998 \ - --replacement 127.0.0.1:8840 \ - --stable -g "$gen" --verify-out "$verify_log" \ - --verify-at-start \ - --verify-in "$verify_log" >> "$test_log" 2>&1 - result=$? - if [[ $result -ne 0 ]]; then - touch /var/tmp/ds_test/up 2> /dev/null - (( err += 1 )) - duration=$SECONDS - printf "[%03d] Error $result after %d:%02d\n" "$count" \ - $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" - mv "$test_log" "$test_log".lastfail - break - fi - duration=$SECONDS - # Gen should grow by at least the `-c` from crutest - (( gen += 10 )) - (( pass_total += 1 )) - (( total += duration )) - ave=$(( total / pass_total )) - printf "[%03d/%03d] %d:%02d ave:%d:%02d total:%d:%02d errors:%d \ - last_run_seconds:%d\n" \ - "$count" "$loops" \ - $((duration / 60)) $((duration % 60)) \ - $((ave / 60)) $((ave % 60)) \ - $((total / 60)) $((total % 60)) \ - "$err" $duration | tee -a "$loop_log" - (( count += 1 )) +# Figure out the port of the last dsc client, this is what we will use for the +# replacement address. 
+((last_client=region_count - 1)) +replacement_port=$(${dsc} cmd port -c $last_client) + +# Now run the crutest replace-reconcile test +SECONDS=0 +cp "$test_log" "$test_log".last +echo "" > "$test_log" +echo "$(date) replace-reconcile starts now" | tee -a "$test_log" +"$crucible_test" replace-reconcile -c "$loops" --dsc 127.0.0.1:9998 \ + --replacement 127.0.0.1:"$replacement_port" \ + --stable -g "$gen" >> "$test_log" 2>&1 +result=$? +duration=$SECONDS +if [[ $result -ne 0 ]]; then + printf "Error $result after %d:%02d\n" \ + $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" + cp "$test_log" "$test_log".lastfail +else + printf "Test took %d:%02d\n" \ + $((duration / 60)) $((duration % 60)) | tee -a "$loop_log" +fi -done ${dsc} cmd shutdown wait "$dsc_pid" -sleep 4 -echo "Final results:" | tee -a "$loop_log" -printf "[%03d] %d:%02d ave:%d:%02d total:%d:%02d errors:%d last_run_seconds:%d\n" \ - "$count" \ - $((duration / 60)) $((duration % 60)) \ - $((ave / 60)) $((ave % 60)) \ - $((total / 60)) $((total % 60)) \ - "$err" $duration | tee -a "$loop_log" -echo "$(date) Test ends with $err" >> "$test_log" 2>&1 -exit "$err" +echo "$(date) Test ends with $result" | tee -a "$test_log" +exit $result diff --git a/tools/test_replay.sh b/tools/test_replay.sh index e6bceb929..2a777e844 100755 --- a/tools/test_replay.sh +++ b/tools/test_replay.sh @@ -12,6 +12,7 @@ trap ctrl_c INT function ctrl_c() { echo "Stopping at your request" ${dsc} cmd shutdown + exit 1 } WORK_ROOT=${WORK_ROOT:-/tmp} @@ -33,15 +34,21 @@ if [[ ! -f "$crucible_test" ]] || [[ ! -f "$dsc" ]] || [[ ! -f "$downstairs" ]]; fi loops=30 +region_sets=1 usage () { echo "Usage: $0 [-l #]]" >&2 - echo " -l loops Number of times to cause a replay." >&2 + echo " -l loops Number of times to cause a replay." 
>&2 + echo " -r regions Number of region sets to create (default 1)" >&2 } -while getopts 'l:' opt; do +while getopts 'l:r:' opt; do case "$opt" in l) loops=$OPTARG + echo "Set loops" + ;; + r) region_sets=$OPTARG + echo "Set region sets" ;; *) echo "Invalid option" usage @@ -50,19 +57,20 @@ while getopts 'l:' opt; do esac done +((region_count=region_sets*3)) echo "" > "$test_log" echo "starting $(date)" | tee "$test_log" echo "Tail $test_log for test output" -echo "Creating downstairs regions" | tee -a "$test_log" +echo "Creating $region_count downstairs regions" | tee -a "$test_log" if ! ${dsc} create --cleanup --ds-bin "$downstairs" \ - --extent-count 50 >> "$test_log"; then + --extent-count 50 --region-count "$region_count" >> "$test_log"; then echo "Failed to create downstairs regions" exit 1 fi -echo "Starting downstairs" | tee -a "$test_log" -${dsc} start --ds-bin "$downstairs" >> "$test_log" 2>&1 & +echo "Starting $region_count downstairs" | tee -a "$test_log" +${dsc} start --ds-bin "$downstairs" --region-count "$region_count" >> "$test_log" 2>&1 & dsc_pid=$! sleep 5 if ! 
ps -p $dsc_pid > /dev/null; then From 17548cd9d07e8aec05af66238341e18cf37e731b Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Tue, 8 Oct 2024 19:11:43 +0000 Subject: [PATCH 2/3] Debug CI failures --- .github/buildomat/jobs/test-up-2region-encrypted.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/buildomat/jobs/test-up-2region-encrypted.sh b/.github/buildomat/jobs/test-up-2region-encrypted.sh index ced3cb731..2de38c77f 100644 --- a/.github/buildomat/jobs/test-up-2region-encrypted.sh +++ b/.github/buildomat/jobs/test-up-2region-encrypted.sh @@ -43,7 +43,7 @@ done export BINDIR=/var/tmp/bins # Give this test one hour to finish -jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list;kill $jobpid) & +jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list; pstack $(ps -ef | grep "dsc start" | grep -v grep | awk '{print $2}') ;kill $jobpid) & echo "Setup debug logging" mkdir /tmp/debug From 2c0786a526029769ef78500aefc86c57b8d88d36 Mon Sep 17 00:00:00 2001 From: Alan Hanson Date: Wed, 9 Oct 2024 08:37:03 -0700 Subject: [PATCH 3/3] demangle that stack --- .github/buildomat/jobs/test-up-2region-encrypted.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/buildomat/jobs/test-up-2region-encrypted.sh b/.github/buildomat/jobs/test-up-2region-encrypted.sh index 2de38c77f..1d991e765 100644 --- a/.github/buildomat/jobs/test-up-2region-encrypted.sh +++ b/.github/buildomat/jobs/test-up-2region-encrypted.sh @@ -43,7 +43,7 @@ done export BINDIR=/var/tmp/bins # Give this test one hour to finish -jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list; pstack $(ps -ef | grep "dsc start" | grep -v grep | awk '{print $2}') ;kill $jobpid) & +jobpid=$$; (sleep $(( 60 * 60 )); banner fail-timeout; ps -ef; zfs list; pstack $(ps -ef | grep "dsc start" | grep -v grep | awk '{print $2}') | demangle ;kill $jobpid) & echo "Setup debug logging" mkdir /tmp/debug