From 473d62d111a09064b05b74c5a822dc95efd1af2c Mon Sep 17 00:00:00 2001
From: Victor Bustamante
Date: Wed, 23 Oct 2024 17:22:07 -0300
Subject: [PATCH] chore: Don't retry migrations if they fail, raise timeout to 20 minutes

migrate: run the SDF migrate script on the first instance listed in
$instances only, instead of looping over the instances until one of them
reports success. The outcome is read straight from that instance's result
file ($results_directory/$instance_id.json); when its .status is not
"success" the file is printed and the script exits with status 2.

ssm-funcs.sh: raise the polling timeout in start_and_track_ssm_session from
760 to 1200 seconds. When the command status is still "InProgress" at the
point the result is evaluated, write an error result for the instance, warn
that the task may still be running on AWS, and return instead of falling
through to the generic "Command failed" branch.

---
NOTE(review): this patch was rebuilt from a copy whose newlines and
indentation had been collapsed. Leading whitespace on context lines and the
position of blank context lines are best guesses; if a hunk does not apply,
try `git apply --ignore-whitespace --recount` and compare with the target file.

 component/toolbox/scripts/migrate             | 29 ++++++++++---------
 .../scripts/supporting-funcs/ssm-funcs.sh     |  9 ++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/component/toolbox/scripts/migrate b/component/toolbox/scripts/migrate
index 7c3cf13b29..5e8c9d41b9 100755
--- a/component/toolbox/scripts/migrate
+++ b/component/toolbox/scripts/migrate
@@ -107,20 +107,23 @@ stop_results_file=stop_results.json
 upgrade_results_file=upgrade_results.json
 mkdir -p "$results_directory/"
 
-# attempt on each SDF until one succeeds or they all fail
-while read -r line; do
-  instance_id=$(echo "$line" | awk '{print $2}')
-  start_and_track_ssm_session "$instance_id" "$sdf_migrate_script" "$results_directory" "InstanceId=$instance_id"
-  await_file_results "$results_directory" 1
-  concat_and_output_json "$results_directory" "$check_results_file"
+# Run migration on the first SDF instance listed
+read -r line <<< "$instances"
 
-  if jq -e 'all(.[]; .status == "success")' "$results_directory/$check_results_file" > /dev/null; then
-    echo "SDF database has been migrated"
-    echo "----------------------------------------"
-    exit 0
-  fi
-done <<< "$instances"
+instance_id=$(echo "$line" | awk '{print $2}')
+
+echo "Running on $instance_id"
+
+start_and_track_ssm_session "$instance_id" "$sdf_migrate_script" "$results_directory" "InstanceId=$instance_id"
+
+
+if jq -e '.status == "success"' "$results_directory/$instance_id.json" > /dev/null; then
+  echo "SDF database has been migrated"
+  echo "----------------------------------------"
+  exit 0
+fi
 
-echo "Error: One or more of the checks failed to push a node into maintenance mode, try again later or look at the logs"
+cat "$results_directory/$instance_id.json"
+echo "Error: Failed to migrate system, try again later or look at the logs"
 exit 2
 
diff --git a/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh b/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
index 06e09de87b..0b1eed4ae2 100644
--- a/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
+++ b/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
@@ -29,8 +29,8 @@ start_and_track_ssm_session() {
   command_id=$(echo "$output" | jq -r '.Command.CommandId')
   echo "Info: tracking SSM execution ID: $command_id"
 
-  # Poll for command status with a timeout of 60 seconds
-  timeout=760
+  # Poll for command status
+  timeout=1200 # 20 minutes
   elapsed=0
   interval=5
 
@@ -55,6 +55,11 @@ start_and_track_ssm_session() {
       --instance-id "$instance_id" \
       | jq -r '.StandardOutputContent')
     echo "$output" >"$results_directory/$instance_id.json"
+  elif [ "$status" == "InProgress" ]; then
+    output="{\"instance_id\": \"$instance_id\", \"status\": \"error\", \"message\": \"Caller timed out after waiting\"}"
+    echo "$output" >"$results_directory/$instance_id.json"
+    echo "The github action has timed out, but the task may still be running. Check the ssm logs on aws"
+    return
   else
     echo "Command failed with status: $status"
     exit_code=$(aws ssm get-command-invocation \