From c4875ba963e8d2aab2c48481ff379296def8fcce Mon Sep 17 00:00:00 2001
From: Scott Prutton
Date: Wed, 28 Aug 2024 11:24:01 -0400
Subject: [PATCH] testing

---
Review notes. Text between the "---" marker and the first "diff --git" line
is not part of the commit message and is discarded by `git am`.

What the patch does, as far as the hunks below show:
* deploy-stack.yml: reorders the deploy into down-services ->
  upgrade-services -> up-services (pinga, rebaser) -> migrate-and-up-sdf,
  and gates e2e-validation on migrate-and-up-sdf.
* ssm-funcs.sh: start_and_track_ssm_session now takes
  (instance_id, script, results_directory, params) and forwards "params"
  verbatim to `aws ssm send-command --parameters`. All callers in migrate,
  service-state, toggle-maintenance and upgrade are moved to that signature.
* migrate / service-state: exit 0 or 2 depending on whether every result
  reports status "success".

Changed while reviewing:
* toggle-maintenance: the old five-argument call was left in place next to
  the new one. Under the new signature it would have passed "$service_name"
  as the results directory and "$maintenance" as the SSM parameter string,
  and sent a second command to every node. It is now removed
  (hunk is -126,6 +126,6; diffstat and totals updated to match).
* migrate and service-state: the result messages had been copied from other
  scripts (they referred to "$state" / "maintenance mode"). They now
  describe the operation each script actually performs.
* Line structure restored. Indentation and the position of blank context
  lines are reconstructed by convention and need checking against the tree.
  The post-image blob ids on the "index" lines of the three edited files
  were not recomputed.

Open questions for the author (not changed, cannot be confirmed from here):
* awsi.sh now runs systeminit/toolbox:test instead of :stable; presumably
  temporary, confirm before this reaches a deploy branch.
* service-state adds "S:" and upgrade adds "e:" to getopts, but no matching
  case arm is added in this patch; confirm the arms already exist.
* upgrade passes Environment= to the service-state document, while only
  si-check-node-upgrade gains an Environment parameter here; confirm
  si-service-state declares it.
* deploy-stack.yml: upgrade-services still uses down-service.yml, and jobs
  now need "set-service-versions"; confirm both are intended.
* e2e-validation gains `if: always()`, so it runs even when an upstream
  deploy job failed; confirm that is wanted.
* si-service-state echoes status "success" for down/up without checking the
  exit status of the `service` command.

 .github/workflows/deploy-stack.yml            | 33 +++++++++++++------
 .github/workflows/instance-refresh.yml        |  9 +++++
 .github/workflows/migrate-sdf.yml             |  4 +--
 .github/workflows/set-service-version.yml     |  2 +-
 .github/workflows/upgrade-veritech.yml        |  1 +
 component/init/configs/service.toml           |  1 +
 component/toolbox/awsi.sh                     |  2 +-
 component/toolbox/scripts/migrate             | 11 ++++++-
 component/toolbox/scripts/service-state       | 13 ++++++--
 .../scripts/ssm-scripts/si-check-node-upgrade | 10 ++++--
 .../scripts/ssm-scripts/si-migrate-sdf        |  9 -----
 .../scripts/ssm-scripts/si-service-state      |  6 ++--
 .../scripts/supporting-funcs/ssm-funcs.sh     |  7 ++--
 component/toolbox/scripts/toggle-maintenance  |  6 ++--
 component/toolbox/scripts/upgrade             |  8 ++---
 15 files changed, 80 insertions(+), 42 deletions(-)
 mode change 100644 => 100755 component/toolbox/scripts/migrate

diff --git a/.github/workflows/deploy-stack.yml b/.github/workflows/deploy-stack.yml
index d0331442e3..745ddea7b4 100644
--- a/.github/workflows/deploy-stack.yml
+++ b/.github/workflows/deploy-stack.yml
@@ -49,7 +49,6 @@ jobs:
 
   down-services:
     needs:
-      - set-latest-versions
       - set-maintenance-mode
     strategy:
       fail-fast: false
@@ -63,7 +62,6 @@ jobs:
 
   upgrade-web:
     needs:
-      - set-latest-versions
       - set-maintenance-mode
     uses: ./.github/workflows/upgrade-web.yml
     with:
@@ -72,34 +70,48 @@ jobs:
 
   upgrade-veritech:
     needs:
-      - set-latest-versions
+      - set-service-versions
       - set-maintenance-mode
     uses: ./.github/workflows/upgrade-veritech.yml
     with:
       environment: ${{ inputs.environment }}
     secrets: inherit
 
-  migrate-sdf:
+  upgrade-services:
     needs:
+      - set-service-versions
       - down-services
-    uses: ./.github/workflows/migrate-sdf.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        service: [ "pinga", "rebaser", "sdf" ]
+    uses: ./.github/workflows/down-service.yml
     with:
       environment: ${{ inputs.environment }}
+      service: ${{ matrix.service }}
     secrets: inherit
 
-  upgrade-services:
+  up-services:
     needs:
-      - migrate-sdf
+      - upgrade-services
     strategy:
       fail-fast: false
       matrix:
-        service: [ "pinga", "rebaser", "sdf" ]
-    uses: ./.github/workflows/down-service.yml
+        service: [ "pinga", "rebaser" ]
+    uses: ./.github/workflows/up-service.yml
     with:
       environment: ${{ inputs.environment }}
       service: ${{ matrix.service }}
     secrets: inherit
 
+  migrate-and-up-sdf:
+    needs:
+      - up-services
+    uses: ./.github/workflows/migrate-sdf.yml
+    with:
+      environment: ${{ inputs.environment }}
+    secrets: inherit
+
   e2e-validation:
     # We want to ensure that in-progress cron runs against tools-prod
     # are canceled when we do a deploy so they don't fail erroneously
@@ -109,7 +121,8 @@ jobs:
     needs:
       - upgrade-web
       - upgrade-veritech
-      - upgrade-services
+      - migrate-and-up-sdf
+    if: always()
     uses: ./.github/workflows/e2e-validation.yml
     with:
       environment: ${{ inputs.environment }}
diff --git a/.github/workflows/instance-refresh.yml b/.github/workflows/instance-refresh.yml
index 0d4ff73ca8..100ee07f2d 100644
--- a/.github/workflows/instance-refresh.yml
+++ b/.github/workflows/instance-refresh.yml
@@ -17,9 +17,18 @@ on:
 jobs:
   replace:
     environment: ${{ inputs.environment }}
+    concurrency:
+      group: instance-refresh-${{ inputs.environment }}-${{ inputs.service }}
+      cancel-in-progress: true
     name: Instance refresh
     runs-on: ubuntu-latest
     steps:
+      - name: Configure AWS credentials for ${{ inputs.environment }}
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY }}
+          aws-region: us-east-1
       - name: Instance refresh
         run: |
           poll_instance_refresh() {
diff --git a/.github/workflows/migrate-sdf.yml b/.github/workflows/migrate-sdf.yml
index 369276c759..66a7a2f4c9 100644
--- a/.github/workflows/migrate-sdf.yml
+++ b/.github/workflows/migrate-sdf.yml
@@ -24,7 +24,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
-      - name: Down service
+      - name: Migrate and up service
         run: |
-          component/toolbox/awsi.sh upgrade -p pull-from-env -r us-east-1 -a y -s sdf
           component/toolbox/awsi.sh migrate -p pull-from-env -r us-east-1 -a y -s sdf
+          component/toolbox/awsi.sh service-state -p pull-from-env -r us-east-1 -a y -s sdf -S up
diff --git a/.github/workflows/set-service-version.yml b/.github/workflows/set-service-version.yml
index f609c8e72b..c8410165f0 100644
--- a/.github/workflows/set-service-version.yml
+++ b/.github/workflows/set-service-version.yml
@@ -24,7 +24,7 @@ jobs:
     environment: ${{ inputs.environment }}
     runs-on: ubuntu-latest
     steps:
-      - name: Configure AWS credentials for production
+      - name: Configure AWS credentials for ${{ inputs.environment }}
         uses: aws-actions/configure-aws-credentials@v4
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
diff --git a/.github/workflows/upgrade-veritech.yml b/.github/workflows/upgrade-veritech.yml
index 441fbb005f..ae421c2e7c 100644
--- a/.github/workflows/upgrade-veritech.yml
+++ b/.github/workflows/upgrade-veritech.yml
@@ -18,6 +18,7 @@ jobs:
     secrets: inherit
 
   up-veritech:
+    needs: upgrade-veritech
     uses: ./.github/workflows/up-service.yml
     with:
       environment: ${{ inputs.environment }}
diff --git a/component/init/configs/service.toml b/component/init/configs/service.toml
index 4a6b352882..c39326d5ac 100644
--- a/component/init/configs/service.toml
+++ b/component/init/configs/service.toml
@@ -1,3 +1,4 @@
+migration_mode = "skip"
 pkgs_path = "/tmp"
 create_workspace_permissions = "$SI_WORKSPACE_PERMISSIONS"
 create_workspace_allowlist = [ "$SI_WORKSPACE_ALLOW_LIST" ]
diff --git a/component/toolbox/awsi.sh b/component/toolbox/awsi.sh
index 9fa76c6611..99c13940f3 100755
--- a/component/toolbox/awsi.sh
+++ b/component/toolbox/awsi.sh
@@ -14,4 +14,4 @@ docker run --rm "${terminal}" \
   -e AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" \
   -e AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}" \
   -e AWS_SESSION_TOKEN="${AWS_SESSION_TOKEN}" \
-  systeminit/toolbox:stable "$*"
+  systeminit/toolbox:test "$*"
diff --git a/component/toolbox/scripts/migrate b/component/toolbox/scripts/migrate
old mode 100644
new mode 100755
index af5bdc7a81..8e19d0b9a1
--- a/component/toolbox/scripts/migrate
+++ b/component/toolbox/scripts/migrate
@@ -110,9 +110,18 @@ mkdir -p "$results_directory/"
 # get the first SDF and go do the thing
 while read -r line; do
   instance_id=$(echo "$line" | awk '{print $2}')
-  start_and_track_ssm_session "$instance_id" "$sdf_migrate_script" "sdf" "migrate" "$results_directory" &
+  start_and_track_ssm_session "$instance_id" "$sdf_migrate_script" "$results_directory" "InstanceId=$instance_id"
   break
 done <<< "$instances"
 
 await_file_results "$results_directory" 2
 concat_and_output_json "$results_directory" "$check_results_file"
+
+if jq -e 'all(.[]; .status == "success")' "$results_directory/$check_results_file" > /dev/null; then
+  echo "The SDF migration completed successfully"
+  echo "----------------------------------------"
+  exit 0
+else
+  echo "Error: The SDF migration did not report success, try again later or look at the logs"
+  exit 2
+fi
diff --git a/component/toolbox/scripts/service-state b/component/toolbox/scripts/service-state
index 0cf95a0c3b..c95f68d487 100755
--- a/component/toolbox/scripts/service-state
+++ b/component/toolbox/scripts/service-state
@@ -41,7 +41,7 @@ if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
 fi
 
 # Parse flags
-while getopts ":p:r:a:s:" opt; do
+while getopts ":p:r:a:s:S:" opt; do
   case ${opt} in
     p)
       profile=$OPTARG
@@ -120,7 +120,7 @@ i=1
 while read -r line; do
   instance_id=$(echo "$line" | awk '{print $2}')
   service_name=$(echo "$line" | awk '{print $1}' | awk -F- '{print $2}')
-  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service_name" "$state" "$results_directory" # Serially
+  start_and_track_ssm_session "$instance_id" "$service_state_script" "$results_directory" "InstanceId=$instance_id,Service=$service_name,Action=$state" # Serially
   ((i++))
 done <<< "$instances"
 
@@ -128,4 +128,11 @@ await_file_results "$results_directory" $((i - 1))
 
 concat_and_output_json "$results_directory" "$check_results_file"
 
-echo "All active binary services of ${service} have been set to ${state}"
+if jq -e 'all(.[]; .status == "success")' "$results_directory/$check_results_file" > /dev/null; then
+  echo "All running service nodes of ${service} have had their state set to $state"
+  echo "----------------------------------------"
+  exit 0
+else
+  echo "Error: One or more nodes of ${service} failed to reach state ${state}, try again later or look at the logs"
+  exit 2
+fi
diff --git a/component/toolbox/scripts/ssm-scripts/si-check-node-upgrade b/component/toolbox/scripts/ssm-scripts/si-check-node-upgrade
index 63e4a9b91a..62f5b12dc1 100644
--- a/component/toolbox/scripts/ssm-scripts/si-check-node-upgrade
+++ b/component/toolbox/scripts/ssm-scripts/si-check-node-upgrade
@@ -14,6 +14,10 @@ parameters:
     type: "String"
     description: "Action to execute [not yet used]"
     default: "N/A"
+  Environment:
+    type: "String"
+    description: "Environment to run in"
+    default: "N/A"
 
 mainSteps:
 - action: "aws:runShellScript"
@@ -24,8 +28,10 @@ mainSteps:
         # JW: This assessment blindly assumes that there are no additional configuration changes to the binaries or runtimes and that they are a direct application code replacement
         # this is a little naive but we can check the deployment specs/ymls if we wish to conduct this check too.
 
-        DESIRED_VERSION=$(aws ssm get-parameter --query "Parameter.Value" --output text --name "si-version-$SI_SERVICE")
-        NEW_VERSION=$(curl -Ls https://artifacts.systeminit.com/{{ Service }}/${DESIRED_VERSION}/omnibus/linux/$(arch)/{{ Service }}-${DESIRED_VERSION}-omnibus-linux-$(arch).tar.gz.metadata.json | jq -r '.version')
+        export SI_SERVICE={{ Service }}
+        export SI_HOSTENV={{ Environment }}
+        export SI_VERSION=$(aws ssm get-parameter --query "Parameter.Value" --output text --name "$SI_HOSTENV-si-version-$SI_SERVICE")
+        NEW_VERSION=$(curl -Ls https://artifacts.systeminit.com/{{ Service }}/${SI_VERSION}/omnibus/linux/$(arch)/{{ Service }}-${SI_VERSION}-omnibus-linux-$(arch).tar.gz.metadata.json | jq -r '.version')
         RUNNING_VERSION=$(sudo find / -wholename '/etc/nix-omnibus/{{ Service }}/**/metadata.json' | tail -n 1 | xargs cat | jq -r '.version')
 
         # Check if both versions are set to non-empty values
diff --git a/component/toolbox/scripts/ssm-scripts/si-migrate-sdf b/component/toolbox/scripts/ssm-scripts/si-migrate-sdf
index e7db596f69..50a9f91d70 100644
--- a/component/toolbox/scripts/ssm-scripts/si-migrate-sdf
+++ b/component/toolbox/scripts/ssm-scripts/si-migrate-sdf
@@ -2,19 +2,10 @@
 schemaVersion: "2.2"
 description: "Run a oneshot SDF with MigrationMode=RunAndQuit"
 parameters:
-  Service:
-    type: "String"
-    description: "Service to Run on Node"
-    default: "N/A"
   InstanceId:
     type: "String"
     description: "InstanceId of the executing node"
     default: "N/A"
-  Action:
-    type: "String"
-    description: "Action to execute [not yet used]"
-    default: "N/A"
-
 mainSteps:
 - action: "aws:runShellScript"
   name: "example"
diff --git a/component/toolbox/scripts/ssm-scripts/si-service-state b/component/toolbox/scripts/ssm-scripts/si-service-state
index 8af3fef68d..e24bc58f89 100644
--- a/component/toolbox/scripts/ssm-scripts/si-service-state
+++ b/component/toolbox/scripts/ssm-scripts/si-service-state
@@ -24,12 +24,14 @@ mainSteps:
   inputs:
     runCommand:
       - |
-        case {{ Action }} in
+        case "{{ Action }}" in
           "down")
             service {{ Service }} stop
+            echo "{\"instance_id\": \"{{ InstanceId }}\", \"status\": \"success\", \"service\": \"{{ Service }}\", \"state\": \"{{ Action }}\" }"
           ;;
           "up")
             service {{ Service }} restart
+            echo "{\"instance_id\": \"{{ InstanceId }}\", \"status\": \"success\", \"service\": \"{{ Service }}\", \"state\": \"{{ Action }}\" }"
           ;;
           "upgrade")
             export SI_SERVICE={{ Service }}
@@ -42,7 +44,7 @@ mainSteps:
             docker-compose -f /run/app/docker-compose.yaml --profile $SI_SERVICE up --wait
 
             wget https://artifacts.systeminit.com/{{ Service }}/${SI_VERSION}/omnibus/linux/$(arch)/{{ Service }}-${SI_VERSION}-omnibus-linux-$(arch).tar.gz -O - | tar -xzf - -C /
-            METADATA==$(sudo find / -wholename '/etc/nix-omnibus/{{ Service }}/**/metadata.json' | tail -n 1 | xargs cat | jq)
+            METADATA=$(sudo find / -wholename '/etc/nix-omnibus/{{ Service }}/**/metadata.json' | tail -n 1 | xargs cat | jq)
             COMMIT=$(echo $METADATA | jq -r '.commit')
             RUNNING_VERSION=$(echo $METADATA | jq -r '.version')
 
diff --git a/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh b/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
index c358c93953..aa126bb270 100644
--- a/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
+++ b/component/toolbox/scripts/supporting-funcs/ssm-funcs.sh
@@ -13,11 +13,10 @@ start_and_track_ssm_session() {
 
     instance_id=$1
     script=$2
-    service=$3
-    action=$4
-    results_directory=$5
+    results_directory=$3
+    params=$4
 
-    output=$(aws ssm send-command --instance-ids "$instance_id" --document-name "$script" --parameters "Service=$service,InstanceId=$instance_id,Action=$action" 2>&1)
+    output=$(aws ssm send-command --instance-ids "$instance_id" --document-name "$script" --parameters "$params" 2>&1)
     status=$?
 
 
diff --git a/component/toolbox/scripts/toggle-maintenance b/component/toolbox/scripts/toggle-maintenance
index aa564d4460..0dbbf4146f 100755
--- a/component/toolbox/scripts/toggle-maintenance
+++ b/component/toolbox/scripts/toggle-maintenance
@@ -1,7 +1,7 @@
 #!/bin/bash
 # ---------------------------------------------------------------------------------------------------
 # Identify all the service replicas/machines and permits you to toggle maintenance mode on or off via
-# an SSM document execution on the host. If the json output from the SSM executions is not enough to 
+# an SSM document execution on the host. If the json output from the SSM executions is not enough to
 # debug just look in AWS and you'll see the whole execution history in SSM Command Execution History.
 # ---------------------------------------------------------------------------------------------------
 
@@ -10,7 +10,7 @@ set -eo pipefail
 
 # Find & Import all the supporting functions from the supporting folder
 
-# Get the directory of the current script to figure out where the 
+# Get the directory of the current script to figure out where the
 # Supporting funcs are
 IMPORT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
 
@@ -126,6 +126,6 @@ while read -r line; do
   instance_id=$(echo "$line" | awk '{print $2}')
   service_name=$(echo "$line" | awk '{print $1}' | awk -F- '{print $2}')
-  start_and_track_ssm_session "$instance_id" "$service_maintenance_script" "$service_name" "$maintenance" "$results_directory" &
+  start_and_track_ssm_session "$instance_id" "$service_maintenance_script" "$results_directory" "InstanceId=$instance_id,Action=$maintenance" &
   ((i++))
 done <<< "$instances"
 
diff --git a/component/toolbox/scripts/upgrade b/component/toolbox/scripts/upgrade
index 28871561c9..d60e6bd870 100755
--- a/component/toolbox/scripts/upgrade
+++ b/component/toolbox/scripts/upgrade
@@ -51,7 +51,7 @@ if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
 fi
 
 # Parse flags
-while getopts ":p:r:a:s:" opt; do
+while getopts ":p:r:a:s:e:" opt; do
   case ${opt} in
     p)
       profile=$OPTARG
@@ -130,7 +130,7 @@ i=1
 while read -r line; do
   instance_id=$(echo "$line" | awk '{print $2}')
   service_name=$(echo "$line" | awk '{print $1}' | awk -F- '{print $2}')
-  start_and_track_ssm_session "$instance_id" "$upgrade_check_script" "$service_name" "check" "$results_directory" &
+  start_and_track_ssm_session "$instance_id" "$upgrade_check_script" "$results_directory" "InstanceId=$instance_id,Service=$service_name,Environment=$environment" &
   ((i++))
 done <<< "$instances"
 
@@ -162,7 +162,7 @@ upgrade_hosts_num=$(jq 'map(select(.service == "veritech")) | .[]' <<< $upgrade_
 jq 'map(select(.service == "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | while read -r line; do
   instance_id=$(echo "$line" | jq -r '.instance_id')
   service_name=$(echo "$line" | jq -r '.service')
-  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service_name" "upgrade" "$results_directory" -e "$environment" # Serially
+  start_and_track_ssm_session "$instance_id" "$service_state_script" "$results_directory" "InstanceId=$instance_id,Service=$service_name,Action=upgrade,Environment=$environment"
 done
 
 # Wait until all the results arrive
@@ -173,7 +173,7 @@ upgrade_hosts_num=$(jq 'map(select(.service != "veritech")) | .[]' <<< $upgrade_
 jq 'map(select(.service != "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | while read -r line; do
   instance_id=$(echo "$line" | jq -r '.instance_id')
   service_name=$(echo "$line" | jq -r '.service')
-  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service_name" "upgrade" "$results_directory" -e "$environment" & # In Parallel
+  start_and_track_ssm_session "$instance_id" "$service_state_script" "$results_directory" "InstanceId=$instance_id,Service=$service_name,Action=upgrade,Environment=$environment" &
   ((i++))
 done
 