From 75b531d7ec18983799a62ff09843b64d0920299e Mon Sep 17 00:00:00 2001 From: John Watson Date: Thu, 1 Aug 2024 16:52:38 +0100 Subject: [PATCH] feat: add toolbox script for deployment --- .gitignore | 3 + component/toolbox/Dockerfile | 5 +- component/toolbox/scripts/ssm | 2 +- component/toolbox/scripts/upgrade | 314 ++++++++++++++++++++++++++++++ 4 files changed, 321 insertions(+), 3 deletions(-) create mode 100755 component/toolbox/scripts/upgrade diff --git a/.gitignore b/.gitignore index d076d4d5d2..a22439527f 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,6 @@ bin/lang-js/docs/ # direnv .direnv + +# toolbox +component/toolbox/results/ diff --git a/component/toolbox/Dockerfile b/component/toolbox/Dockerfile index 1ca9d0d0d3..57865c0450 100644 --- a/component/toolbox/Dockerfile +++ b/component/toolbox/Dockerfile @@ -1,9 +1,10 @@ FROM amazon/aws-cli:2.16.9 RUN set -eux; \ - yum update; \ + yum update -y; \ arch=$(arch | sed s/aarch64/arm64/ | sed s/x86_64/64bit/) && yum install -y \ - https://s3.amazonaws.com/session-manager-downloads/plugin/latest/linux_${arch}/session-manager-plugin.rpm; + https://s3.amazonaws.com/session-manager-downloads/plugin/latest/linux_${arch}/session-manager-plugin.rpm; \ + yum install -y jq COPY ./scripts/* /usr/local/bin/si/ ENV PATH="/usr/local/bin/si:${PATH}" diff --git a/component/toolbox/scripts/ssm b/component/toolbox/scripts/ssm index 40938cd5dc..004bf64e57 100755 --- a/component/toolbox/scripts/ssm +++ b/component/toolbox/scripts/ssm @@ -97,7 +97,7 @@ while read -r line; do ((i++)) done <<< "$instances" -read -p "Select an instance by number: " selection +read -p "Select an instance by index: " selection instance_id=$(echo "$instances" | sed -n "${selection}p" | awk '{print $2}') if [ -z "$instance_id" ]; then diff --git a/component/toolbox/scripts/upgrade b/component/toolbox/scripts/upgrade new file mode 100755 index 0000000000..2c61383a12 --- /dev/null +++ b/component/toolbox/scripts/upgrade @@ -0,0 +1,314 @@ 
#!/bin/bash
# ---------------------------------------------------------------------------------------------------
# Identify all the machines which are upgrade-able via this method.
# Runs an SSM document against every matching host to check whether an upgrade is available, then
# offers to upgrade all of those endpoints to the latest stable version.
# Every time an SSM command is executed against a host, a record of it is written to
# ./results/<timestamp>/<instance-id>.json; once all the commands have completed an aggregated file
# is created in that directory too. If the JSON output from the SSM executions is not enough to
# debug, look in AWS and you'll see the whole execution history in SSM Command Execution History.
# ---------------------------------------------------------------------------------------------------

# Print the help text (stdout) without exiting.
print_usage() {
  echo
  echo "upgrade"
  echo "----------------------------------"
  echo "This script will open an SSM session to all available"
  echo "nodes in the region and will check whether they have an"
  echo "upgrade available, if so the user can proceed and upgrade"
  echo "them all in parallel"
  echo "----------------------------------"
  echo "Usage: upgrade [-p profile] [-r region]"
  echo "    -p profile    AWS profile to use"
  echo "    -r region     AWS region to use"
  echo
}

# Print the help text and terminate the script with status 1.
usage() {
  print_usage
  exit 1
}

# Refuse to be sourced. This runs before any `set` so the caller's shell options
# are untouched, and it uses `return` because `exit` would kill the caller's shell.
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
  print_usage
  return 1
fi

# Stop immediately if anything goes wrong, let's not create too much
# mess if John's shell is poor
set -eo pipefail

# Parse flags
while getopts ":p:r:" opt; do
  case "${opt}" in
    p)
      profile=$OPTARG
      ;;
    r)
      region=$OPTARG
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      usage
      ;;
  esac
done

# List running EC2 instances, one per line: Name tag, InstanceId, InstanceType, PrivateIpAddress.
list_instances() {
  aws ec2 describe-instances \
    --query 'Reservations[*].Instances[?State.Name==`running`].[Tags[?Key==`Name`].Value | [0],InstanceId,InstanceType,PrivateIpAddress]' \
    --output text
}

# Print the status of the first invocation of an SSM command.
# Arguments: $1 - command id (optional; falls back to the caller's $command_id)
# Outputs:   the status string; empty if the lookup itself failed, so that a
#            polling caller simply tries again.
check_ssm_command_status() {
  local id=${1:-${command_id:-}}
  local status

  status=$(aws ssm list-command-invocations \
    --command-id "$id" \
    --details \
    | jq -r '.CommandInvocations[0].Status') || status=""
  echo "$status"
}

# Build a one-line JSON error record. jq does the escaping, so arbitrary AWS
# error text (quotes, newlines) cannot produce invalid JSON.
# Arguments: $1 - instance id, $2 - service, $3 - message
error_record() {
  jq -n -c \
    --arg instance_id "$1" \
    --arg service "$2" \
    --arg message "$3" \
    '{instance_id: $instance_id, status: "error", service: $service, message: $message}'
}

# Write a host's result as <results dir>/<instance id>.json.
# The content goes to a temporary name first and is then renamed, so anything
# watching for *.json never observes a half-written file.
# Arguments: $1 - results directory, $2 - instance id, $3 - file content
write_result() {
  local target="$1/$2.json"

  printf '%s\n' "$3" > "$target.tmp"
  mv -- "$target.tmp" "$target"
}

# Run an SSM document against one instance and record the outcome.
# Exactly one <instance id>.json file is written on every path (success or
# failure), which is what await_ssm_results counts.
# Arguments: $1 - instance id
#            $2 - SSM document name
#            $3 - service name (passed as the Service parameter)
#            $4 - action (passed as the Action parameter)
#            $5 - results directory
start_and_track_ssm_session() {
  local instance_id=$1
  local script=$2
  local service=$3
  local action=$4
  local results_directory=$5
  local output command_id status invocation exit_code stderr_content
  local timeout=60 elapsed=0 interval=1

  # Test the command directly: under `set -e` a separate `$?` check would never run.
  if ! output=$(aws ssm send-command \
    --instance-ids "$instance_id" \
    --document-name "$script" \
    --parameters "Service=$service,InstanceId=$instance_id,Action=$action" 2>&1); then
    write_result "$results_directory" "$instance_id" \
      "$(error_record "$instance_id" "$service" "$output")"
    return 0
  fi

  if ! command_id=$(jq -er '.Command.CommandId' <<< "$output" 2> /dev/null); then
    write_result "$results_directory" "$instance_id" \
      "$(error_record "$instance_id" "$service" "Unable to read a command id from send-command output: $output")"
    return 0
  fi

  # Poll for a terminal status; gives up after $timeout one-second sleeps.
  status=""
  while (( elapsed < timeout )); do
    status=$(check_ssm_command_status "$command_id")

    case "$status" in
      Success | Failed | TimedOut | Cancelled)
        break
        ;;
    esac

    sleep "$interval"
    elapsed=$((elapsed + interval))
  done

  if ! invocation=$(aws ssm get-command-invocation \
    --command-id "$command_id" \
    --instance-id "$instance_id"); then
    write_result "$results_directory" "$instance_id" \
      "$(error_record "$instance_id" "$service" "Unable to fetch the invocation of command $command_id (last status: $status)")"
    return 0
  fi

  if [[ "$status" == "Success" ]]; then
    # The document's stdout is stored as this host's result.
    write_result "$results_directory" "$instance_id" \
      "$(jq -r '.StandardOutputContent' <<< "$invocation")"
  else
    exit_code=$(jq -r '.ResponseCode' <<< "$invocation")
    stderr_content=$(jq -r '.StandardErrorContent' <<< "$invocation")
    {
      echo "Command failed with status: $status"
      echo "Exit code: $exit_code"
      echo "Failure message:"
      echo "$stderr_content"
    } >&2
    # Record the failure so the aggregate shows it instead of the waiter timing out.
    write_result "$results_directory" "$instance_id" \
      "$(error_record "$instance_id" "$service" "SSM command $command_id ended with status '$status' (exit code $exit_code): $stderr_content")"
  fi
}

# Resolve a setting: explicit value first, then the named environment variable,
# otherwise prompt the user.
# Arguments: $1 - explicit value (may be empty)
#            $2 - NAME of the environment variable to fall back to
#            $3 - prompt text
# Outputs:   the resolved value on stdout
get_param_or_env() {
  local param=$1
  local env_var=$2
  local prompt=$3
  local value

  if [[ -n "$param" ]]; then
    echo "$param"
  elif [[ -n "${!env_var}" ]]; then
    echo "${!env_var}"
  else
    read -r -p "$prompt: " value
    echo "$value"
  fi
}

# Block until the results directory holds at least the required number of
# *.json result files; exits the script with status 1 after 60 seconds.
# Arguments: $1 - results directory, $2 - required number of result files
await_ssm_results() {
  local results_directory=$1
  local required_file_count=$2
  local timeout=60 # Timeout in seconds
  local start_time current_time elapsed_time file_count result_file

  start_time=$(date +%s)

  while true; do
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))

    if (( elapsed_time > timeout )); then
      echo "Error: Timeout reached waiting for SSM document responses to arrive. Not all files are present." >&2
      exit 1
    fi

    # Count with a glob rather than parsing `ls`; an unmatched glob stays
    # literal, hence the existence test.
    file_count=0
    for result_file in "$results_directory"/*.json; do
      if [[ -e "$result_file" ]]; then
        file_count=$((file_count + 1))
      fi
    done

    if (( file_count >= required_file_count )); then
      break
    fi

    # Wait for a short period before checking again
    sleep 1
  done
}

# Exit the script unless the answer is "y" or "Y".
# Arguments: $1 - the user's answer
sassy_selection_check() {
  local selection=${1^^}

  if [[ "$selection" != "Y" ]]; then
    echo "Don't Trust Scott and John? We're friends I promise, exiting"
    exit 1
  fi
}

# Aggregate every per-host result in a directory into one JSON array file,
# then pretty-print it.
# Arguments: $1 - results directory, $2 - name of the aggregate file to create
concat_and_output() {
  local results_directory=$1
  local output_file=$2
  local aggregate_path="$results_directory/$output_file"
  local result_file
  local -a result_files=()

  if [[ ! -d "$results_directory/" ]]; then
    echo "Results Directory $results_directory does not exist."
    exit 1
  fi

  # Fix the input list BEFORE the aggregate file is created, so the aggregate
  # can never be one of its own inputs.
  for result_file in "$results_directory"/*; do
    [[ -f "$result_file" && "$result_file" != "$aggregate_path" ]] || continue
    result_files+=("$result_file")
  done

  if (( ${#result_files[@]} == 0 )); then
    # With no file arguments jq would read stdin and hang.
    echo "[]" > "$aggregate_path"
  else
    jq -s '.' "${result_files[@]}" > "$aggregate_path"
  fi

  jq '.' "$aggregate_path"
  echo "----------------------------------------"
  echo "Results can be found within $results_directory"
  echo "----------------------------------------"
}

# ---------------------------------------------------------------------------------------------------
# Main script
# ---------------------------------------------------------------------------------------------------
profile=$(get_param_or_env "$profile" "AWS_PROFILE" "Enter the AWS profile to use")
region=$(get_param_or_env "$region" "AWS_REGION" "Enter the AWS region (e.g., us-west-2)")

# Define the SSM documents to execute the functions
readonly upgrade_check_script="si-check-node-upgrade"
readonly service_state_script="si-service-state"

export AWS_PROFILE="$profile"
export AWS_REGION="$region"

# List instances and filter for the upgradeable ones. The listing and the filter
# are separate steps so an AWS failure still aborts, while "grep matched nothing"
# (exit 1) reaches the friendly message below instead of tripping `set -e`.
all_instances=$(list_instances)
instances=$(grep -E 'sdf|pinga|rebaser|veritech' <<< "$all_instances" || true)
if [[ -z "$instances" ]]; then
  echo "No running instances found."
  exit 1
fi

echo "----------------------------------------"
echo "Running instances of sdf/pinga/rebaser/veritech in the region $region:"
printf "%-5s %-20s %-20s %-20s %-20s\n" "Index" "Name" "InstanceId" "InstanceType" "PrivateIpAddress"
i=1
while read -r name instance_id instance_type private_ip _; do
  printf "%-5s %-20s %-20s %-20s %-20s\n" "$i" "$name" "$instance_id" "$instance_type" "$private_ip"
  i=$((i + 1))
done <<< "$instances"
echo "----------------------------------------"

read -r -p "Would you like to see if an SI binary upgrade is available to these hosts? (Y/N) [takes ~30 seconds] " selection

sassy_selection_check "$selection"

# Setup somewhere unique to push the results of the check into if they chose to continue
# Reset this results_directory variable between each execution run.
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
check_results_file=check_results.json
upgrade_results_file=upgrade_results.json
mkdir -p "$results_directory/"

# Fan the upgrade check out to every host in parallel.
instance_count=0
while read -r name instance_id _; do
  # The service is the second dash-separated part of the Name tag.
  service=$(awk -F- '{print $2}' <<< "$name")
  start_and_track_ssm_session "$instance_id" "$upgrade_check_script" "$service" "check" "$results_directory" &
  instance_count=$((instance_count + 1))
done <<< "$instances"

await_ssm_results "$results_directory" "$instance_count"
wait # every result is on disk; reap the finished background jobs

concat_and_output "$results_directory" "$check_results_file"

if jq -e 'all(.[]; .status == "success") and any(.[]; .upgradeable == "true")' "$results_directory/$check_results_file" > /dev/null; then
  read -r -p "Would you like to push the new binaries out to the upgradeable hosts? (Y/N) " selection
  sassy_selection_check "$selection"
else
  echo "Error: Either none are upgradeable or one or more of the checks failed to determine whether it was possible to upgrade the node."
  exit 1
fi

# NOTE(review): every host in the check results is upgraded below, not only the
# ones that reported upgradeable == "true" - confirm this is intended.
upgrade_candidates_json=$(< "$results_directory/$check_results_file")

# reset the results_directory variable for the next set of results
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
mkdir -p "$results_directory"

# For all order 1 services, upgrade them in *sequence*
# i.e.: For Veritech, this means that we will always have at least 50% execution capacity as one node will
# always be functioning
veritech_hosts_num=$(jq '[.[] | select(.service == "veritech")] | length' <<< "$upgrade_candidates_json")
while IFS= read -r line; do
  instance_id=$(jq -r '.instance_id' <<< "$line")
  service=$(jq -r '.service' <<< "$line")
  # Serially; stdin is detached so the AWS CLI cannot consume the loop's input.
  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" < /dev/null
done < <(jq -c '.[] | select(.service == "veritech")' <<< "$upgrade_candidates_json")

# Wait until all the results arrive
await_ssm_results "$results_directory" "$veritech_hosts_num"

# Continue with the rest of the service nodes
while IFS= read -r line; do
  instance_id=$(jq -r '.instance_id' <<< "$line")
  service=$(jq -r '.service' <<< "$line")
  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" & # In Parallel
done < <(jq -c '.[] | select(.service != "veritech")' <<< "$upgrade_candidates_json")

# Concatenate all the results together
upgrade_hosts_num=$(jq 'length' <<< "$upgrade_candidates_json")
await_ssm_results "$results_directory" "$upgrade_hosts_num"
wait
concat_and_output "$results_directory" "$upgrade_results_file"

echo "All active binary services have been rotated, WEB IS STILL TO BE UPDATED."
echo "If you are running in CI, this will be ran automatically for you. If you"
echo "executed this manually/locally, you need to run the below pipeline, with"
echo "either tools or production, depending which environment you're executing"
echo "this tool against with service set to web and version as stable."
echo "https://github.com/systeminit/si/actions/workflows/deploy-stack.yml"
echo "----------------------------------------"