Merge pull request #4275 from systeminit/feat/add-deployment-magic
Locally executing deployment toolbox
johnrwatson authored Aug 5, 2024
2 parents a544ef3 + 49908b9 commit 80235b0
Showing 4 changed files with 321 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -47,3 +47,6 @@ bin/lang-js/docs/

# direnv
.direnv

# toolbox
component/toolbox/results/
4 changes: 3 additions & 1 deletion component/toolbox/Dockerfile
@@ -1,8 +1,10 @@
FROM amazon/aws-cli:2.16.9

RUN set -eux; \
yum update -y; \
arch=$(arch | sed s/aarch64/arm64/ | sed s/x86_64/64bit/) && yum install -y \
https://s3.amazonaws.com/session-manager-downloads/plugin/latest/linux_${arch}/session-manager-plugin.rpm;
https://s3.amazonaws.com/session-manager-downloads/plugin/latest/linux_${arch}/session-manager-plugin.rpm; \
yum install -y jq

COPY ./scripts/* /usr/local/bin/si/
ENV PATH="/usr/local/bin/si:${PATH}"
2 changes: 1 addition & 1 deletion component/toolbox/scripts/ssm
@@ -97,7 +97,7 @@ while read -r line; do
((i++))
done <<< "$instances"

read -p "Select an instance by number: " selection
read -p "Select an instance by index: " selection
instance_id=$(echo "$instances" | sed -n "${selection}p" | awk '{print $2}')

if [ -z "$instance_id" ]; then
314 changes: 314 additions & 0 deletions component/toolbox/scripts/upgrade
@@ -0,0 +1,314 @@
#!/bin/bash
# ---------------------------------------------------------------------------------------------------
# Identify all the machines which are upgrade-able via this method, then SSM/shell onto each of
# them to check whether an upgrade is available, and if so offer to upgrade all of the endpoints
# to the latest stable version.
# Every time an SSM command is executed against a host, a record of it is written to a
# ./results/<timestamp>/<instance-id>.json file, and once all the commands have completed an
# aggregated file is created in that directory too. If the JSON output from the SSM executions
# is not enough to debug, just look in AWS and you'll see the whole execution history under
# SSM Command Execution History.
# ---------------------------------------------------------------------------------------------------

# Stop immediately if anything goes wrong; let's not create too much
# mess if John's shell is poor.
set -eo pipefail

usage() {
echo
echo "upgrade"
echo "----------------------------------"
echo "This script will open an SSM session to all available"
echo "nodes in the region and will check whether they have an"
echo "upgrade available, if so the user can proceed and upgrade"
echo "them all in parallel"
echo "----------------------------------"
echo "Usage: upgrade [-p profile] [-r region]"
echo " -p profile AWS profile to use"
echo " -r region AWS region to use"
echo
exit 1
}
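
# Example invocation (the profile and region values here are purely illustrative):
#   upgrade -p my-aws-profile -r us-east-1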

# Check whether the script is being sourced rather than executed directly
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
usage
fi

# Parse flags
while getopts ":p:r:" opt; do
case ${opt} in
p)
profile=$OPTARG
;;
r)
region=$OPTARG
;;
\?)
echo "Invalid option: -$OPTARG" >&2
usage
;;
:)
echo "Option -$OPTARG requires an argument." >&2
usage
;;
esac
done

# Function to list EC2 instances with their Name tag
list_instances() {
aws ec2 describe-instances --query 'Reservations[*].Instances[?State.Name==`running`].[Tags[?Key==`Name`].Value | [0],InstanceId,InstanceType,PrivateIpAddress]' --output text
}
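
# Illustrative example of the list_instances output (values below are made up): one line per
# running instance in the form "<Name> <InstanceId> <InstanceType> <PrivateIpAddress>", e.g.
#   sdf-main    i-0123456789abcdef0    m6i.large    10.0.1.23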

# Function to check the status of an SSM command; relies on $command_id being set by the caller
check_ssm_command_status() {
status=$(aws ssm list-command-invocations \
--command-id "$command_id" \
--details \
| jq -r '.CommandInvocations[0].Status')
echo "$status"
}

# Function to send an SSM command to an instance and track it through to completion
start_and_track_ssm_session() {

instance_id=$1
script=$2
service=$3
action=$4
results_directory=$5

output=$(aws ssm send-command --instance-ids "$instance_id" --document-name "$script" --parameters "Service=$service,InstanceId=$instance_id,Action=$action" 2>&1)

status=$?

if [ $status -ne 0 ]; then
output="{\"instance_id\": \"$instance_id\", \"status\": \"error\", \"service\": \"$service\", \"message\": \"$output\"}"
echo "$output" > "$results_directory/$instance_id.json"
return
fi

command_id=$(echo "$output" | jq -r '.Command.CommandId')

# Poll for command status with a timeout of 60 seconds
timeout=60
elapsed=0
interval=1

while [ $elapsed -lt $timeout ]; do
status=$(check_ssm_command_status)

if [ "$status" == "Success" ] || [ "$status" == "Failed" ] || [ "$status" == "TimedOut" ] || [ "$status" == "Cancelled" ]; then
break
fi

sleep $interval
elapsed=$((elapsed + interval))
done

# Check if command was successful
if [ "$status" == "Success" ]; then
# Get the output
output=$(aws ssm get-command-invocation \
--command-id "$command_id" \
--instance-id "$instance_id" \
| jq -r '.StandardOutputContent')
echo "$output" > "$results_directory/$instance_id.json"
else
echo "Command failed with status: $status"
exit_code=$(aws ssm get-command-invocation \
--command-id "$command_id" \
--instance-id "$instance_id" \
| jq -r '.ResponseCode')

echo "Exit code: $exit_code"
echo "Failure message:"
aws ssm get-command-invocation \
--command-id "$command_id" \
--instance-id "$instance_id" \
| jq -r '.StandardErrorContent'
fi

}

# Function to get input or use environment variable
get_param_or_env() {
local param=$1
local env_var=$2
local prompt=$3

if [ -z "$param" ]; then
if [ -z "${!env_var}" ]; then
read -p "$prompt: " value
echo "$value"
else
echo "${!env_var}"
fi
else
echo "$param"
fi
}
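
# For example (as used below): get_param_or_env "$profile" "AWS_PROFILE" "Enter the AWS profile to use"
# returns the -p flag value if given, else $AWS_PROFILE if set, and otherwise prompts the user.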

await_ssm_results() {

results_directory=$1
required_file_count=$2

timeout=60 # Timeout in seconds
start_time=$(date +%s) # Record the start time

while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))

if (( elapsed_time > timeout )); then
echo "Error: Timeout reached waiting for SSM document responses to arrive. Not all files are present."
exit 1
fi

file_count=$(ls "$results_directory" | wc -l)

if (( file_count >= required_file_count )); then
break
fi

# Wait for a short period before checking again
sleep 1
done

}
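
# Note: await_ssm_results simply counts the files in the results directory, so it assumes the
# directory was freshly created for this run and only contains the per-instance result JSONs.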

sassy_selection_check() {
selection=${1^^}
if [ "$selection" != "Y" ]; then
echo "Don't Trust Scott and John? We're friends I promise, exiting"
exit 1
fi
}

concat_and_output() {

results_directory=$1
output_file=$2

# Check if the directory exists
if [ -d "$results_directory/" ]; then
# Aggregate all the individual json documents into one
cat "$results_directory"/* | jq -s '.' >> "$results_directory/$output_file"
jq . "$results_directory/$output_file"
echo "----------------------------------------"
echo "Results can be found within $results_directory"
else
echo "Results Directory $results_directory does not exist."
exit 1
fi
echo "----------------------------------------"

}

# ---------------------------------------------------------------------------------------------------
# Main script
# ---------------------------------------------------------------------------------------------------
profile=$(get_param_or_env "$profile" "AWS_PROFILE" "Enter the AWS profile to use")
region=$(get_param_or_env "$region" "AWS_REGION" "Enter the AWS region (e.g., us-west-2)")

# Define the SSM documents to execute the functions
upgrade_check_script="si-check-node-upgrade"
service_state_script="si-service-state"

export AWS_PROFILE="$profile"
export AWS_REGION="$region"

# List the running instances and filter for the upgradeable services
instances=$(list_instances | grep -E 'sdf|pinga|rebaser|veritech' )
if [ -z "$instances" ]; then
echo "No running instances found."
exit 1
fi

echo "----------------------------------------"
echo "Running instances of sdf/pinga/rebaser/veritech in the region $region:"
printf "%-5s %-20s %-20s %-20s %-20s\n" "Index" "Name" "InstanceId" "InstanceType" "PrivateIpAddress"
i=1
while read -r line; do
name=$(echo "$line" | awk '{print $1}')
instance_id=$(echo "$line" | awk '{print $2}')
instance_type=$(echo "$line" | awk '{print $3}')
private_ip=$(echo "$line" | awk '{print $4}')
printf "%-5s %-20s %-20s %-20s %-20s\n" "$i" "$name" "$instance_id" "$instance_type" "$private_ip"
((i++))
done <<< "$instances"
echo "----------------------------------------"

read -p "Would you like to see if an SI binary upgrade is available to these hosts? (Y/N) [takes ~30 seconds] " selection

sassy_selection_check "$selection"

# Set up somewhere unique to push the results of the check into if the user chose to continue.
# The results_directory variable is reset between each execution run.
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
check_results_file=check_results.json
start_results_file=start_results.json
stop_results_file=stop_results.json
upgrade_results_file=upgrade_results.json
mkdir -p "$results_directory/"

i=1
while read -r line; do
instance_id=$(echo "$line" | awk '{print $2}')
service=$(echo "$line" | awk '{print $1}' | awk -F- '{print $2}')
start_and_track_ssm_session "$instance_id" "$upgrade_check_script" "$service" "check" "$results_directory" &
((i++))
done <<< "$instances"

await_ssm_results "$results_directory" $((i - 1))

concat_and_output "$results_directory" "$check_results_file"

if jq -e 'all(.[]; .status == "success") and any(.[]; .upgradeable == "true")' "$results_directory/$check_results_file" > /dev/null; then
read -p "Would you like to push the new binaries out to the upgradeable hosts? (Y/N) " selection
sassy_selection_check "$selection"
else
echo "Error: Either none are upgradeable or one or more of the checks failed to determine whether it was possible to upgrade the node."
exit 1
fi

# For all order-1 services, upgrade them in *sequence*,
# i.e. for veritech this means that we will always have at least 50% execution capacity, as one
# node will always be functioning.
upgrade_candidates_json=$(cat "$results_directory/$check_results_file")

# reset the results_directory variable for the next set of results
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
mkdir -p "$results_directory"

upgrade_hosts_num=$(jq 'map(select(.service == "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | wc -l)
jq 'map(select(.service == "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | while read -r line; do
instance_id=$(echo "$line" | jq -r '.instance_id')
service=$(echo "$line" | jq -r '.service')
start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" # Serially
done

# Wait until all the results arrive
await_ssm_results "$results_directory" "$upgrade_hosts_num"

# Continue with the rest of the service nodes
upgrade_hosts_num=$(jq 'map(select(.service != "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | wc -l)
jq 'map(select(.service != "veritech")) | .[]' <<< $upgrade_candidates_json | jq -c '.' | while read -r line; do
instance_id=$(echo "$line" | jq -r '.instance_id')
service=$(echo "$line" | jq -r '.service')
start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" & # In Parallel
done

# Concatenate all the results together
upgrade_hosts_num=$(jq '.[]' <<< $upgrade_candidates_json | jq -c '.' | wc -l)
await_ssm_results "$results_directory" "$upgrade_hosts_num"
concat_and_output "$results_directory" "$upgrade_results_file"

echo "All active binary services have been rotated, WEB IS STILL TO BE UPDATED."
echo "If you are running in CI, this will be ran automatically for you. If you"
echo "executed this manually/locally, you need to run the below pipeline, with"
echo "either tools or production, depending which environment you're executing"
echo "this tool against with service set to web and version as stable."
echo "https://github.com/systeminit/si/actions/workflows/deploy-stack.yml"
echo "----------------------------------------"
