Merge pull request #4275 from systeminit/feat/add-deployment-magic
Locally executing deployment toolbox
Showing 4 changed files with 321 additions and 2 deletions.
@@ -47,3 +47,6 @@ bin/lang-js/docs/

# direnv
.direnv

# toolbox
component/toolbox/results/
@@ -0,0 +1,314 @@
#!/bin/bash
# ---------------------------------------------------------------------------------------------------
# Identify all the machines which are upgrade-able via this method.
# SSM/shells onto all the boxes to check whether an upgrade is available, then offers to upgrade
# all the endpoints to the latest stable version if an update is available.
# Every time an SSM command is executed against a host, a record of it is pushed into a
# ./results/<uuid>/<result>.json file; once all the commands are completed, an aggregated file is
# created in that directory too. If the JSON output from the SSM executions is not enough to debug,
# just look in AWS and you'll see the whole execution history under SSM Command Execution History.
# ---------------------------------------------------------------------------------------------------
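# For illustration only: the per-instance record written by the error path below, and the fields the
# final jq checks rely on, suggest a shape roughly like this (the exact fields come from the SSM
# document's output, and the values here are made up):
#   {"instance_id": "i-0123456789abcdef0", "status": "success", "service": "sdf", "upgradeable": "true"}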

# Stop immediately if anything goes wrong, let's not create too much
# mess if John's shell is poor
set -eo pipefail

usage() {
  echo
  echo "upgrade"
  echo "----------------------------------"
  echo "This script will open an SSM session to all available"
  echo "nodes in the region and will check whether they have an"
  echo "upgrade available; if so, the user can proceed and upgrade"
  echo "them all in parallel."
  echo "----------------------------------"
  echo "Usage: upgrade [-p profile] [-r region]"
  echo "  -p profile   AWS profile to use"
  echo "  -r region    AWS region to use"
  echo
  exit 1
}
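# Example invocation (assuming the script is saved as ./upgrade; profile and region are placeholders):
#   ./upgrade -p <aws-profile> -r <aws-region>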

# Add a check to see if the script is being sourced or executed
if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
  usage
fi

# Parse flags
while getopts ":p:r:" opt; do
  case ${opt} in
    p)
      profile=$OPTARG
      ;;
    r)
      region=$OPTARG
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      usage
      ;;
  esac
done

# Function to list EC2 instances with their Name tag
list_instances() {
  aws ec2 describe-instances --query 'Reservations[*].Instances[?State.Name==`running`].[Tags[?Key==`Name`].Value | [0],InstanceId,InstanceType,PrivateIpAddress]' --output text
}
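# Illustrative output of list_instances (one line per running instance; these values are invented):
#   tools-sdf-1    i-0123456789abcdef0    m6i.large    10.0.1.23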

# Function to check command status
check_ssm_command_status() {
  status=$(aws ssm list-command-invocations \
    --command-id "$command_id" \
    --details \
    | jq -r '.CommandInvocations[0].Status')
  echo "$status"
}
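# Note: check_ssm_command_status reads $command_id set by its caller (start_and_track_ssm_session)
# rather than taking it as an argument.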

# Function to start SSM session
start_and_track_ssm_session() {

  instance_id=$1
  script=$2
  service=$3
  action=$4
  results_directory=$5

  # '|| status=$?' keeps 'set -e' from aborting before we can record the failure below
  status=0
  output=$(aws ssm send-command --instance-ids "$instance_id" --document-name "$script" --parameters "Service=$service,InstanceId=$instance_id,Action=$action" 2>&1) || status=$?

  if [ $status -ne 0 ]; then
    output="{\"instance_id\": \"$instance_id\", \"status\": \"error\", \"service\": \"$service\", \"message\": \"$output\"}"
    echo "$output" > "$results_directory/$instance_id.json"
    return
  fi

  command_id=$(echo "$output" | jq -r '.Command.CommandId')

  # Poll for command status with a timeout of 60 seconds
  timeout=60
  elapsed=0
  interval=1

  while [ $elapsed -lt $timeout ]; do
    status=$(check_ssm_command_status)

    if [ "$status" == "Success" ] || [ "$status" == "Failed" ] || [ "$status" == "TimedOut" ] || [ "$status" == "Cancelled" ]; then
      break
    fi

    sleep $interval
    elapsed=$((elapsed + interval))
  done

  # Check if command was successful
  if [ "$status" == "Success" ]; then
    # Get the output
    output=$(aws ssm get-command-invocation \
      --command-id "$command_id" \
      --instance-id "$instance_id" \
      | jq -r '.StandardOutputContent')
    echo "$output" > "$results_directory/$instance_id.json"
  else
    echo "Command failed with status: $status"
    exit_code=$(aws ssm get-command-invocation \
      --command-id "$command_id" \
      --instance-id "$instance_id" \
      | jq -r '.ResponseCode')

    echo "Exit code: $exit_code"
    echo "Failure message:"
    aws ssm get-command-invocation \
      --command-id "$command_id" \
      --instance-id "$instance_id" \
      | jq -r '.StandardErrorContent'
  fi

}

# Function to get input or use environment variable
get_param_or_env() {
  local param=$1
  local env_var=$2
  local prompt=$3

  if [ -z "$param" ]; then
    if [ -z "${!env_var}" ]; then
      read -p "$prompt: " value
      echo "$value"
    else
      echo "${!env_var}"
    fi
  else
    echo "$param"
  fi
}

await_ssm_results() {

  results_directory=$1
  required_file_count=$2

  timeout=60             # Timeout in seconds
  start_time=$(date +%s) # Record the start time

  while true; do
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))

    if (( elapsed_time > timeout )); then
      echo "Error: Timeout reached waiting for SSM document responses to arrive. Not all files are present."
      exit 1
    fi

    file_count=$(ls "$results_directory" | wc -l)

    if (( file_count >= required_file_count )); then
      break
    fi

    # Wait for a short period before checking again
    sleep 1
  done

}

sassy_selection_check() {
  selection=${1^^}
  if [ "$selection" != "Y" ]; then
    echo "Don't Trust Scott and John? We're friends I promise, exiting"
    exit 1
  fi
}

concat_and_output() {

  results_directory=$1
  output_file=$2

  # Check if the directory exists
  if [ -d "$results_directory/" ]; then
    # Aggregate all the individual json documents into one
    cat "$results_directory"/* | jq -s '.' >> "$results_directory/$output_file"
    cat "$results_directory/$output_file" | jq
    echo "----------------------------------------"
    echo "Results can be found within $results_directory"
  else
    echo "Results Directory $results_directory does not exist."
    exit 1
  fi
  echo "----------------------------------------"

}

# ---------------------------------------------------------------------------------------------------
# Main script
# ---------------------------------------------------------------------------------------------------
profile=$(get_param_or_env "$profile" "AWS_PROFILE" "Enter the AWS profile to use")
region=$(get_param_or_env "$region" "AWS_REGION" "Enter the AWS region (e.g., us-west-2)")

# Define the SSM documents to execute the functions
upgrade_check_script="si-check-node-upgrade"
service_state_script="si-service-state"

export AWS_PROFILE="$profile"
export AWS_REGION="$region"

# List instances with fixed-width columns and filter for the upgradeable instances
# ('|| true' keeps 'set -e' from exiting before the empty check when grep matches nothing)
instances=$(list_instances | grep -E 'sdf|pinga|rebaser|veritech' || true)
if [ -z "$instances" ]; then
  echo "No running instances found."
  exit 1
fi

echo "----------------------------------------"
echo "Running instances of sdf/pinga/rebaser/veritech in the region $region:"
printf "%-5s %-20s %-20s %-20s %-20s\n" "Index" "Name" "InstanceId" "InstanceType" "PrivateIpAddress"
i=1
while read -r line; do
  name=$(echo "$line" | awk '{print $1}')
  instance_id=$(echo "$line" | awk '{print $2}')
  instance_type=$(echo "$line" | awk '{print $3}')
  private_ip=$(echo "$line" | awk '{print $4}')
  printf "%-5s %-20s %-20s %-20s %-20s\n" "$i" "$name" "$instance_id" "$instance_type" "$private_ip"
  ((i++))
done <<< "$instances"
echo "----------------------------------------"

read -p "Would you like to see if an SI binary upgrade is available to these hosts? (Y/N) [takes ~30 seconds] " selection

sassy_selection_check "$selection"

# Set up somewhere unique to push the results of the check into if they chose to continue.
# Reset this results_directory variable between each execution run.
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
check_results_file=check_results.json
start_results_file=start_results.json
stop_results_file=stop_results.json
upgrade_results_file=upgrade_results.json
mkdir -p "$results_directory/"

i=1
while read -r line; do
  instance_id=$(echo "$line" | awk '{print $2}')
  service=$(echo "$line" | awk '{print $1}' | awk -F- '{print $2}')
  start_and_track_ssm_session "$instance_id" "$upgrade_check_script" "$service" "check" "$results_directory" &
  ((i++))
done <<< "$instances"

await_ssm_results "$results_directory" $((i - 1))

concat_and_output "$results_directory" "$check_results_file"

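# Gate the rollout: proceed only if every check reported status "success" and at least one host
# reported upgradeable "true"; otherwise there is nothing safe to do.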
if jq -e 'all(.[]; .status == "success") and any(.[]; .upgradeable == "true")' "$results_directory/$check_results_file" > /dev/null; then
  read -p "Would you like to push the new binaries out to the upgradeable hosts? (Y/N) " selection
  sassy_selection_check "$selection"
else
  echo "Error: Either no hosts are upgradeable, or one or more of the checks failed to determine whether the node could be upgraded."
  exit 1
fi

# For all order 1 services, upgrade them in *sequence*.
# i.e. for Veritech this means that we will always have at least 50% execution capacity, as one node
# will always be functioning.
upgrade_candidates_json=$(cat "$results_directory/$check_results_file")

# Reset the results_directory variable for the next set of results
results_directory="./results/$(date +"%Y-%m-%d_%H-%M-%S")"
mkdir -p "$results_directory"

upgrade_hosts_num=$(jq 'map(select(.service == "veritech")) | .[]' <<< "$upgrade_candidates_json" | jq -c '.' | wc -l)
jq 'map(select(.service == "veritech")) | .[]' <<< "$upgrade_candidates_json" | jq -c '.' | while read -r line; do
  instance_id=$(echo "$line" | jq -r '.instance_id')
  service=$(echo "$line" | jq -r '.service')
  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" # Serially
done

# Wait until all the results arrive
await_ssm_results "$results_directory" "$upgrade_hosts_num"

# Continue with the rest of the service nodes
upgrade_hosts_num=$(jq 'map(select(.service != "veritech")) | .[]' <<< "$upgrade_candidates_json" | jq -c '.' | wc -l)
jq 'map(select(.service != "veritech")) | .[]' <<< "$upgrade_candidates_json" | jq -c '.' | while read -r line; do
  instance_id=$(echo "$line" | jq -r '.instance_id')
  service=$(echo "$line" | jq -r '.service')
  start_and_track_ssm_session "$instance_id" "$service_state_script" "$service" "upgrade" "$results_directory" & # In parallel
done

# Concatenate all the results together
upgrade_hosts_num=$(jq '.[]' <<< "$upgrade_candidates_json" | jq -c '.' | wc -l)
await_ssm_results "$results_directory" "$upgrade_hosts_num"
concat_and_output "$results_directory" "$upgrade_results_file"

echo "All active binary services have been rotated; WEB IS STILL TO BE UPDATED."
echo "If you are running in CI, this will be run automatically for you. If you"
echo "executed this manually/locally, you need to run the pipeline below with"
echo "either tools or production (depending on which environment you're executing"
echo "this tool against), with service set to web and version set to stable."
echo "https://github.com/systeminit/si/actions/workflows/deploy-stack.yml"
echo "----------------------------------------"
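# A minimal sketch of inspecting the aggregated output after a run (the timestamped directory name
# is whatever this run created under ./results/, and the field names are the assumed ones noted at
# the top of this script):
#   jq '.[] | {instance_id, service, status}' ./results/<timestamp>/upgrade_results.json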