From a0a5cd9fc177b8c5820c4483286edefa63ce2d04 Mon Sep 17 00:00:00 2001 From: Will Moore Date: Mon, 12 Apr 2021 21:29:07 -0700 Subject: [PATCH] Add instance update and reactivation Adds functionality to perform Bottlerocket version upgrades and reactivates drained instances after successful upgrade. [Issue: https://github.com/bottlerocket-os/bottlerocket-ecs-updater/issues/9] --- updater/aws.go | 80 ++++++++++++++++++++++++++++++++----------------- updater/main.go | 77 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 124 insertions(+), 33 deletions(-) diff --git a/updater/aws.go b/updater/aws.go index cd78eba..94e7b06 100644 --- a/updater/aws.go +++ b/updater/aws.go @@ -8,6 +8,7 @@ import ( "strings" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ecs" "github.com/aws/aws-sdk-go/service/ssm" ) @@ -155,7 +156,7 @@ func (u *updater) activateInstance(containerInstance *string) error { if len(resp.Failures) != 0 { return fmt.Errorf("Container instance %s failed to activate: %#v", aws.StringValue(containerInstance), resp.Failures) } - log.Printf("Container instance state changed to ACTIVE") + log.Printf("Container instance %s state changed to ACTIVE", aws.StringValue(containerInstance)) return nil } @@ -209,37 +210,60 @@ func (u *updater) sendCommand(instanceIDs []string, ssmCommand string) (string, return commandID, nil } -func (u *updater) checkSSMCommandOutput(commandID string, instanceIDs []string) ([]string, error) { - updateCandidates := make([]string, 0) - for _, v := range instanceIDs { - resp, err := u.ssm.GetCommandInvocation(&ssm.GetCommandInvocationInput{ - CommandId: aws.String(commandID), - InstanceId: aws.String(v), - }) - if err != nil { - return nil, fmt.Errorf("failed to retrieve command invocation output: %#v", err) - } +func (u *updater) getCommandResult(commandID string, instanceID string) ([]byte, error) { + resp, err := u.ssm.GetCommandInvocation(&ssm.GetCommandInvocationInput{ + CommandId: aws.String(commandID), + InstanceId: aws.String(instanceID), + }) + if err != nil { + return nil, fmt.Errorf("failed to retrieve command invocation output: %#v", err) + } + commandResults := []byte(aws.StringValue(resp.StandardOutputContent)) + return commandResults, nil +} - type updateCheckResult struct { - UpdateState string `json:"update_state"` - } +func isUpdateAvailable(commandOutput []byte) (bool, string) { + type updateCheckResult struct { + UpdateState string `json:"update_state"` + } - var result updateCheckResult - err = json.Unmarshal([]byte(*resp.StandardOutputContent), &result) - if err != nil { - log.Printf("failed to unmarshal command invocation output: %#v", err) - } - log.Println("update_state: ", result) + var updateState updateCheckResult + err := json.Unmarshal(commandOutput, &updateState) + if err != nil { + log.Printf("failed to unmarshal update state: %#v", err) + return false, "no data" + } + return updateState.UpdateState == "Available", "" +} - switch result.UpdateState { - case "Available": - updateCandidates = append(updateCandidates, v) - } +// getActiveVersion unmarshals GetCommandInvocation output to determine the active version of a Bottlerocket instance. +// Takes an SSM Command ID and EC2ID as parameters and returns the active version in use. +func getActiveVersion(commandOutput []byte) (string, error) { + type version struct { + Version string `json:"version"` + } + + type image struct { + Image version `json:"image"` } - if len(updateCandidates) == 0 { - log.Printf("No instances to update") - return nil, nil + type partition struct { + ActivePartition image `json:"active_partition"` } - return updateCandidates, nil + + var activeVersion partition + err := json.Unmarshal(commandOutput, &activeVersion) + if err != nil { + log.Printf("failed to unmarshal command invocation output: %#v", err) + return "", err + } + versionInUse := activeVersion.ActivePartition.Image.Version + return versionInUse, nil +} + +// waitUntilOk takes an EC2 ID as a parameter and waits until the specified EC2 instance is in an Ok status. +func (u *updater) waitUntilOk(ec2ID string) error { + return u.ec2.WaitUntilInstanceStatusOk(&ec2.DescribeInstanceStatusInput{ + InstanceIds: []*string{aws.String(ec2ID)}, + }) } diff --git a/updater/main.go b/updater/main.go index 5667137..2b482e0 100644 --- a/updater/main.go +++ b/updater/main.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ecs" "github.com/aws/aws-sdk-go/service/ssm" ) @@ -22,6 +23,7 @@ type updater struct { cluster string ecs ECSAPI ssm *ssm.SSM + ec2 *ec2.EC2 } func main() { @@ -50,6 +52,7 @@ func _main() error { cluster: *flagCluster, ecs: ecs.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), ssm: ssm.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), + ec2: ec2.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), } listedInstances, err := u.listContainerInstances() @@ -78,16 +81,22 @@ func _main() error { return err } - candidates, err := u.checkSSMCommandOutput(commandID, instances) - if err != nil { - return err + candidates := make([]string, 0) + for _, ec2ID := range instances { + commandOutput, err := u.getCommandResult(commandID, ec2ID) + if err != nil { + return err + } + updateState, _ := isUpdateAvailable(commandOutput) + if updateState { + candidates = append(candidates, ec2ID) + } } - if len(candidates) == 0 { log.Printf("No instances to update") return nil } - fmt.Println("Instances ready for update: ", candidates) + log.Print("Instances ready for update: ", candidates) for ec2ID, containerInstance := range ec2IDtoECSARN { err := u.drain(containerInstance) @@ -96,6 +105,64 @@ func _main() error { continue } log.Printf("Instance %s drained", ec2ID) + + ec2IDs := []string{ec2ID} + _, err = u.sendCommand(ec2IDs, "apiclient update apply --reboot") + if err != nil { + // TODO add nuanced error checking to determine the type of failure, act accordingly. + log.Printf("%#v", err) + err2 := u.activateInstance(aws.String(containerInstance)) + if err2 != nil { + log.Printf("failed to reactivate %s after failure to execute update command. Aborting update operations.", ec2ID) + return err2 + } + continue + } + + err = u.waitUntilOk(ec2ID) + if err != nil { + return fmt.Errorf("instance %s failed to enter an Ok status after reboot. Aborting update operations: %#v", ec2ID, err) + } + + err = u.activateInstance(aws.String(containerInstance)) + if err != nil { + log.Printf("instance %s failed to return to ACTIVE after reboot. Aborting update operations.", ec2ID) + return err + } + + updateStatus, err := u.sendCommand(ec2IDs, "apiclient update check") + if err != nil { + log.Printf("%#v", err) + continue + } + + updateResult, err := u.getCommandResult(updateStatus, ec2ID) + if err != nil { + log.Printf("%#v", err) + continue + } + + // TODO replace with version before and after comparison. + updateState, data := isUpdateAvailable(updateResult) + if updateState { + log.Printf("Instance %s did not update. Manual update advised.", ec2ID) + continue + } else if data != "" { + log.Printf("Unable to determine update result. Manual verification of %s required", ec2ID) + continue + } else { + log.Printf("Instance %s updated successfully", ec2ID) + } + + updatedVersion, err := getActiveVersion(updateResult) + if err != nil { + log.Printf("%#v", err) + } + if len(updatedVersion) != 0 { + log.Printf("Instance %s running Bottlerocket: %s", ec2ID, updatedVersion) + } else { + log.Printf("Unable to verify active version. Manual verification of %s required.", ec2ID) + } } return nil }