From 930fb22b52f57efd4a428bd6d8eaed6819df1433 Mon Sep 17 00:00:00 2001 From: Will Moore Date: Mon, 12 Apr 2021 21:29:07 -0700 Subject: [PATCH] Add instance update and reactivation Adds functionality to perform Bottlerocket version upgrades and reactivates drained instances after successful upgrade. [Issue: https://github.com/bottlerocket-os/bottlerocket-ecs-updater/issues/9] --- updater/aws.go | 79 +++++++++++++++++++++++++++++----------------- updater/main.go | 84 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 130 insertions(+), 33 deletions(-) diff --git a/updater/aws.go b/updater/aws.go index cd78eba..adf8f9b 100644 --- a/updater/aws.go +++ b/updater/aws.go @@ -8,6 +8,7 @@ import ( "strings" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ecs" "github.com/aws/aws-sdk-go/service/ssm" ) @@ -155,7 +156,7 @@ func (u *updater) activateInstance(containerInstance *string) error { if len(resp.Failures) != 0 { return fmt.Errorf("Container instance %s failed to activate: %#v", aws.StringValue(containerInstance), resp.Failures) } - log.Printf("Container instance state changed to ACTIVE") + log.Printf("Container instance %s state changed to ACTIVE", aws.StringValue(containerInstance)) return nil } @@ -209,37 +210,59 @@ func (u *updater) sendCommand(instanceIDs []string, ssmCommand string) (string, return commandID, nil } -func (u *updater) checkSSMCommandOutput(commandID string, instanceIDs []string) ([]string, error) { - updateCandidates := make([]string, 0) - for _, v := range instanceIDs { - resp, err := u.ssm.GetCommandInvocation(&ssm.GetCommandInvocationInput{ - CommandId: aws.String(commandID), - InstanceId: aws.String(v), - }) - if err != nil { - return nil, fmt.Errorf("failed to retrieve command invocation output: %#v", err) - } +func (u *updater) getCommandResult(commandID string, instanceID string) ([]byte, error) { + resp, err := u.ssm.GetCommandInvocation(&ssm.GetCommandInvocationInput{ + CommandId: aws.String(commandID), + InstanceId: aws.String(instanceID), + }) + if err != nil { + return nil, fmt.Errorf("failed to retrieve command invocation output: %#v", err) + } + commandResults := []byte(aws.StringValue(resp.StandardOutputContent)) + return commandResults, nil +} - type updateCheckResult struct { - UpdateState string `json:"update_state"` - } +func isUpdateAvailable(commandOutput []byte) (bool, error) { + type updateCheckResult struct { + UpdateState string `json:"update_state"` + } - var result updateCheckResult - err = json.Unmarshal([]byte(*resp.StandardOutputContent), &result) - if err != nil { - log.Printf("failed to unmarshal command invocation output: %#v", err) - } - log.Println("update_state: ", result) + var updateState updateCheckResult + err := json.Unmarshal(commandOutput, &updateState) + if err != nil { + return false, fmt.Errorf("failed to unmarshal update state: %#v", err) + } + return updateState.UpdateState == "Available", nil +} - switch result.UpdateState { - case "Available": - updateCandidates = append(updateCandidates, v) - } +// getActiveVersion unmarshals GetCommandInvocation output to determine the active version of a Bottlerocket instance. +// Takes GetCommandInvocation output as a parameter and returns the active version in use. +func getActiveVersion(commandOutput []byte) (string, error) { + type version struct { + Version string `json:"version"` + } + + type image struct { + Image version `json:"image"` } - if len(updateCandidates) == 0 { - log.Printf("No instances to update") - return nil, nil + type partition struct { + ActivePartition image `json:"active_partition"` } - return updateCandidates, nil + + var activeVersion partition + err := json.Unmarshal(commandOutput, &activeVersion) + if err != nil { + log.Printf("failed to unmarshal command invocation output: %#v", err) + return "", err + } + versionInUse := activeVersion.ActivePartition.Image.Version + return versionInUse, nil +} + +// waitUntilOk takes an EC2 ID as a parameter and waits until the specified EC2 instance is in an Ok status. +func (u *updater) waitUntilOk(ec2ID string) error { + return u.ec2.WaitUntilInstanceStatusOk(&ec2.DescribeInstanceStatusInput{ + InstanceIds: []*string{aws.String(ec2ID)}, + }) } diff --git a/updater/main.go b/updater/main.go index 5667137..9dce1d5 100644 --- a/updater/main.go +++ b/updater/main.go @@ -9,6 +9,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ecs" "github.com/aws/aws-sdk-go/service/ssm" ) @@ -22,6 +23,7 @@ type updater struct { cluster string ecs ECSAPI ssm *ssm.SSM + ec2 *ec2.EC2 } func main() { @@ -50,6 +52,7 @@ func _main() error { cluster: *flagCluster, ecs: ecs.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), ssm: ssm.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), + ec2: ec2.New(sess, aws.NewConfig().WithLogLevel(aws.LogDebugWithHTTPBody)), } listedInstances, err := u.listContainerInstances() @@ -78,16 +81,27 @@ func _main() error { return err } - candidates, err := u.checkSSMCommandOutput(commandID, instances) - if err != nil { - return err - } + candidates := make([]string, 0) + for _, ec2ID := range instances { + commandOutput, err := u.getCommandResult(commandID, ec2ID) + if err != nil { + return err + } + updateState, err := isUpdateAvailable(commandOutput) + if err != nil { + return err + } + + if updateState { + candidates = append(candidates, ec2ID) + } + } if len(candidates) == 0 { log.Printf("No instances to update") return nil } - fmt.Println("Instances ready for update: ", candidates) + log.Print("Instances ready for update: ", candidates) for ec2ID, containerInstance := range ec2IDtoECSARN { err := u.drain(containerInstance) @@ -96,6 +110,66 @@ func _main() error { continue } log.Printf("Instance %s drained", ec2ID) + + ec2IDs := []string{ec2ID} + _, err = u.sendCommand(ec2IDs, "apiclient update apply --reboot") + if err != nil { + // TODO add nuanced error checking to determine the type of failure, act accordingly. + log.Printf("%#v", err) + err2 := u.activateInstance(aws.String(containerInstance)) + if err2 != nil { + log.Printf("failed to reactivate %s after failure to execute update command. Aborting update operations.", ec2ID) + return err2 + } + continue + } + + err = u.waitUntilOk(ec2ID) + if err != nil { + return fmt.Errorf("instance %s failed to enter an Ok status after reboot. Aborting update operations: %#v", ec2ID, err) + } + + err = u.activateInstance(aws.String(containerInstance)) + if err != nil { + log.Printf("instance %s failed to return to ACTIVE after reboot. Aborting update operations.", ec2ID) + return err + } + + updateStatus, err := u.sendCommand(ec2IDs, "apiclient update check") + if err != nil { + log.Printf("%#v", err) + continue + } + + updateResult, err := u.getCommandResult(updateStatus, ec2ID) + if err != nil { + log.Printf("%#v", err) + continue + } + + // TODO version before and after comparison. + updateState, err := isUpdateAvailable(updateResult) + if err != nil { + log.Printf("Unable to determine update result. Manual verification of %s required", ec2ID) + continue + } + + if updateState { + log.Printf("Instance %s did not update. Manual update advised.", ec2ID) + continue + } else { + log.Printf("Instance %s updated successfully", ec2ID) + } + + updatedVersion, err := getActiveVersion(updateResult) + if err != nil { + log.Printf("%#v", err) + } + if len(updatedVersion) != 0 { + log.Printf("Instance %s running Bottlerocket: %s", ec2ID, updatedVersion) + } else { + log.Printf("Unable to verify active version. Manual verification of %s required.", ec2ID) + } } return nil }