From 684d92c91faf7daaaadef71b3a128cd2b8eb4401 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 12 Dec 2019 16:01:08 -0800 Subject: [PATCH 1/4] cmd/openshift-install/gather: Recognize "connection refused" Before this commit, bootstrap machines that failed to come up would look like [1]: level=info msg="Waiting up to 30m0s for the Kubernetes API at https://api.ci-op-6266tp8r-77109.origin-ci-int-aws.dev.rhcloud.com:6443..." level=error msg="Attempted to gather ClusterOperator status after installation failure: listing ClusterOperator objects: Get https://api.ci-op-6266tp8r-77109.origin-ci-int-aws.dev.rhcloud.com:6443/apis/config.openshift.io/v1/clusteroperators: dial tcp 3.221.214.197:6443: connect: connection refused" level=info msg="Pulling debug logs from the bootstrap machine" level=error msg="Attempted to gather debug logs after installation failure: failed to create SSH client, ensure the proper ssh key is in your keyring or specify with --key: dial tcp 3.84.188.207:22: connect: connection refused" level=fatal msg="Bootstrap failed to complete: waiting for Kubernetes API: context deadline exceeded" With this commit, that last error will look like: level=error msg="Attempted to gather debug logs after installation failure: failed to connect to the bootstrap machine: dial tcp 3.84.188.207:22: connect: connection refused" without the unrelated (to this failure mode) distraction about SSH keys. 
[1]: https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/logs/release-openshift-origin-installer-e2e-aws-upgrade/12076 --- cmd/openshift-install/gather.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmd/openshift-install/gather.go b/cmd/openshift-install/gather.go index decce1a4154..272bf73db15 100644 --- a/cmd/openshift-install/gather.go +++ b/cmd/openshift-install/gather.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "strings" + "syscall" "time" configv1 "github.com/openshift/api/config/v1" @@ -117,11 +118,15 @@ func runGatherBootstrapCmd(directory string) error { func logGatherBootstrap(bootstrap string, port int, masters []string, directory string) error { logrus.Info("Pulling debug logs from the bootstrap machine") client, err := ssh.NewClient("core", fmt.Sprintf("%s:%d", bootstrap, port), gatherBootstrapOpts.sshKeys) - if err != nil && len(gatherBootstrapOpts.sshKeys) == 0 { - return errors.Wrap(err, "failed to create SSH client, ensure the proper ssh key is in your keyring or specify with --key") - } else if err != nil { + if err != nil { + if errno, ok := err.(syscall.Errno); ok && errno == syscall.ECONNREFUSED { + return errors.Wrap(err, "failed to connect to the bootstrap machine") + } else if len(gatherBootstrapOpts.sshKeys) == 0 { + return errors.Wrap(err, "failed to create SSH client, ensure the proper ssh key is in your keyring or specify with --key") + } return errors.Wrap(err, "failed to create SSH client") } + gatherID := time.Now().Format("20060102150405") if err := ssh.Run(client, fmt.Sprintf("/usr/local/bin/installer-gather.sh --id %s %s", gatherID, strings.Join(masters, " "))); err != nil { return errors.Wrap(err, "failed to run remote command") From 5fe12ab45c88df1cd9d0c6388ea6927505928f10 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 12 Dec 2019 16:23:03 -0800 Subject: [PATCH 2/4] cmd/openshift-install/gather: Gather bootstrap console logs If we can't reach the bootstrap machine via SSH. 
Before this commit, we would occasionally see connection issues like [1]: level=info msg="Waiting up to 30m0s for the Kubernetes API at https://api.ci-op-6266tp8r-77109.origin-ci-int-aws.dev.rhcloud.com:6443..." level=error msg="Attempted to gather ClusterOperator status after installation failure: listing ClusterOperator objects: Get https://api.ci-op-6266tp8r-77109.origin-ci-int-aws.dev.rhcloud.com:6443/apis/config.openshift.io/v1/clusteroperators: dial tcp 3.221.214.197:6443: connect: connection refused" level=info msg="Pulling debug logs from the bootstrap machine" level=error msg="Attempted to gather debug logs after installation failure: failed to create SSH client, ensure the proper ssh key is in your keyring or specify with --key: dial tcp 3.84.188.207:22: connect: connection refused" level=fatal msg="Bootstrap failed to complete: waiting for Kubernetes API: context deadline exceeded" With this commit, when we see those connection-refused errors, we attempt to retrieve console logs for the bootstrap instance. This will make it easier for users and developers to see why the machine failed to come up. It should be especially useful in continuous integration when bumping RHCOS boot images [2], when such boot-time failures are more likely. I've only implemented it on AWS for the moment, but I've set it up so we can extend it to other platforms going forward. 
[1]: https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/logs/release-openshift-origin-installer-e2e-aws-upgrade/12076 [2]: https://github.com/openshift/installer/pull/2777#issuecomment-565237752 --- cmd/openshift-install/gather.go | 74 ++++++++++++++++++++++++++------- pkg/gather/aws/OWNERS | 7 ++++ pkg/gather/aws/console.go | 68 ++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 16 deletions(-) create mode 100644 pkg/gather/aws/OWNERS create mode 100644 pkg/gather/aws/console.go diff --git a/cmd/openshift-install/gather.go b/cmd/openshift-install/gather.go index 272bf73db15..e3d837f4396 100644 --- a/cmd/openshift-install/gather.go +++ b/cmd/openshift-install/gather.go @@ -3,6 +3,7 @@ package main import ( "context" "fmt" + "io/ioutil" "os" "path/filepath" "strings" @@ -19,13 +20,14 @@ import ( "github.com/openshift/installer/pkg/asset/installconfig" assetstore "github.com/openshift/installer/pkg/asset/store" + gatheraws "github.com/openshift/installer/pkg/gather/aws" "github.com/openshift/installer/pkg/gather/ssh" "github.com/openshift/installer/pkg/terraform" - gatheraws "github.com/openshift/installer/pkg/terraform/gather/aws" - gatherazure "github.com/openshift/installer/pkg/terraform/gather/azure" - gathergcp "github.com/openshift/installer/pkg/terraform/gather/gcp" - gatherlibvirt "github.com/openshift/installer/pkg/terraform/gather/libvirt" - gatheropenstack "github.com/openshift/installer/pkg/terraform/gather/openstack" + terraformgatheraws "github.com/openshift/installer/pkg/terraform/gather/aws" + terraformgatherazure "github.com/openshift/installer/pkg/terraform/gather/azure" + terraformgathergcp "github.com/openshift/installer/pkg/terraform/gather/gcp" + terraformgatherlibvirt "github.com/openshift/installer/pkg/terraform/gather/libvirt" + terraformgatheropenstack "github.com/openshift/installer/pkg/terraform/gather/openstack" "github.com/openshift/installer/pkg/types" awstypes "github.com/openshift/installer/pkg/types/aws" 
azuretypes "github.com/openshift/installer/pkg/types/azure" @@ -112,7 +114,17 @@ func runGatherBootstrapCmd(directory string) error { return errors.Wrapf(err, "failed to get bootstrap and control plane host addresses from %q", tfStateFilePath) } - return logGatherBootstrap(bootstrap, port, masters, directory) + err = logGatherBootstrap(bootstrap, port, masters, directory) + if err != nil { + if errno, ok := errors.Cause(err).(syscall.Errno); ok && errno == syscall.ECONNREFUSED { + err2 := gatherConsoleLogs(context.TODO(), config, bootstrap, directory) + if err2 != nil { + logrus.Error(err2) + } + } + } + + return err } func logGatherBootstrap(bootstrap string, port int, masters []string, directory string) error { @@ -139,51 +151,81 @@ func logGatherBootstrap(bootstrap string, port int, masters []string, directory return nil } +func gatherConsoleLogs(ctx context.Context, installConfig *installconfig.InstallConfig, ip string, directory string) error { + var data []byte + platform := installConfig.Config.Platform.Name() + switch platform { + case awstypes.Name: + session, err := installConfig.AWS.Session(ctx) + if err != nil { + return err + } + + data, err = gatheraws.ConsoleLogs(ctx, session, ip) + if err != nil { + return err + } + default: + logrus.Debugf("Unable to gather console logs on %q", platform) + return nil + } + + gatherID := time.Now().Format("20060102150405") + file := filepath.Join(directory, fmt.Sprintf("bootstrap-%s-console.log", gatherID)) + err := ioutil.WriteFile(file, data, 0666) + if err != nil { + return err + } + + logrus.Infof("Bootstrap gather logs captured here %q", file) + return nil +} + func extractHostAddresses(config *types.InstallConfig, tfstate *terraform.State) (bootstrap string, port int, masters []string, err error) { port = 22 switch config.Platform.Name() { case awstypes.Name: - bootstrap, err = gatheraws.BootstrapIP(tfstate) + bootstrap, err = terraformgatheraws.BootstrapIP(tfstate) if err != nil { return bootstrap, port, 
masters, err } - masters, err = gatheraws.ControlPlaneIPs(tfstate) + masters, err = terraformgatheraws.ControlPlaneIPs(tfstate) if err != nil { logrus.Error(err) } case azuretypes.Name: - bootstrap, err = gatherazure.BootstrapIP(tfstate) + bootstrap, err = terraformgatherazure.BootstrapIP(tfstate) if err != nil { return bootstrap, port, masters, err } - masters, err = gatherazure.ControlPlaneIPs(tfstate) + masters, err = terraformgatherazure.ControlPlaneIPs(tfstate) if err != nil { logrus.Error(err) } case gcptypes.Name: - bootstrap, err = gathergcp.BootstrapIP(tfstate) + bootstrap, err = terraformgathergcp.BootstrapIP(tfstate) if err != nil { return bootstrap, port, masters, err } - masters, err = gathergcp.ControlPlaneIPs(tfstate) + masters, err = terraformgathergcp.ControlPlaneIPs(tfstate) if err != nil { logrus.Error(err) } case libvirttypes.Name: - bootstrap, err = gatherlibvirt.BootstrapIP(tfstate) + bootstrap, err = terraformgatherlibvirt.BootstrapIP(tfstate) if err != nil { return bootstrap, port, masters, err } - masters, err = gatherlibvirt.ControlPlaneIPs(tfstate) + masters, err = terraformgatherlibvirt.ControlPlaneIPs(tfstate) if err != nil { logrus.Error(err) } case openstacktypes.Name: - bootstrap, err = gatheropenstack.BootstrapIP(tfstate) + bootstrap, err = terraformgatheropenstack.BootstrapIP(tfstate) if err != nil { return bootstrap, port, masters, err } - masters, err = gatheropenstack.ControlPlaneIPs(tfstate) + masters, err = terraformgatheropenstack.ControlPlaneIPs(tfstate) if err != nil { logrus.Error(err) } diff --git a/pkg/gather/aws/OWNERS b/pkg/gather/aws/OWNERS new file mode 100644 index 00000000000..6e59d685aa6 --- /dev/null +++ b/pkg/gather/aws/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md +# This file just uses aliases defined in OWNERS_ALIASES. 
+ +approvers: + - aws-approvers +reviewers: + - aws-reviewers diff --git a/pkg/gather/aws/console.go b/pkg/gather/aws/console.go new file mode 100644 index 00000000000..91c3454b131 --- /dev/null +++ b/pkg/gather/aws/console.go @@ -0,0 +1,68 @@ +// Package aws provides AWS-specific tools for gathering debugging information. +package aws + +import ( + "context" + "encoding/base64" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/ec2" + "github.com/pkg/errors" +) + +// ConsoleLogs retrieves console logs from the AWS instance with the +// given IP address. +func ConsoleLogs(ctx context.Context, session *session.Session, ip string) ([]byte, error) { + client := ec2.New(session) + var instanceID string + err := client.DescribeInstancesPagesWithContext( + ctx, + &ec2.DescribeInstancesInput{ + Filters: []*ec2.Filter{{ + Name: aws.String("ip-address"), + Values: []*string{&ip}, + }}, + }, + func(results *ec2.DescribeInstancesOutput, lastPage bool) bool { + for _, reservation := range results.Reservations { + for _, instance := range reservation.Instances { + if instance.InstanceId != nil { + instanceID = *instance.InstanceId + return false + } + } + } + + return !lastPage + }, + ) + if err != nil { + return nil, errors.Wrap(err, "describe instances") + } + + if instanceID == "" { + return nil, errors.Errorf("unable to find an AWS instance ID for %q", ip) + } + + consoleOutput, err := client.GetConsoleOutputWithContext( + ctx, + &ec2.GetConsoleOutputInput{ + InstanceId: &instanceID, + Latest: aws.Bool(true), + }, + ) + if err != nil { + return nil, errors.Wrapf(err, "get console output for %s", instanceID) + } + if consoleOutput.Output == nil { + return nil, errors.Errorf("nil console output for %s", instanceID) + } + + data, err := base64.StdEncoding.DecodeString(*consoleOutput.Output) + if err != nil { + return nil, errors.Wrapf(err, "decoding console output for %s", instanceID) + } + + return data, nil +} 
From 1ac911d583ac3f604b6b6ae4007ff0416b8b1b3f Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 12 Dec 2019 18:10:07 -0800 Subject: [PATCH 3/4] WIP: DEBUG --- cmd/openshift-install/create.go | 2 +- cmd/openshift-install/gather.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/openshift-install/create.go b/cmd/openshift-install/create.go index f9ae4c6bb39..7b866ef609e 100644 --- a/cmd/openshift-install/create.go +++ b/cmd/openshift-install/create.go @@ -95,7 +95,7 @@ var ( } err = waitForBootstrapComplete(ctx, config, rootOpts.dir) - if err != nil { + if err != nil || true { if err2 := logClusterOperatorConditions(ctx, config); err2 != nil { logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err2) } diff --git a/cmd/openshift-install/gather.go b/cmd/openshift-install/gather.go index e3d837f4396..8d2a3441956 100644 --- a/cmd/openshift-install/gather.go +++ b/cmd/openshift-install/gather.go @@ -115,7 +115,7 @@ func runGatherBootstrapCmd(directory string) error { } err = logGatherBootstrap(bootstrap, port, masters, directory) - if err != nil { + if err != nil || true { if errno, ok := errors.Cause(err).(syscall.Errno); ok && errno == syscall.ECONNREFUSED { err2 := gatherConsoleLogs(context.TODO(), config, bootstrap, directory) if err2 != nil { From d92d9e88dd6c773b3d2b5f651bf8290d8ed162c5 Mon Sep 17 00:00:00 2001 From: "W. 
Trevor King" Date: Thu, 12 Dec 2019 23:26:26 -0800 Subject: [PATCH 4/4] WIP: More debugging hacks --- cmd/openshift-install/gather.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/openshift-install/gather.go b/cmd/openshift-install/gather.go index 8d2a3441956..0faba1a5156 100644 --- a/cmd/openshift-install/gather.go +++ b/cmd/openshift-install/gather.go @@ -116,12 +116,12 @@ func runGatherBootstrapCmd(directory string) error { err = logGatherBootstrap(bootstrap, port, masters, directory) if err != nil || true { - if errno, ok := errors.Cause(err).(syscall.Errno); ok && errno == syscall.ECONNREFUSED { + // if errno, ok := errors.Cause(err).(syscall.Errno); ok && errno == syscall.ECONNREFUSED { err2 := gatherConsoleLogs(context.TODO(), config, bootstrap, directory) if err2 != nil { logrus.Error(err2) } - } + // } } return err