From 4781ef62d6d30f363a1562345deaf8d726515089 Mon Sep 17 00:00:00 2001 From: Abhinav Dahiya Date: Sun, 3 May 2020 12:09:43 -0700 Subject: [PATCH 1/2] pkg: add a diagnostics error The diagnostics error allows providing important context to provide better error reporting for the users. The error allows the installer assets etc. to provide structural information, - Source: the source of the error, the installer assets have errors from cloud providers or internal errors, the source allows providing that context to better categorize these errors. - Reason: is a single word reason that correctly summarizes the type of error, allows the users to quickly understand the type of error. It should also allow internal metrics tracking to track these error types. --- pkg/diagnostics/error.go | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 pkg/diagnostics/error.go diff --git a/pkg/diagnostics/error.go b/pkg/diagnostics/error.go new file mode 100644 index 00000000000..247eb5433bc --- /dev/null +++ b/pkg/diagnostics/error.go @@ -0,0 +1,80 @@ +package diagnostics + +import ( + "bytes" + "fmt" + "io" + "regexp" + "strings" + + "github.com/pkg/errors" +) + +// Err wraps diagnostics information for an error. +// Err allows providing information like source, reason and message +// that provides a much better user error reporting capability. +type Err struct { + Orig error + + // Source defines which entity is generating the error. + // It allows passing along information about where the error is being + // generated from. For example, the Asset. + Source string + + // Reason is a CamelCase string that summarizes the error in one word. + // It allows easy categorization of known errors. + Reason string + + // Message is a free-form string which provides important details or + // diagnostics for the error. When writing messages, make sure to keep in mind + // that the audience for message is end-users who might not be experts.
+ Message string +} + +// Unwrap allows the error to be unwrapped. +func (e *Err) Unwrap() error { return e.Orig } + +// Error returns a string representation of the Err. The returned value +// is expected to be a single line. +// The format of the error string returned is, +// `error(<reason>) from <source>: <message>: <original error>` +func (e *Err) Error() string { + buf := &bytes.Buffer{} + if len(e.Source) > 0 { + fmt.Fprintf(buf, "error(%s) from %s", e.Reason, e.Source) + } else { + fmt.Fprintf(buf, "error(%s)", e.Reason) + } + if msg := strings.TrimSpace(e.Message); len(msg) > 0 { + msg = breakre.ReplaceAllString(msg, " ") + fmt.Fprintf(buf, ": %s", msg) + } + if c := errors.Cause(e.Orig); c != nil { + fmt.Fprintf(buf, ": %s", errors.Cause(e.Orig)) + } + return buf.String() +} + +// Print prints the Err to Writer in a way that is more verbose and +// sectionalized. +// The output looks like: +// Error from "<source>" +// Reason: <reason> +// +// Message: +// <message> +// +// Original error: +// <original error> +func (e *Err) Print(w io.Writer) { + fmt.Fprintf(w, "Error from %q\n", e.Source) + fmt.Fprintf(w, "Reason: %s\n", e.Reason) + if len(e.Message) > 0 { + fmt.Fprintf(w, "\nMessage:\n") + fmt.Fprintln(w, e.Message) + } + fmt.Fprintf(w, "\nOriginal error:\n") + fmt.Fprintln(w, e.Orig) +} + +var breakre = regexp.MustCompile(`\r?\n`) From 699becad7cdfad0c6241fc18e92fd5661f777340 Mon Sep 17 00:00:00 2001 From: Abhinav Dahiya Date: Sun, 3 May 2020 12:11:52 -0700 Subject: [PATCH 2/2] pkg/terraform: add diagnostics errors for terraform apply operations The terraform errors are tracked in a buffer. This buffer is then used to match against various known conditions to understand the reasons for the errors. This now allows the terraform apply to return specific errors in these cases instead of the previous `failed to apply Terraform` constant string message.
--- pkg/terraform/diagnose.go | 81 +++++++++++++++++++++++++ pkg/terraform/diagnose_test.go | 104 +++++++++++++++++++++++++++++++++ pkg/terraform/terraform.go | 25 ++++---- 3 files changed, 196 insertions(+), 14 deletions(-) create mode 100644 pkg/terraform/diagnose.go create mode 100644 pkg/terraform/diagnose_test.go diff --git a/pkg/terraform/diagnose.go b/pkg/terraform/diagnose.go new file mode 100644 index 00000000000..8fe5fccd3e6 --- /dev/null +++ b/pkg/terraform/diagnose.go @@ -0,0 +1,81 @@ +package terraform + +import ( + "regexp" + + "github.com/pkg/errors" + + "github.com/openshift/installer/pkg/diagnostics" +) + +// Diagnose accepts an error from terraform runs and tries to diagnose the +// underlying cause. +func Diagnose(message string) error { + for _, cand := range conditions { + if cand.match.MatchString(message) { + return &diagnostics.Err{ + Source: "Infrastructure Provider", + Reason: cand.reason, + Message: cand.message, + } + } + } + + return errors.New("failed to complete the change") +} + +type condition struct { + match *regexp.Regexp + + reason string + message string +} + +// conditions is a list of matches for the error string from terraform, +// with specific matches at the top and generic matches at the bottom.
+var conditions = []condition{{ + match: regexp.MustCompile(`Error: Error creating Blob .*: Error copy/waiting`), + + reason: "Timeout", + message: `Copying the VHD to user environment was too slow, and timeout was reached for the success.`, +}, { + match: regexp.MustCompile(`Error: Error Creating/Updating Subnet .*: network.SubnetsClient#CreateOrUpdate: .* Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress`), + + reason: "AzureMultiOperationFailure", + message: `Creating Subnets failed because Azure could not process multiple operations.`, +}, { + match: regexp.MustCompile(`Error: Error Creating/Updating Public IP .*: network.PublicIPAddressesClient#CreateOrUpdate: .* Code="PublicIPCountLimitReached" Message="Cannot create more than .* public IP addresses for this subscription in this region`), + + reason: "AzureQuotaLimitExceeded", + message: `Service limits exceeded for Public IPs in the the subscriptions for the region. Requesting increase in quota should fix the error.`, +}, { + match: regexp.MustCompile(`Error: compute\.VirtualMachinesClient#CreateOrUpdate: .* Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota`), + + reason: "AzureQuotaLimitExceeded", + message: `Service limits exceeded for Virtual Machine cores in the the subscriptions for the region. Requesting increase in quota should fix the error.`, +}, { + match: regexp.MustCompile(`Error: Code="OSProvisioningTimedOut"`), + + reason: "AzureVirtualMachineFailure", + message: `Some virtual machines failed to provision in alloted time. 
Virtual machines can fail to provision if the bootstap virtual machine has failing services.`, +}, { + match: regexp.MustCompile(`Status=404 Code="ResourceGroupNotFound"`), + + reason: "AzureEventualConsistencyFailure", + message: `Failed to find a resource that was recently created usualy caused by Azure's eventual consistency delays.`, +}, { + match: regexp.MustCompile(`Error: Error applying IAM policy to project .*: Too many conflicts`), + + reason: "GCPTooManyIAMUpdatesInFlight", + message: `There are a lot of IAM updates to the project in flight. Failed after reaching a limit of read-modify-write on conflict backoffs.`, +}, { + match: regexp.MustCompile(`Error: .*: googleapi: Error 503: .*, backendError`), + + reason: "GCPBackendInternalError", + message: `GCP is experiencing backend service interuptions. Please try again or contact Google Support`, +}, { + match: regexp.MustCompile(`Error: Error waiting for instance to create: Internal error`), + + reason: "GCPComputeBackendTimeout", + message: `GCP is experiencing backend service interuptions, the compute instance failed to create in reasonable time.`, +}} diff --git a/pkg/terraform/diagnose_test.go b/pkg/terraform/diagnose_test.go new file mode 100644 index 00000000000..0e72393ceca --- /dev/null +++ b/pkg/terraform/diagnose_test.go @@ -0,0 +1,104 @@ +package terraform + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDiagnose(t *testing.T) { + cases := []struct { + input string + err string + }{{ + input: `Error: Error creating Blob "rhcoskltwa.vhd" (Container "vhd" / Account "clusterkltwa"): Error copy/waiting: + on ../tmp/openshift-install-348626978/main.tf line 169, in resource "azurerm_storage_blob" "rhcos_image":" + 169: resource "azurerm_storage_blob" "rhcos_image" { +`, + err: `error\(Timeout\) from Infrastructure Provider: Copying the VHD to user environment was too slow, and timeout was reached for the success\.`, + }, { + input: `Error: Error Creating/Updating Subnet 
"xxxx-master-subnet" (Virtual Network "xxxx-vnet" / Resource Group "xxxx-rg"): network.SubnetsClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. Status= Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress. To retrieve status of the operation use uri: https://management.azure.com/subscriptions/d38f1e38-4bed-438e-b227-833f997adf6a/providers/Microsoft.Network/locations/eastus2/operations/62c8a417-7168-464f-83e6-96912bd6b30a?api-version=2019-09-01." Details=[] + + on ../tmp/openshift-install-513947104/vnet/vnet.tf line 10, in resource "azurerm_subnet" "master_subnet":" + 10: resource "azurerm_subnet" "master_subnet" { +`, + err: `error\(AzureMultiOperationFailure\) from Infrastructure Provider: Creating Subnets failed because Azure could not process multiple operations\.`, + }, { + input: `Error: Error Creating/Updating Public IP "xxxx-bootstrap-pip-v4" (Resource Group "xxxx-rg"): network.PublicIPAddressesClient#CreateOrUpdate: Failure sending request: StatusCode=400 -- Original Error: Code="PublicIPCountLimitReached" Message="Cannot create more than 50 public IP addresses for this subscription in this region." Details=[] + + on ../tmp/openshift-install-172932975/bootstrap/main.tf line 65, in resource "azurerm_public_ip" "bootstrap_public_ip_v4": + 65: resource "azurerm_public_ip" "bootstrap_public_ip_v4" { +`, + + err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Public IPs in the the subscriptions for the region. Requesting increase in quota should fix the error\.`, + }, { + input: `Error: Code="OSProvisioningTimedOut" Message="OS Provisioning for VM 'xxxx-master-2' did not finish in the allotted time. The VM may still finish provisioning successfully. Please check provisioning state later. 
Also, make sure the image has been properly prepared (generalized).\\r\\n * Instructions for Windows: https://azure.microsoft.com/documentation/articles/virtual-machines-windows-upload-image/ \\r\\n * Instructions for Linux: https://azure.microsoft.com/documentation/articles/virtual-machines-linux-capture-image/ " + + on ../tmp/openshift-install-172932975/master/master.tf line 81, in resource "azurerm_virtual_machine" "master": + 81: resource "azurerm_virtual_machine" "master" { +`, + + err: `error\(AzureVirtualMachineFailure\) from Infrastructure Provider: Some virtual machines failed to provision in alloted time`, + }, { + input: ` +Error: Error waiting for instance to create: Internal error. Please try again or contact Google Support. (Code: '8712799794455203922') + + + on ../tmp/openshift-install-910996711/master/main.tf line 31, in resource "google_compute_instance" "master": + 31: resource "google_compute_instance" "master" { +`, + + err: `error\(GCPComputeBackendTimeout\) from Infrastructure Provider: GCP is experiencing backend service interuptions, the compute instance failed to create in reasonable time\.`, + }, { + input: `Error: Error reading Service Account "projects/project-id/serviceAccounts/xxxx-m@project-id.iam.gserviceaccount.com": googleapi: Error 503: The service is currently unavailable., backendError`, + + err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interuptions. Please try again or contact Google Support`, + }, { + input: ` +Error: Error adding instances to InstanceGroup: googleapi: Error 503: Internal error. Please try again or contact Google Support. 
(Code: 'xxxx'), backendError + + on ../tmp/openshift-install-267295217/bootstrap/main.tf line 87, in resource "google_compute_instance_group" "bootstrap": + 87: resource "google_compute_instance_group" "bootstrap" { +`, + + err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interuptions. Please try again or contact Google Support`, + }, { + input: ` +Error: Error applying IAM policy to project "project-id": Too many conflicts. Latest error: Error setting IAM policy for project "project-id": googleapi: Error 409: There were concurrent policy changes. Please retry the whole read-modify-write with exponential backoff., aborted + + on ../tmp/openshift-install-392130810/master/main.tf line 26, in resource "google_project_iam_member" "master-service-account-user": + 26: resource "google_project_iam_member" "master-service-account-user" { +`, + + err: `error\(GCPTooManyIAMUpdatesInFlight\) from Infrastructure Provider: There are a lot of IAM updates to the project in flight. Failed after reaching a limit of read-modify-write on conflict backoffs\.`, + }, { + input: ` +Error: Error retrieving resource group: resources.GroupsClient#Get: Failure responding to request: StatusCode=404 -- Original Error: autorest/azure: Service returned an error. Status=404 Code="ResourceGroupNotFound" Message="Resource group 'xxxxx-rg' could not be found." + + on ../tmp/openshift-install-424775273/main.tf line 124, in resource "azurerm_resource_group" "main": + 124: resource "azurerm_resource_group" "main" { +`, + + err: `error\(AzureEventualConsistencyFailure\) from Infrastructure Provider: Failed to find a resource that was recently created usualy caused by Azure's eventual consistency delays\.`, + }, { + input: ` +Error: compute.VirtualMachinesClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. 
Status= Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: centralus, Current Limit: 200, Current Usage: 198, Additional Required: 8, (Minimum) New Limit Required: 206. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/?#create/Microsoft.Support/Parameters/%7B%22subId%22:%225f675811-04fa-483f-9709-ffd8a9da03f0%22,%22pesId%22:%2206bfd9d3-516b-d5c6-5802-169c800dec89%22,%22supportTopicId%22:%22e12e3d1d-7fa0-af33-c6d0-3c50df9658a3%22%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests." + + on ../../../../tmp/openshift-install-941329162/master/master.tf line 81, in resource "azurerm_virtual_machine" "master": + 81: resource "azurerm_virtual_machine" "master" { +`, + + err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Virtual Machine cores in the the subscriptions for the region\. 
Requesting increase in quota should fix the error\.`, + }} + + for _, test := range cases { + t.Run("", func(t *testing.T) { + err := Diagnose(test.input) + if test.err == "" { + assert.NoError(t, err) + } else { + assert.Regexp(t, test.err, err) + } + }) + } +} diff --git a/pkg/terraform/terraform.go b/pkg/terraform/terraform.go index 70ea3e8871f..7d680df6222 100644 --- a/pkg/terraform/terraform.go +++ b/pkg/terraform/terraform.go @@ -1,7 +1,9 @@ package terraform import ( + "bytes" "fmt" + "io" "os" "path/filepath" "runtime" @@ -43,15 +45,14 @@ func Apply(dir string, platform string, extraArgs ...string) (path string, err e args = append(args, dir) sf := filepath.Join(dir, StateFileName) - tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug} - tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error} - lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print} - lpError := &lineprinter.LinePrinter{Print: tError.Print} + lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print} + lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print} defer lpDebug.Close() defer lpError.Close() - if exitCode := texec.Apply(dir, args, lpDebug, lpError); exitCode != 0 { - return sf, errors.New("failed to apply using Terraform") + errBuf := &bytes.Buffer{} + if exitCode := texec.Apply(dir, args, lpDebug, io.MultiWriter(errBuf, lpError)); exitCode != 0 { + return sf, errors.Wrap(Diagnose(errBuf.String()), "failed to apply Terraform") } return sf, nil } @@ -74,10 +75,8 @@ func Destroy(dir string, platform string, extraArgs ...string) (err error) { args := append(defaultArgs, extraArgs...) 
args = append(args, dir) - tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug} - tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error} - lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print} - lpError := &lineprinter.LinePrinter{Print: tError.Print} + lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print} + lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print} defer lpDebug.Close() defer lpError.Close() @@ -115,10 +114,8 @@ func unpackAndInit(dir string, platform string) (err error) { return errors.Wrap(err, "failed to setup embedded Terraform plugins") } - tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug} - tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error} - lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print} - lpError := &lineprinter.LinePrinter{Print: tError.Print} + lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print} + lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print} defer lpDebug.Close() defer lpError.Close()