diff --git a/pkg/diagnostics/error.go b/pkg/diagnostics/error.go
new file mode 100644
index 00000000000..247eb5433bc
--- /dev/null
+++ b/pkg/diagnostics/error.go
@@ -0,0 +1,92 @@
+package diagnostics
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"regexp"
+	"strings"
+
+	"github.com/pkg/errors"
+)
+
+// Err wraps diagnostics information for an error.
+// Err allows providing information like source, reason and message
+// that provides a much better user error reporting capability.
+type Err struct {
+	// Orig is the underlying error, if any. It may be nil.
+	Orig error
+
+	// Source defines which entity is generating the error.
+	// It allows passing along information about where the error is being
+	// generated from, for example, the Asset.
+	Source string
+
+	// Reason is a CamelCase string that summarizes the error in one word.
+	// It allows easy categorization of known errors.
+	Reason string
+
+	// Message is a free-form string which provides important details or
+	// diagnostics for the error. When writing messages, make sure to keep in mind
+	// that the audience for message is end-users who might not be experts.
+	Message string
+}
+
+// Unwrap allows the error to be unwrapped.
+func (e *Err) Unwrap() error { return e.Orig }
+
+// Error returns a string representation of the Err.
+// The format of the error string returned is,
+// `error(<reason>) from <source>: <message>: <original>`
+// The " from <source>" part is omitted when Source is empty, and the
+// message and original parts are omitted when they are blank or nil.
+// Each line break inside Message is replaced with a single space.
+func (e *Err) Error() string {
+	buf := &bytes.Buffer{}
+	if len(e.Source) > 0 {
+		fmt.Fprintf(buf, "error(%s) from %s", e.Reason, e.Source)
+	} else {
+		fmt.Fprintf(buf, "error(%s)", e.Reason)
+	}
+	if msg := strings.TrimSpace(e.Message); len(msg) > 0 {
+		msg = breakre.ReplaceAllString(msg, " ")
+		fmt.Fprintf(buf, ": %s", msg)
+	}
+	// Only the root cause of Orig is reported, not every wrapping layer.
+	if c := errors.Cause(e.Orig); c != nil {
+		fmt.Fprintf(buf, ": %s", c)
+	}
+	return buf.String()
+}
+
+// Print prints the Err to Writer in a way that is more verbose and
+// sectionalized.
+// The output looks like:
+//
+//	Error from "<source>"
+//	Reason: <reason>
+//
+//	Message:
+//	<message>
+//
+//	Original error:
+//	<original>
+//
+// The Message section is omitted when Message is empty, and the Original
+// error section is omitted when Orig is nil.
+func (e *Err) Print(w io.Writer) {
+	fmt.Fprintf(w, "Error from %q\n", e.Source)
+	fmt.Fprintf(w, "Reason: %s\n", e.Reason)
+	if len(e.Message) > 0 {
+		fmt.Fprintf(w, "\nMessage:\n")
+		fmt.Fprintln(w, e.Message)
+	}
+	// Orig is optional; skip the section instead of printing "<nil>".
+	if e.Orig != nil {
+		fmt.Fprintf(w, "\nOriginal error:\n")
+		fmt.Fprintln(w, e.Orig)
+	}
+}
+
+// breakre matches a line break, with or without a preceding carriage return.
+var breakre = regexp.MustCompile(`\r?\n`)
diff --git a/pkg/terraform/diagnose.go b/pkg/terraform/diagnose.go
new file mode 100644
index 00000000000..8fe5fccd3e6
--- /dev/null
+++ b/pkg/terraform/diagnose.go
@@ -0,0 +1,89 @@
+package terraform
+
+import (
+	"regexp"
+
+	"github.com/pkg/errors"
+
+	"github.com/openshift/installer/pkg/diagnostics"
+)
+
+// Diagnose accepts the error output from a terraform run and tries to
+// diagnose the underlying cause.
+// The message is matched against the known conditions in order; the first
+// match is returned as a *diagnostics.Err carrying that condition's reason
+// and message. When nothing matches, a generic error is returned, so the
+// result is never nil.
+func Diagnose(message string) error {
+	for _, cand := range conditions {
+		if cand.match.MatchString(message) {
+			return &diagnostics.Err{
+				Source:  "Infrastructure Provider",
+				Reason:  cand.reason,
+				Message: cand.message,
+			}
+		}
+	}
+
+	return errors.New("failed to complete the change")
+}
+
+// condition pairs a pattern for the terraform error output with the
+// diagnosis reported when that pattern matches.
+type condition struct {
+	// match is tested against the complete error output.
+	match *regexp.Regexp
+
+	// reason and message become Reason and Message of the diagnostics.Err.
+	reason  string
+	message string
+}
+
+// conditions is a list of matches for the error string from terraform.
+// Specific matches go on the top, generic matches on the bottom.
+var conditions = []condition{{
+	match: regexp.MustCompile(`Error: Error creating Blob .*: Error copy/waiting`),
+
+	reason:  "Timeout",
+	message: `Copying the VHD to user environment was too slow, and timeout was reached for the success.`,
+}, {
+	match: regexp.MustCompile(`Error: Error Creating/Updating Subnet .*: network\.SubnetsClient#CreateOrUpdate: .* Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress`),
+
+	reason:  "AzureMultiOperationFailure",
+	message: `Creating Subnets failed because Azure could not process multiple operations.`,
+}, {
+	match: regexp.MustCompile(`Error: Error Creating/Updating Public IP .*: network\.PublicIPAddressesClient#CreateOrUpdate: .* Code="PublicIPCountLimitReached" Message="Cannot create more than .* public IP addresses for this subscription in this region`),
+
+	reason:  "AzureQuotaLimitExceeded",
+	message: `Service limits exceeded for Public IPs in the subscriptions for the region. Requesting increase in quota should fix the error.`,
+}, {
+	match: regexp.MustCompile(`Error: compute\.VirtualMachinesClient#CreateOrUpdate: .* Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota`),
+
+	reason:  "AzureQuotaLimitExceeded",
+	message: `Service limits exceeded for Virtual Machine cores in the subscriptions for the region. Requesting increase in quota should fix the error.`,
+}, {
+	match: regexp.MustCompile(`Error: Code="OSProvisioningTimedOut"`),
+
+	reason:  "AzureVirtualMachineFailure",
+	message: `Some virtual machines failed to provision in allotted time. Virtual machines can fail to provision if the bootstrap virtual machine has failing services.`,
+}, {
+	match: regexp.MustCompile(`Status=404 Code="ResourceGroupNotFound"`),
+
+	reason:  "AzureEventualConsistencyFailure",
+	message: `Failed to find a resource that was recently created usually caused by Azure's eventual consistency delays.`,
+}, {
+	match: regexp.MustCompile(`Error: Error applying IAM policy to project .*: Too many conflicts`),
+
+	reason:  "GCPTooManyIAMUpdatesInFlight",
+	message: `There are a lot of IAM updates to the project in flight. Failed after reaching a limit of read-modify-write on conflict backoffs.`,
+}, {
+	match: regexp.MustCompile(`Error: .*: googleapi: Error 503: .*, backendError`),
+
+	reason:  "GCPBackendInternalError",
+	message: `GCP is experiencing backend service interruptions. Please try again or contact Google Support`,
+}, {
+	match: regexp.MustCompile(`Error: Error waiting for instance to create: Internal error`),
+
+	reason:  "GCPComputeBackendTimeout",
+	message: `GCP is experiencing backend service interruptions, the compute instance failed to create in reasonable time.`,
+}}
diff --git a/pkg/terraform/diagnose_test.go b/pkg/terraform/diagnose_test.go
new file mode 100644
index 00000000000..0e72393ceca
--- /dev/null
+++ b/pkg/terraform/diagnose_test.go
@@ -0,0 +1,105 @@
+package terraform
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestDiagnose(t *testing.T) {
+	cases := []struct {
+		input string
+		err   string
+	}{{
+		input: `Error: Error creating Blob "rhcoskltwa.vhd" (Container "vhd" / Account "clusterkltwa"): Error copy/waiting:
+  on ../tmp/openshift-install-348626978/main.tf line 169, in resource "azurerm_storage_blob" "rhcos_image":"
+ 169: resource "azurerm_storage_blob" "rhcos_image" {
+`,
+		err: `error\(Timeout\) from Infrastructure Provider: Copying the VHD to user environment was too slow, and timeout was reached for the success\.`,
+	}, {
+		input: `Error: Error Creating/Updating Subnet "xxxx-master-subnet" (Virtual Network "xxxx-vnet" / Resource Group "xxxx-rg"): network.SubnetsClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. Status= Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress. To retrieve status of the operation use uri: https://management.azure.com/subscriptions/d38f1e38-4bed-438e-b227-833f997adf6a/providers/Microsoft.Network/locations/eastus2/operations/62c8a417-7168-464f-83e6-96912bd6b30a?api-version=2019-09-01." Details=[]
+
+  on ../tmp/openshift-install-513947104/vnet/vnet.tf line 10, in resource "azurerm_subnet" "master_subnet":"
+  10: resource "azurerm_subnet" "master_subnet" {
+`,
+		err: `error\(AzureMultiOperationFailure\) from Infrastructure Provider: Creating Subnets failed because Azure could not process multiple operations\.`,
+	}, {
+		input: `Error: Error Creating/Updating Public IP "xxxx-bootstrap-pip-v4" (Resource Group "xxxx-rg"): network.PublicIPAddressesClient#CreateOrUpdate: Failure sending request: StatusCode=400 -- Original Error: Code="PublicIPCountLimitReached" Message="Cannot create more than 50 public IP addresses for this subscription in this region." Details=[]
+
+  on ../tmp/openshift-install-172932975/bootstrap/main.tf line 65, in resource "azurerm_public_ip" "bootstrap_public_ip_v4":
+  65: resource "azurerm_public_ip" "bootstrap_public_ip_v4" {
+`,
+
+		err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Public IPs in the subscriptions for the region. Requesting increase in quota should fix the error\.`,
+	}, {
+		input: `Error: Code="OSProvisioningTimedOut" Message="OS Provisioning for VM 'xxxx-master-2' did not finish in the allotted time. The VM may still finish provisioning successfully. Please check provisioning state later. Also, make sure the image has been properly prepared (generalized).\\r\\n * Instructions for Windows: https://azure.microsoft.com/documentation/articles/virtual-machines-windows-upload-image/ \\r\\n * Instructions for Linux: https://azure.microsoft.com/documentation/articles/virtual-machines-linux-capture-image/ "
+
+  on ../tmp/openshift-install-172932975/master/master.tf line 81, in resource "azurerm_virtual_machine" "master":
+  81: resource "azurerm_virtual_machine" "master" {
+`,
+
+		err: `error\(AzureVirtualMachineFailure\) from Infrastructure Provider: Some virtual machines failed to provision in allotted time`,
+	}, {
+		input: `
+Error: Error waiting for instance to create: Internal error. Please try again or contact Google Support. (Code: '8712799794455203922')
+
+
+  on ../tmp/openshift-install-910996711/master/main.tf line 31, in resource "google_compute_instance" "master":
+  31: resource "google_compute_instance" "master" {
+`,
+
+		err: `error\(GCPComputeBackendTimeout\) from Infrastructure Provider: GCP is experiencing backend service interruptions, the compute instance failed to create in reasonable time\.`,
+	}, {
+		input: `Error: Error reading Service Account "projects/project-id/serviceAccounts/xxxx-m@project-id.iam.gserviceaccount.com": googleapi: Error 503: The service is currently unavailable., backendError`,
+
+		err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interruptions. Please try again or contact Google Support`,
+	}, {
+		input: `
+Error: Error adding instances to InstanceGroup: googleapi: Error 503: Internal error. Please try again or contact Google Support. (Code: 'xxxx'), backendError
+
+  on ../tmp/openshift-install-267295217/bootstrap/main.tf line 87, in resource "google_compute_instance_group" "bootstrap":
+  87: resource "google_compute_instance_group" "bootstrap" {
+`,
+
+		err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interruptions. Please try again or contact Google Support`,
+	}, {
+		input: `
+Error: Error applying IAM policy to project "project-id": Too many conflicts. Latest error: Error setting IAM policy for project "project-id": googleapi: Error 409: There were concurrent policy changes. Please retry the whole read-modify-write with exponential backoff., aborted
+
+  on ../tmp/openshift-install-392130810/master/main.tf line 26, in resource "google_project_iam_member" "master-service-account-user":
+  26: resource "google_project_iam_member" "master-service-account-user" {
+`,
+
+		err: `error\(GCPTooManyIAMUpdatesInFlight\) from Infrastructure Provider: There are a lot of IAM updates to the project in flight. Failed after reaching a limit of read-modify-write on conflict backoffs\.`,
+	}, {
+		input: `
+Error: Error retrieving resource group: resources.GroupsClient#Get: Failure responding to request: StatusCode=404 -- Original Error: autorest/azure: Service returned an error. Status=404 Code="ResourceGroupNotFound" Message="Resource group 'xxxxx-rg' could not be found."
+
+  on ../tmp/openshift-install-424775273/main.tf line 124, in resource "azurerm_resource_group" "main":
+ 124: resource "azurerm_resource_group" "main" {
+`,
+
+		err: `error\(AzureEventualConsistencyFailure\) from Infrastructure Provider: Failed to find a resource that was recently created usually caused by Azure's eventual consistency delays\.`,
+	}, {
+		input: `
+Error: compute.VirtualMachinesClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. Status= Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: centralus, Current Limit: 200, Current Usage: 198, Additional Required: 8, (Minimum) New Limit Required: 206. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/?#create/Microsoft.Support/Parameters/%7B%22subId%22:%225f675811-04fa-483f-9709-ffd8a9da03f0%22,%22pesId%22:%2206bfd9d3-516b-d5c6-5802-169c800dec89%22,%22supportTopicId%22:%22e12e3d1d-7fa0-af33-c6d0-3c50df9658a3%22%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests."
+
+  on ../../../../tmp/openshift-install-941329162/master/master.tf line 81, in resource "azurerm_virtual_machine" "master":
+  81: resource "azurerm_virtual_machine" "master" {
+`,
+
+		err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Virtual Machine cores in the subscriptions for the region\. Requesting increase in quota should fix the error\.`,
+	}, {
+		input: `Error: something the installer does not recognize`,
+
+		err: `^failed to complete the change$`,
+	}}
+
+	for _, test := range cases {
+		t.Run("", func(t *testing.T) {
+			// Diagnose always returns a non-nil error, so every case is
+			// checked against its expected error pattern.
+			assert.Regexp(t, test.err, Diagnose(test.input))
+		})
+	}
+}
diff --git a/pkg/terraform/terraform.go b/pkg/terraform/terraform.go
index 70ea3e8871f..7d680df6222 100644
--- a/pkg/terraform/terraform.go
+++ b/pkg/terraform/terraform.go
@@ -1,7 +1,9 @@
 package terraform
 
 import (
+	"bytes"
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -43,15 +45,14 @@ func Apply(dir string, platform string, extraArgs ...string) (path string, err e
 	args = append(args, dir)
 	sf := filepath.Join(dir, StateFileName)
 
-	tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
-	tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
-	lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
-	lpError := &lineprinter.LinePrinter{Print: tError.Print}
+	lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
+	lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
 	defer lpDebug.Close()
 	defer lpError.Close()
 
-	if exitCode := texec.Apply(dir, args, lpDebug, lpError); exitCode != 0 {
-		return sf, errors.New("failed to apply using Terraform")
+	errBuf := &bytes.Buffer{}
+	if exitCode := texec.Apply(dir, args, lpDebug, io.MultiWriter(errBuf, lpError)); exitCode != 0 {
+		return sf, errors.Wrap(Diagnose(errBuf.String()), "failed to apply Terraform")
 	}
 	return sf, nil
 }
@@ -74,10 +75,8 @@ func Destroy(dir string, platform string, extraArgs ...string) (err error) {
 	args := append(defaultArgs, extraArgs...)
 	args = append(args, dir)
 
-	tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
-	tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
-	lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
-	lpError := &lineprinter.LinePrinter{Print: tError.Print}
+	lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
+	lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
 	defer lpDebug.Close()
 	defer lpError.Close()
 
@@ -115,10 +114,8 @@ func unpackAndInit(dir string, platform string) (err error) {
 		return errors.Wrap(err, "failed to setup embedded Terraform plugins")
 	}
 
-	tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
-	tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
-	lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
-	lpError := &lineprinter.LinePrinter{Print: tError.Print}
+	lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
+	lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
 	defer lpDebug.Close()
 	defer lpError.Close()
 