Merge pull request #3535 from abhinavdahiya/tf_diagnose
Bug 1837564: pkg/terraform: add diagnostics errors for terraform apply operations
openshift-merge-robot authored May 20, 2020
2 parents 64529d5 + 699beca commit 254680f
Showing 4 changed files with 276 additions and 14 deletions.
80 changes: 80 additions & 0 deletions pkg/diagnostics/error.go
@@ -0,0 +1,80 @@
package diagnostics

import (
"bytes"
"fmt"
"io"
"regexp"
"strings"

"github.com/pkg/errors"
)

// Err wraps diagnostic information for an error.
// Err allows providing information such as the source, reason, and message,
// which enables much better error reporting for the user.
type Err struct {
Orig error

// Source defines which entity is generating the error.
// It allows passing along information about where the error is being
// generated from, for example, the Asset.
Source string

// Reason is a CamelCase string that summarizes the error in one word.
// It allows easy categorization of known errors.
Reason string

// Message is a free-form string that provides important details or
// diagnostics for the error. When writing messages, keep in mind
// that the audience is end-users who might not be experts.
Message string
}

// Unwrap allows the error to be unwrapped.
func (e *Err) Unwrap() error { return e.Orig }

// Error returns a string representation of the Err. The returned value
// is expected to be a single line.
// The format of the returned error string is
// `error(<Reason>) from <Source>: <Message>: <Cause of Orig>`
func (e *Err) Error() string {
buf := &bytes.Buffer{}
if len(e.Source) > 0 {
fmt.Fprintf(buf, "error(%s) from %s", e.Reason, e.Source)
} else {
fmt.Fprintf(buf, "error(%s)", e.Reason)
}
if msg := strings.TrimSpace(e.Message); len(msg) > 0 {
msg = breakre.ReplaceAllString(msg, " ")
fmt.Fprintf(buf, ": %s", msg)
}
if c := errors.Cause(e.Orig); c != nil {
fmt.Fprintf(buf, ": %s", c)
}
return buf.String()
}

// Print writes the Err to the Writer in a more verbose, sectioned format.
// The output looks like:
// Error from "<Source>"
// Reason: <Reason>
//
// Message:
// <Message>
//
// Original error:
// <Orig>
func (e *Err) Print(w io.Writer) {
fmt.Fprintf(w, "Error from %q\n", e.Source)
fmt.Fprintf(w, "Reason: %s\n", e.Reason)
if len(e.Message) > 0 {
fmt.Fprintf(w, "\nMessage:\n")
fmt.Fprintln(w, e.Message)
}
fmt.Fprintf(w, "\nOriginal error:\n")
fmt.Fprintln(w, e.Orig)
}

var breakre = regexp.MustCompile(`\r?\n`)
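
For reference, a minimal usage sketch (not part of this commit; the wrapped error and field values below are hypothetical) showing how the Err type renders its single-line and sectioned forms:

package main

import (
	"fmt"
	"os"

	"github.com/pkg/errors"

	"github.com/openshift/installer/pkg/diagnostics"
)

func main() {
	// Hypothetical values chosen only to illustrate the formatting.
	err := &diagnostics.Err{
		Orig:    errors.New("exit status 1"),
		Source:  "Infrastructure Provider",
		Reason:  "Timeout",
		Message: "Copying the VHD to the user environment was too slow.",
	}

	// Single-line form, e.g.:
	// error(Timeout) from Infrastructure Provider: Copying the VHD to the user environment was too slow.: exit status 1
	fmt.Println(err.Error())

	// Sectioned form, written to stderr.
	err.Print(os.Stderr)
}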
81 changes: 81 additions & 0 deletions pkg/terraform/diagnose.go
@@ -0,0 +1,81 @@
package terraform

import (
"regexp"

"github.com/pkg/errors"

"github.com/openshift/installer/pkg/diagnostics"
)

// Diagnose accepts the error output from terraform runs and tries to
// diagnose the underlying cause.
func Diagnose(message string) error {
for _, cand := range conditions {
if cand.match.MatchString(message) {
return &diagnostics.Err{
Source: "Infrastructure Provider",
Reason: cand.reason,
Message: cand.message,
}
}
}

return errors.New("failed to complete the change")
}

type condition struct {
match *regexp.Regexp

reason string
message string
}

// conditions is a list of matches for the error strings from terraform.
// Specific matches go at the top, generic matches at the bottom.
var conditions = []condition{{
match: regexp.MustCompile(`Error: Error creating Blob .*: Error copy/waiting`),

reason: "Timeout",
message: `Copying the VHD to the user environment was too slow, and the operation timed out before it succeeded.`,
}, {
match: regexp.MustCompile(`Error: Error Creating/Updating Subnet .*: network.SubnetsClient#CreateOrUpdate: .* Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress`),

reason: "AzureMultiOperationFailure",
message: `Creating Subnets failed because Azure could not process multiple operations.`,
}, {
match: regexp.MustCompile(`Error: Error Creating/Updating Public IP .*: network.PublicIPAddressesClient#CreateOrUpdate: .* Code="PublicIPCountLimitReached" Message="Cannot create more than .* public IP addresses for this subscription in this region`),

reason: "AzureQuotaLimitExceeded",
message: `Service limits exceeded for Public IPs in the subscription for the region. Requesting an increase in quota should fix the error.`,
}, {
match: regexp.MustCompile(`Error: compute\.VirtualMachinesClient#CreateOrUpdate: .* Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota`),

reason: "AzureQuotaLimitExceeded",
message: `Service limits exceeded for Virtual Machine cores in the subscription for the region. Requesting an increase in quota should fix the error.`,
}, {
match: regexp.MustCompile(`Error: Code="OSProvisioningTimedOut"`),

reason: "AzureVirtualMachineFailure",
message: `Some virtual machines failed to provision in the allotted time. Virtual machines can fail to provision if the bootstrap virtual machine has failing services.`,
}, {
match: regexp.MustCompile(`Status=404 Code="ResourceGroupNotFound"`),

reason: "AzureEventualConsistencyFailure",
message: `Failed to find a resource that was recently created, usually caused by Azure's eventual consistency delays.`,
}, {
match: regexp.MustCompile(`Error: Error applying IAM policy to project .*: Too many conflicts`),

reason: "GCPTooManyIAMUpdatesInFlight",
message: `There are a lot of IAM updates to the project in flight. Failed after reaching the limit of read-modify-write conflict backoffs.`,
}, {
match: regexp.MustCompile(`Error: .*: googleapi: Error 503: .*, backendError`),

reason: "GCPBackendInternalError",
message: `GCP is experiencing backend service interruptions. Please try again or contact Google Support.`,
}, {
match: regexp.MustCompile(`Error: Error waiting for instance to create: Internal error`),

reason: "GCPComputeBackendTimeout",
message: `GCP is experiencing backend service interruptions, and the compute instance failed to create in a reasonable time.`,
}}
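
A short sketch of how a caller might feed raw terraform error output to Diagnose (standalone illustration, not part of this commit; the input string is fabricated to hit the OSProvisioningTimedOut pattern):

package main

import (
	"fmt"

	"github.com/openshift/installer/pkg/terraform"
)

func main() {
	// Fabricated terraform stderr; only the fragment matching a condition matters.
	stderr := `Error: Code="OSProvisioningTimedOut" Message="OS Provisioning for VM 'master-0' did not finish in the allotted time."`

	// Diagnose walks the conditions table top to bottom and returns a
	// *diagnostics.Err for the first pattern that matches, or a generic error.
	err := terraform.Diagnose(stderr)
	fmt.Println(err)
	// error(AzureVirtualMachineFailure) from Infrastructure Provider: Some virtual machines failed to provision in the allotted time. ...
}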
104 changes: 104 additions & 0 deletions pkg/terraform/diagnose_test.go
@@ -0,0 +1,104 @@
package terraform

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestDiagnose(t *testing.T) {
cases := []struct {
input string
err string
}{{
input: `Error: Error creating Blob "rhcoskltwa.vhd" (Container "vhd" / Account "clusterkltwa"): Error copy/waiting:
on ../tmp/openshift-install-348626978/main.tf line 169, in resource "azurerm_storage_blob" "rhcos_image":"
169: resource "azurerm_storage_blob" "rhcos_image" {
`,
err: `error\(Timeout\) from Infrastructure Provider: Copying the VHD to the user environment was too slow, and the operation timed out before it succeeded\.`,
}, {
input: `Error: Error Creating/Updating Subnet "xxxx-master-subnet" (Virtual Network "xxxx-vnet" / Resource Group "xxxx-rg"): network.SubnetsClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. Status=<nil> Code="AnotherOperationInProgress" Message="Another operation on this or dependent resource is in progress. To retrieve status of the operation use uri: https://management.azure.com/subscriptions/d38f1e38-4bed-438e-b227-833f997adf6a/providers/Microsoft.Network/locations/eastus2/operations/62c8a417-7168-464f-83e6-96912bd6b30a?api-version=2019-09-01." Details=[]
on ../tmp/openshift-install-513947104/vnet/vnet.tf line 10, in resource "azurerm_subnet" "master_subnet":"
10: resource "azurerm_subnet" "master_subnet" {
`,
err: `error\(AzureMultiOperationFailure\) from Infrastructure Provider: Creating Subnets failed because Azure could not process multiple operations\.`,
}, {
input: `Error: Error Creating/Updating Public IP "xxxx-bootstrap-pip-v4" (Resource Group "xxxx-rg"): network.PublicIPAddressesClient#CreateOrUpdate: Failure sending request: StatusCode=400 -- Original Error: Code="PublicIPCountLimitReached" Message="Cannot create more than 50 public IP addresses for this subscription in this region." Details=[]
on ../tmp/openshift-install-172932975/bootstrap/main.tf line 65, in resource "azurerm_public_ip" "bootstrap_public_ip_v4":
65: resource "azurerm_public_ip" "bootstrap_public_ip_v4" {
`,

err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Public IPs in the subscription for the region\. Requesting an increase in quota should fix the error\.`,
}, {
input: `Error: Code="OSProvisioningTimedOut" Message="OS Provisioning for VM 'xxxx-master-2' did not finish in the allotted time. The VM may still finish provisioning successfully. Please check provisioning state later. Also, make sure the image has been properly prepared (generalized).\\r\\n * Instructions for Windows: https://azure.microsoft.com/documentation/articles/virtual-machines-windows-upload-image/ \\r\\n * Instructions for Linux: https://azure.microsoft.com/documentation/articles/virtual-machines-linux-capture-image/ "
on ../tmp/openshift-install-172932975/master/master.tf line 81, in resource "azurerm_virtual_machine" "master":
81: resource "azurerm_virtual_machine" "master" {
`,

err: `error\(AzureVirtualMachineFailure\) from Infrastructure Provider: Some virtual machines failed to provision in the allotted time`,
}, {
input: `
Error: Error waiting for instance to create: Internal error. Please try again or contact Google Support. (Code: '8712799794455203922')
on ../tmp/openshift-install-910996711/master/main.tf line 31, in resource "google_compute_instance" "master":
31: resource "google_compute_instance" "master" {
`,

err: `error\(GCPComputeBackendTimeout\) from Infrastructure Provider: GCP is experiencing backend service interruptions, and the compute instance failed to create in a reasonable time\.`,
}, {
input: `Error: Error reading Service Account "projects/project-id/serviceAccounts/[email protected]": googleapi: Error 503: The service is currently unavailable., backendError`,

err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interruptions\. Please try again or contact Google Support`,
}, {
input: `
Error: Error adding instances to InstanceGroup: googleapi: Error 503: Internal error. Please try again or contact Google Support. (Code: 'xxxx'), backendError
on ../tmp/openshift-install-267295217/bootstrap/main.tf line 87, in resource "google_compute_instance_group" "bootstrap":
87: resource "google_compute_instance_group" "bootstrap" {
`,

err: `error\(GCPBackendInternalError\) from Infrastructure Provider: GCP is experiencing backend service interruptions\. Please try again or contact Google Support`,
}, {
input: `
Error: Error applying IAM policy to project "project-id": Too many conflicts. Latest error: Error setting IAM policy for project "project-id": googleapi: Error 409: There were concurrent policy changes. Please retry the whole read-modify-write with exponential backoff., aborted
on ../tmp/openshift-install-392130810/master/main.tf line 26, in resource "google_project_iam_member" "master-service-account-user":
26: resource "google_project_iam_member" "master-service-account-user" {
`,

err: `error\(GCPTooManyIAMUpdatesInFlight\) from Infrastructure Provider: There are a lot of IAM updates to the project in flight\. Failed after reaching the limit of read-modify-write conflict backoffs\.`,
}, {
input: `
Error: Error retrieving resource group: resources.GroupsClient#Get: Failure responding to request: StatusCode=404 -- Original Error: autorest/azure: Service returned an error. Status=404 Code="ResourceGroupNotFound" Message="Resource group 'xxxxx-rg' could not be found."
on ../tmp/openshift-install-424775273/main.tf line 124, in resource "azurerm_resource_group" "main":
124: resource "azurerm_resource_group" "main" {
`,

err: `error\(AzureEventualConsistencyFailure\) from Infrastructure Provider: Failed to find a resource that was recently created, usually caused by Azure's eventual consistency delays\.`,
}, {
input: `
Error: compute.VirtualMachinesClient#CreateOrUpdate: Failure sending request: StatusCode=0 -- Original Error: autorest/azure: Service returned an error. Status=<nil> Code="OperationNotAllowed" Message="Operation could not be completed as it results in exceeding approved Total Regional Cores quota. Additional details - Deployment Model: Resource Manager, Location: centralus, Current Limit: 200, Current Usage: 198, Additional Required: 8, (Minimum) New Limit Required: 206. Submit a request for Quota increase at https://aka.ms/ProdportalCRP/?#create/Microsoft.Support/Parameters/%7B%22subId%22:%225f675811-04fa-483f-9709-ffd8a9da03f0%22,%22pesId%22:%2206bfd9d3-516b-d5c6-5802-169c800dec89%22,%22supportTopicId%22:%22e12e3d1d-7fa0-af33-c6d0-3c50df9658a3%22%7D by specifying parameters listed in the ‘Details’ section for deployment to succeed. Please read more about quota limits at https://docs.microsoft.com/en-us/azure/azure-supportability/regional-quota-requests."
on ../../../../tmp/openshift-install-941329162/master/master.tf line 81, in resource "azurerm_virtual_machine" "master":
81: resource "azurerm_virtual_machine" "master" {
`,

err: `error\(AzureQuotaLimitExceeded\) from Infrastructure Provider: Service limits exceeded for Virtual Machine cores in the subscription for the region\. Requesting an increase in quota should fix the error\.`,
}}

for _, test := range cases {
t.Run("", func(t *testing.T) {
err := Diagnose(test.input)
if test.err == "" {
assert.NoError(t, err)
} else {
assert.Regexp(t, test.err, err)
}
})
}
}
25 changes: 11 additions & 14 deletions pkg/terraform/terraform.go
@@ -1,7 +1,9 @@
package terraform

import (
"bytes"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
@@ -43,15 +45,14 @@ func Apply(dir string, platform string, extraArgs ...string) (path string, err e
args = append(args, dir)
sf := filepath.Join(dir, StateFileName)

tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
lpError := &lineprinter.LinePrinter{Print: tError.Print}
lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
defer lpDebug.Close()
defer lpError.Close()

if exitCode := texec.Apply(dir, args, lpDebug, lpError); exitCode != 0 {
return sf, errors.New("failed to apply using Terraform")
errBuf := &bytes.Buffer{}
if exitCode := texec.Apply(dir, args, lpDebug, io.MultiWriter(errBuf, lpError)); exitCode != 0 {
return sf, errors.Wrap(Diagnose(errBuf.String()), "failed to apply Terraform")
}
return sf, nil
}
@@ -74,10 +75,8 @@ func Destroy(dir string, platform string, extraArgs ...string) (err error) {
args := append(defaultArgs, extraArgs...)
args = append(args, dir)

tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
lpError := &lineprinter.LinePrinter{Print: tError.Print}
lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
defer lpDebug.Close()
defer lpError.Close()

@@ -115,10 +114,8 @@ func unpackAndInit(dir string, platform string) (err error) {
return errors.Wrap(err, "failed to setup embedded Terraform plugins")
}

tDebug := &lineprinter.Trimmer{WrappedPrint: logrus.Debug}
tError := &lineprinter.Trimmer{WrappedPrint: logrus.Error}
lpDebug := &lineprinter.LinePrinter{Print: tDebug.Print}
lpError := &lineprinter.LinePrinter{Print: tError.Print}
lpDebug := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Debug}).Print}
lpError := &lineprinter.LinePrinter{Print: (&lineprinter.Trimmer{WrappedPrint: logrus.Error}).Print}
defer lpDebug.Close()
defer lpError.Close()
