Skip to content

Commit

Permalink
add support for limiting retries for network level failures such as t…
Browse files Browse the repository at this point in the history
…cp dial timeouts
  • Loading branch information
jackofallops committed Oct 1, 2024
1 parent 1905dc4 commit 1670e5a
Showing 1 changed file with 101 additions and 9 deletions.
110 changes: 101 additions & 9 deletions sdk/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ import (
"crypto/tls"
"encoding/json"
"encoding/xml"
"errors"
"fmt"
"io"
"log"
"math"
"net"
"net/http"
"net/url"
"regexp"
"runtime"
"strconv"
"strings"
Expand Down Expand Up @@ -443,16 +445,8 @@ func (c *Client) Execute(ctx context.Context, req *Request) (*Response, error) {

// Instantiate a RetryableHttp client and configure its CheckRetry func
r := c.retryableClient(ctx, func(ctx context.Context, r *http.Response, err error) (bool, error) {
// First check for badly malformed responses
if r == nil {
if req.IsIdempotent() {
return true, nil
}
return false, fmt.Errorf("HTTP response was nil; connection may have been reset")
}

// Eventual consistency checks
if !c.DisableRetries {
if r != nil && !c.DisableRetries {
if r.StatusCode == http.StatusFailedDependency {
return true, nil
}
Expand All @@ -474,6 +468,14 @@ func (c *Client) Execute(ctx context.Context, req *Request) (*Response, error) {
}
}

// Check for failed connections etc and decide if retries are appropriate
if r == nil {
if req.IsIdempotent() {
return extendedRetryPolicy(r, err)
}
return false, fmt.Errorf("HTTP response was nil; connection may have been reset")
}

// Fall back to default retry policy to handle rate limiting, server errors etc.
return retryablehttp.DefaultRetryPolicy(ctx, r, err)
})
Expand Down Expand Up @@ -749,3 +751,93 @@ func containsStatusCode(expected []int, actual int) bool {

return false
}

// extendedRetryPolicy extends the defaultRetryPolicy implementation in go-retryablehhtp with
// additional error conditions that should not be retried indefinitely
// `tcpDialTimeoutRe` - covers the
func extendedRetryPolicy(resp *http.Response, err error) (bool, error) {
// A regular expression to match the error returned by net/http when the
// configured number of redirects is exhausted. This error isn't typed
// specifically so we resort to matching on the error string.
redirectsErrorRe := regexp.MustCompile(`stopped after \d+ redirects\z`)

// A regular expression to match the error returned by net/http when the
// scheme specified in the URL is invalid. This error isn't typed
// specifically so we resort to matching on the error string.
schemeErrorRe := regexp.MustCompile(`unsupported protocol scheme`)

// A regular expression to match the error returned by net/http when a
// request header or value is invalid. This error isn't typed
// specifically so we resort to matching on the error string.
invalidHeaderErrorRe := regexp.MustCompile(`invalid header`)

// A regular expression to match the error returned by net/http when the
// TLS certificate is not trusted. This error isn't typed
// specifically so we resort to matching on the error string.
notTrustedErrorRe := regexp.MustCompile(`certificate is not trusted`)

// A regular expression to catch dial timeouts in the underlying TCP session
// connection
tcpDialTimeoutRe := regexp.MustCompile(`dial tcp .*: i/o timeout`)

// A regular expression to match complete packet loss
completePacketLossRe := regexp.MustCompile(`EOF$`)

if err != nil {
var v *url.Error
if errors.As(err, &v) {
// Don't retry if the error was due to too many redirects.
if redirectsErrorRe.MatchString(v.Error()) {
return false, v
}

// Don't retry if the error was due to an invalid protocol scheme.
if schemeErrorRe.MatchString(v.Error()) {
return false, v
}

// Don't retry if the error was due to an invalid header.
if invalidHeaderErrorRe.MatchString(v.Error()) {
return false, v
}

// Don't retry if the error was due to TLS cert verification failure.
if notTrustedErrorRe.MatchString(v.Error()) {
return false, v
}

if tcpDialTimeoutRe.MatchString(v.Error()) {
return false, v
}

if completePacketLossRe.MatchString(v.Error()) {
return false, v
}

var certificateVerificationError *tls.CertificateVerificationError
if ok := errors.As(v.Err, &certificateVerificationError); !ok {
return false, v
}
}

// The error is likely recoverable so retry.
return true, nil
}

// 429 Too Many Requests is recoverable. Sometimes the server puts
// a Retry-After response header to indicate when the server is
// available to start processing request from client.
if resp.StatusCode == http.StatusTooManyRequests {
return true, nil
}

// Check the response code. We retry on 500-range responses to allow
// the server time to recover, as 500's are typically not permanent
// errors and may relate to outages on the server side. This will catch
// invalid response codes as well, like 0 and 999.
if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) {
return true, fmt.Errorf("unexpected HTTP status %s", resp.Status)
}

return false, nil
}

0 comments on commit 1670e5a

Please sign in to comment.