Skip to content

Commit

Permalink
host-ctr: Implement exponential backoff for image pulls
Browse files Browse the repository at this point in the history
Implements exponential backoff for image pulls.

Also move signal handling channel set up to after image pull happens
since its mostly relevant to only when we're starting a container
task.
  • Loading branch information
etungsten committed Oct 23, 2019
1 parent f6279f7 commit c52ba69
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 25 deletions.
2 changes: 1 addition & 1 deletion packages/workspaces/[email protected]
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Type=simple
EnvironmentFile=/etc/host-containers/%i.env
ExecStart=/usr/bin/host-ctr -ctr-id='%i' -source='${CTR_SOURCE}' -superpowered='${CTR_SUPERPOWERED}'
Restart=always
RestartSec=10
RestartSec=45
TimeoutStopSec=60
KillMode=mixed

Expand Down
94 changes: 70 additions & 24 deletions workspaces/host-ctr/cmd/host-ctr/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"flag"
"math/rand"
"os"
"os/signal"
"regexp"
Expand All @@ -18,11 +19,15 @@ import (
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/namespaces"
"github.com/containerd/containerd/oci"
cgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)

func init() {
rand.Seed(time.Now().UnixNano())
}

func main() {
os.Exit(_main())
}
Expand All @@ -49,15 +54,25 @@ func _main() int {
flag.Usage()
return 2
}

ctx, cancel := context.WithCancel(context.Background())
ctx = namespaces.WithNamespace(ctx, namespace)
defer cancel()

// Set up channel on which to send signal notifications.
// We must use a buffered channel or risk missing the signal
// if we're not ready to receive when the signal is sent.
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
go func(ctx context.Context, cancel context.CancelFunc) {
// Set up channel on which to send signal notifications.
// We must use a buffered channel or risk missing the signal
// if we're not ready to receive when the signal is sent.
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
for {
select {
case s := <-c:
log.G(ctx).Info("Received signal: ", s)
cancel()
}
}
}(ctx, cancel)

// Set up containerd client
// Use host containers' containerd socket
Expand Down Expand Up @@ -151,18 +166,18 @@ func _main() int {
log.G(ctx).WithError(err).WithField("img", img.Name).Error("Failed to create container")
return 1
}
defer container.Delete(ctx, containerd.WithSnapshotCleanup)
defer container.Delete(context.TODO(), containerd.WithSnapshotCleanup)

// Create the container task
task, err := container.NewTask(ctx, cio.NewCreator(cio.WithStdio))
if err != nil {
log.G(ctx).WithError(err).Error("Failed to create container task")
return 1
}
defer task.Delete(ctx)
defer task.Delete(context.TODO())

// Wait before calling start in case the container task finishes too quickly
exitStatusC, err := task.Wait(ctx)
exitStatusC, err := task.Wait(context.TODO())
if err != nil {
log.G(ctx).WithError(err).Error("Unexpected error during container task setup.")
return 1
Expand All @@ -177,12 +192,14 @@ func _main() int {

// Block until an OS signal (e.g. SIGTERM, SIGINT) is received or the container task finishes and exits on its own.
var status containerd.ExitStatus
ctrCtx, cancel := context.WithCancel(context.Background())
defer cancel()
select {
case s := <-c:
log.G(ctx).Info("Received signal: ", s)
case <-ctx.Done():

// SIGTERM the container task and get its exit status
if err := task.Kill(ctx, syscall.SIGTERM); err != nil {
log.G(ctx).WithError(err).Error("Failed to send SIGTERM to container")
if err := task.Kill(ctrCtx, syscall.SIGTERM); err != nil {
log.G(ctrCtx).WithError(err).Error("Failed to send SIGTERM to container")
return 1
}

Expand All @@ -200,12 +217,12 @@ func _main() int {
// Create a deadline of 45 seconds
killCtrTask := func() error {
const sigkillTimeout = 45 * time.Second
killCtx, cancel := context.WithTimeout(ctx, sigkillTimeout)
killCtx, cancel := context.WithTimeout(ctrCtx, sigkillTimeout)
defer cancel()
return task.Kill(killCtx, syscall.SIGKILL)
}
if killCtrTask() != nil {
log.G(ctx).WithError(err).Error("Failed to SIGKILL container process, timed out")
log.G(ctrCtx).WithError(err).Error("Failed to SIGKILL container process, timed out")
return 1
}

Expand All @@ -216,10 +233,10 @@ func _main() int {
}
code, _, err := status.Result()
if err != nil {
log.G(ctx).WithError(err).Error("Failed to get container task exit status")
log.G(ctrCtx).WithError(err).Error("Failed to get container task exit status")
return 1
}
log.G(ctx).WithField("code", code).Info("Container task exited")
log.G(ctrCtx).WithField("code", code).Info("Container task exited")
return int(code)
}

Expand Down Expand Up @@ -291,16 +308,45 @@ var ecrRegex = regexp.MustCompile(`(^[a-zA-Z0-9][a-zA-Z0-9-_]*)\.dkr\.ecr\.([a-z
// Pulls image from specified source
func pullImage(ctx context.Context, source string, client *containerd.Client) (containerd.Image, error) {
// Pull the image
img, err := client.Pull(ctx, source,
withDynamicResolver(ctx, source),
containerd.WithSchema1Conversion)
if err != nil {
return nil, errors.Wrap(err, "Failed to pull ctr image")
// Retry with exponential backoff when failures occur, maximum retry duration will not exceed 31 seconds
const maxRetryAttempts = 5
const intervalMultiplier = 2
const maxRetryInterval = 30 * time.Second
const jitterPeakAmplitude = 4000
const jitterLowerBound = 2000
var retryInterval = 1 * time.Second
var retryAttempts = 0
var img containerd.Image
for {
var err error
img, err = client.Pull(ctx, source,
withDynamicResolver(ctx, source),
containerd.WithSchema1Conversion)
if err == nil {
break
}
if retryAttempts >= maxRetryAttempts {
return nil, errors.Wrap(err, "retries exhausted")
}
// Add a random jitter between 2 - 6 seconds to the retry interval
retryIntervalWithJitter := retryInterval + time.Duration(rand.Int31n(jitterPeakAmplitude))*time.Millisecond + jitterLowerBound*time.Millisecond
log.G(ctx).WithError(err).Warnf("Failed to pull image. Waiting %s before retrying...", retryIntervalWithJitter)
timer := time.NewTimer(retryIntervalWithJitter)
select {
case <-timer.C:
retryInterval *= intervalMultiplier
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
retryAttempts++
case <-ctx.Done():
return nil, errors.Wrap(err, "context ended while retrying")
}
}
log.G(ctx).WithField("img", img.Name()).Info("Pulled successfully")
log.G(ctx).WithField("img", img.Name()).Info("Unpacking...")
if err := img.Unpack(ctx, containerd.DefaultSnapshotter); err != nil {
return nil, errors.Wrap(err, "Failed to unpack image")
return nil, errors.Wrap(err, "failed to unpack image")
}
return img, nil
}
Expand Down Expand Up @@ -346,7 +392,7 @@ func withDynamicResolver(ctx context.Context, ref string) containerd.RemoteOpt {
if err != nil {
return errors.Wrap(err, "Failed to create ECR resolver")
}
log.G(ctx).WithField("ref", ref).Info("Pulling from Amazon ECR")
log.G(ctx).WithField("ref", ref).Info("Pulling with Amazon ECR Resolver")
c.Resolver = resolver
return nil
}
Expand Down

0 comments on commit c52ba69

Please sign in to comment.