From f85bba8f45479c0f94a88e913cfce340604b70a2 Mon Sep 17 00:00:00 2001 From: apostasie Date: Sat, 24 Aug 2024 13:15:35 -0700 Subject: [PATCH 1/3] Hardening lifecycle state store Signed-off-by: apostasie --- pkg/ocihook/ocihook.go | 7 +++---- pkg/ocihook/state/state.go | 11 ++++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index f8afc6c7c18..e37f4083a51 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -514,10 +514,9 @@ func onCreateRuntime(opts *handlerOpts) error { // Set StartedAt lf := state.NewLifecycleState(opts.state.Annotations[labels.StateDir]) return lf.WithLock(func() error { - err := lf.Load() - if err != nil { - return err - } + // Errors are voluntarily ignored here, as they should not be fatal. + // The lifecycle struct is also already warning about the issue. + _ = lf.Load() lf.StartedAt = time.Now() return lf.Save() }) diff --git a/pkg/ocihook/state/state.go b/pkg/ocihook/state/state.go index 0c0295cdd83..f3f240218fd 100644 --- a/pkg/ocihook/state/state.go +++ b/pkg/ocihook/state/state.go @@ -24,6 +24,8 @@ import ( "path/filepath" "time" + "github.com/containerd/log" + "github.com/containerd/nerdctl/v2/pkg/lockutil" ) @@ -71,6 +73,8 @@ func (lf *LifecycleState) Load() error { } else { err = json.Unmarshal(data, lf) if err != nil { + // Logging an error, as Load errors are generally ignored downstream + log.L.Error("unable to unmarshall lifecycle data") return fmt.Errorf("unable to unmarshall lifecycle data: %w", err) } } @@ -78,11 +82,16 @@ func (lf *LifecycleState) Load() error { } func (lf *LifecycleState) Save() error { + // Write atomically (write, then move) to avoid incomplete writes from happening data, err := json.Marshal(lf) if err != nil { return fmt.Errorf("unable to marshall lifecycle data: %w", err) } - err = os.WriteFile(filepath.Join(lf.stateDir, lifecycleFile), data, 0600) + err = os.WriteFile(filepath.Join(lf.stateDir, "."+lifecycleFile), data, 0600) + if err != nil { + return fmt.Errorf("unable to write lifecycle file: %w", err) + } + err = os.Rename(filepath.Join(lf.stateDir, "."+lifecycleFile), filepath.Join(lf.stateDir, lifecycleFile)) if err != nil { return fmt.Errorf("unable to write lifecycle file: %w", err) } From 6296bb35e28db9e35e37346292939ea898d787a0 Mon Sep 17 00:00:00 2001 From: apostasie Date: Sat, 24 Aug 2024 16:49:03 -0700 Subject: [PATCH 2/3] Namestore hardening, workaround #3351 Signed-off-by: apostasie --- pkg/namestore/namestore.go | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pkg/namestore/namestore.go b/pkg/namestore/namestore.go index 5243a8f958a..a6b57968c82 100644 --- a/pkg/namestore/namestore.go +++ b/pkg/namestore/namestore.go @@ -22,6 +22,8 @@ import ( "path/filepath" "strings" + "github.com/containerd/log" + "github.com/containerd/nerdctl/v2/pkg/identifiers" "github.com/containerd/nerdctl/v2/pkg/lockutil" ) @@ -56,12 +58,21 @@ func (x *nameStore) Acquire(name, id string) error { } fn := func() error { fileName := filepath.Join(x.dir, name) - // If containerd was bounced, previously running containers that would get restarted will go again through - // onCreateRuntime (unlike in a "normal" stop/start flow). - // As such, we need to allow reacquiring by the same id - // See: https://github.com/containerd/nerdctl/issues/3354 - if b, err := os.ReadFile(fileName); err == nil && string(b) != id { - return fmt.Errorf("name %q is already used by ID %q", name, string(b)) + if b, err := os.ReadFile(fileName); err == nil { + if strings.TrimSpace(string(b)) == "" { + // currently acquired for an empty id - this obviously should never happen + // this is recoverable, and we are not hard erroring, but still indicative that something was wrong + // https://github.com/containerd/nerdctl/issues/3351 + log.L.Errorf("current name %q is reserved for a an empty id - please report this is as a bug", name) + } else if string(b) != id { + // if acquired by a different container, we error out here + return fmt.Errorf("name %q is already used by ID %q", name, string(b)) + } + // Otherwise, this is just re-acquiring after a restart + // For example, if containerd was bounced, previously running containers that would get restarted will go + // again through onCreateRuntime (unlike in a "normal" stop/start flow). + // As such, we are allowing reacquiring by the same id + // See: https://github.com/containerd/nerdctl/issues/3354 } return os.WriteFile(fileName, []byte(id), 0600) } From b954c7a3e7739a7492d631f23c34292e7748c017 Mon Sep 17 00:00:00 2001 From: apostasie Date: Sat, 24 Aug 2024 20:59:24 -0700 Subject: [PATCH 3/3] Prevent presumably bogus reentrancy onPostStop when onCreateRuntime errored Signed-off-by: apostasie --- pkg/ocihook/ocihook.go | 25 +++++++++++++++++++------ pkg/ocihook/state/state.go | 5 +++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index e37f4083a51..922e6ff8c96 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -505,24 +505,37 @@ func onCreateRuntime(opts *handlerOpts) error { log.L.WithError(err).Error("failed re-acquiring name - see https://github.com/containerd/nerdctl/issues/2992") } + var netError error if opts.cni != nil { - if err = applyNetworkSettings(opts); err != nil { - return err - } + netError = applyNetworkSettings(opts) } - // Set StartedAt lf := state.NewLifecycleState(opts.state.Annotations[labels.StateDir]) - return lf.WithLock(func() error { + + return errors.Join(netError, lf.WithLock(func() error { // Errors are voluntarily ignored here, as they should not be fatal. // The lifecycle struct is also already warning about the issue. _ = lf.Load() lf.StartedAt = time.Now() + lf.CreateError = netError != nil return lf.Save() - }) + })) } func onPostStop(opts *handlerOpts) error { + // See https://github.com/containerd/nerdctl/issues/3357 + // Check if we actually errored during runtimeCreate + // If that is the case, CreateError is set, and we are in postStop while the container will NOT be deleted (see ticket). + // In that case, do NOT treat this as a deletion, as the container is still there. + // Reset CreateError, and return. + lf := state.NewLifecycleState(opts.state.Annotations[labels.StateDir]) + if lf.WithLock(lf.Load) == nil { + if lf.CreateError { + lf.CreateError = false + return lf.WithLock(lf.Save) + } + } + ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] if opts.cni != nil { diff --git a/pkg/ocihook/state/state.go b/pkg/ocihook/state/state.go index f3f240218fd..312bf973556 100644 --- a/pkg/ocihook/state/state.go +++ b/pkg/ocihook/state/state.go @@ -51,8 +51,9 @@ func NewLifecycleState(stateDir string) *LifecycleState { } type LifecycleState struct { - stateDir string - StartedAt time.Time `json:"started_at"` + stateDir string + StartedAt time.Time `json:"started_at"` + CreateError bool `json:"create_error"` } func (lf *LifecycleState) WithLock(fun func() error) error {