diff --git a/pkg/namestore/namestore.go b/pkg/namestore/namestore.go index 5243a8f958a..a6b57968c82 100644 --- a/pkg/namestore/namestore.go +++ b/pkg/namestore/namestore.go @@ -22,6 +22,8 @@ import ( "path/filepath" "strings" + "github.com/containerd/log" + "github.com/containerd/nerdctl/v2/pkg/identifiers" "github.com/containerd/nerdctl/v2/pkg/lockutil" ) @@ -56,12 +58,21 @@ func (x *nameStore) Acquire(name, id string) error { } fn := func() error { fileName := filepath.Join(x.dir, name) - // If containerd was bounced, previously running containers that would get restarted will go again through - // onCreateRuntime (unlike in a "normal" stop/start flow). - // As such, we need to allow reacquiring by the same id - // See: https://github.com/containerd/nerdctl/issues/3354 - if b, err := os.ReadFile(fileName); err == nil && string(b) != id { - return fmt.Errorf("name %q is already used by ID %q", name, string(b)) + if b, err := os.ReadFile(fileName); err == nil { + if strings.TrimSpace(string(b)) == "" { + // currently acquired for an empty id - this obviously should never happen + // this is recoverable, and we are not hard erroring, but still indicative that something was wrong + // https://github.com/containerd/nerdctl/issues/3351 + log.L.Errorf("current name %q is reserved for a an empty id - please report this is as a bug", name) + } else if string(b) != id { + // if acquired by a different container, we error out here + return fmt.Errorf("name %q is already used by ID %q", name, string(b)) + } + // Otherwise, this is just re-acquiring after a restart + // For example, if containerd was bounced, previously running containers that would get restarted will go + // again through onCreateRuntime (unlike in a "normal" stop/start flow). + // As such, we are allowing reacquiring by the same id + // See: https://github.com/containerd/nerdctl/issues/3354 } return os.WriteFile(fileName, []byte(id), 0600) } diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index f8afc6c7c18..922e6ff8c96 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -505,25 +505,37 @@ func onCreateRuntime(opts *handlerOpts) error { log.L.WithError(err).Error("failed re-acquiring name - see https://github.com/containerd/nerdctl/issues/2992") } + var netError error if opts.cni != nil { - if err = applyNetworkSettings(opts); err != nil { - return err - } + netError = applyNetworkSettings(opts) } - // Set StartedAt lf := state.NewLifecycleState(opts.state.Annotations[labels.StateDir]) - return lf.WithLock(func() error { - err := lf.Load() - if err != nil { - return err - } + + return errors.Join(netError, lf.WithLock(func() error { + // Errors are voluntarily ignored here, as they should not be fatal. + // The lifecycle struct is also already warning about the issue. + _ = lf.Load() lf.StartedAt = time.Now() + lf.CreateError = netError != nil return lf.Save() - }) + })) } func onPostStop(opts *handlerOpts) error { + // See https://github.com/containerd/nerdctl/issues/3357 + // Check if we actually errored during runtimeCreate + // If that is the case, CreateError is set, and we are in postStop while the container will NOT be deleted (see ticket). + // In that case, do NOT treat this as a deletion, as the container is still there. + // Reset CreateError, and return. + lf := state.NewLifecycleState(opts.state.Annotations[labels.StateDir]) + if lf.WithLock(lf.Load) == nil { + if lf.CreateError { + lf.CreateError = false + return lf.WithLock(lf.Save) + } + } + ctx := context.Background() ns := opts.state.Annotations[labels.Namespace] if opts.cni != nil { diff --git a/pkg/ocihook/state/state.go b/pkg/ocihook/state/state.go index 0c0295cdd83..312bf973556 100644 --- a/pkg/ocihook/state/state.go +++ b/pkg/ocihook/state/state.go @@ -24,6 +24,8 @@ import ( "path/filepath" "time" + "github.com/containerd/log" + "github.com/containerd/nerdctl/v2/pkg/lockutil" ) @@ -49,8 +51,9 @@ func NewLifecycleState(stateDir string) *LifecycleState { } type LifecycleState struct { - stateDir string - StartedAt time.Time `json:"started_at"` + stateDir string + StartedAt time.Time `json:"started_at"` + CreateError bool `json:"create_error"` } func (lf *LifecycleState) WithLock(fun func() error) error { @@ -71,6 +74,8 @@ func (lf *LifecycleState) Load() error { } else { err = json.Unmarshal(data, lf) if err != nil { + // Logging an error, as Load errors are generally ignored downstream + log.L.Error("unable to unmarshall lifecycle data") return fmt.Errorf("unable to unmarshall lifecycle data: %w", err) } } @@ -78,11 +83,16 @@ func (lf *LifecycleState) Load() error { } func (lf *LifecycleState) Save() error { + // Write atomically (write, then move) to avoid incomplete writes from happening data, err := json.Marshal(lf) if err != nil { return fmt.Errorf("unable to marshall lifecycle data: %w", err) } - err = os.WriteFile(filepath.Join(lf.stateDir, lifecycleFile), data, 0600) + err = os.WriteFile(filepath.Join(lf.stateDir, "."+lifecycleFile), data, 0600) + if err != nil { + return fmt.Errorf("unable to write lifecycle file: %w", err) + } + err = os.Rename(filepath.Join(lf.stateDir, "."+lifecycleFile), filepath.Join(lf.stateDir, lifecycleFile)) if err != nil { return fmt.Errorf("unable to write lifecycle file: %w", err) }