From 808406e1846f6038c1085dc86d931f584a7c3f9d Mon Sep 17 00:00:00 2001 From: Jordan Olshevski Date: Tue, 30 Jul 2024 11:36:28 -0500 Subject: [PATCH] Allow symphonies to progress when their namespace is missing (#182) This fixes a weird edge case in which namespaces are ripped out from under symphony resources. In this case all updates will fail - the only option is to briefly recreate the namespace, delete it, and remove any finalizers while it is deleting. Co-authored-by: Jordan Olshevski --- cmd/eno-reconciler/main.go | 6 + docs/reference.md | 10 ++ internal/controllers/liveness/namespace.go | 130 ++++++++++++++++++ .../controllers/liveness/namespace_test.go | 86 ++++++++++++ .../reconciliation/helpers_test.go | 2 + .../reconciliation/symphony_test.go | 89 +++++++++++- 6 files changed, 319 insertions(+), 4 deletions(-) create mode 100644 internal/controllers/liveness/namespace.go create mode 100644 internal/controllers/liveness/namespace_test.go diff --git a/cmd/eno-reconciler/main.go b/cmd/eno-reconciler/main.go index 1f2796b7..b1a3ec19 100644 --- a/cmd/eno-reconciler/main.go +++ b/cmd/eno-reconciler/main.go @@ -14,6 +14,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "github.com/Azure/eno/internal/controllers/aggregation" + "github.com/Azure/eno/internal/controllers/liveness" "github.com/Azure/eno/internal/controllers/reconciliation" "github.com/Azure/eno/internal/controllers/synthesis" "github.com/Azure/eno/internal/flowcontrol" @@ -95,6 +96,11 @@ func run() error { return fmt.Errorf("constructing resource slice cleanup controller: %w", err) } + err = liveness.NewNamespaceController(mgr) + if err != nil { + return fmt.Errorf("constructing namespace liveness controller: %w", err) + } + remoteConfig := mgr.GetConfig() if remoteKubeconfigFile != "" { if remoteConfig, err = k8s.GetRESTConfig(remoteKubeconfigFile); err != nil { diff --git a/docs/reference.md b/docs/reference.md index c42c93e9..2d5b7702 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -303,3 +303,13 @@ spec: name: a-different-input namespace: default ``` + +### Deletion Behavior + +Symphonies are high-level resources designed to always converge, even in the face of rare split-brain states. + +Force deleting namespaces leaves resources in a strange state in which they exist but cannot be updated. +Symphonies recover from this state by carefully recreating the namespace and forcibly removing internal finalizers. +Because of this it's possible that managed resources will not be cleaned up if they exist outside of the orphaned namespace. +Worst case, managed resources might be recreated by the Eno reconciler if it outpaces kube-controller-manager's namespace controller. +Beware of this if you plan to use symphony resources to manage resources outside of the symphony's own namespace. diff --git a/internal/controllers/liveness/namespace.go b/internal/controllers/liveness/namespace.go new file mode 100644 index 00000000..4b83e516 --- /dev/null +++ b/internal/controllers/liveness/namespace.go @@ -0,0 +1,130 @@ +package liveness + +import ( + "context" + "fmt" + "time" + + apiv1 "github.com/Azure/eno/api/v1" + "github.com/Azure/eno/internal/manager" + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// namespaceController is responsible for progressing symphony deletion when its namespace is forcibly deleted. +// This can happen if clients get tricky with the /finalize API. +// Without this controller Eno resources will never be deleted since updates to remove the finalizers will fail. +type namespaceController struct { + client client.Client +} + +func NewNamespaceController(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Namespace{}). + Watches(&apiv1.Symphony{}, handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request { + return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: o.GetNamespace()}}} + })). + WithLogConstructor(manager.NewLogConstructor(mgr, "namespaceController")). + Complete(&namespaceController{ + client: mgr.GetClient(), + }) +} + +func (c *namespaceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + ns := &corev1.Namespace{} + err := c.client.Get(ctx, req.NamespacedName, ns) + if client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, fmt.Errorf("getting namespace: %w", err) + } + + const annoKey = "eno.azure.io/recreation-reason" + const annoValue = "OrphanedResources" + + // Delete the recreated namespace immediately. + // Its finalizers will keep it around until we've had time to remove our finalizers. + logger := logr.FromContextOrDiscard(ctx).WithValues("symphonyNamespace", ns.Name) + if ns.Annotations != nil && ns.Annotations[annoKey] == annoValue { + if ns.DeletionTimestamp != nil { + return ctrl.Result{}, c.cleanup(ctx, req.Name) + } + err := c.client.Delete(ctx, ns) + if err != nil { + return ctrl.Result{}, fmt.Errorf("deleting namespace: %w", err) + } + logger.V(0).Info("deleting recreated namespace") + return ctrl.Result{}, nil + } + if err == nil { // important that this is the GET error + return ctrl.Result{}, nil // namespace exists, nothing to do + } + + // Nothing to do if the namespace doesn't have any symphonies + list := &apiv1.SymphonyList{} + err = c.client.List(ctx, list, client.InNamespace(req.Name)) + if err != nil { + return ctrl.Result{}, fmt.Errorf("listing symphonies: %w", err) + } + if len(list.Items) == 0 { + return ctrl.Result{}, nil // no orphaned resources, nothing to do + } + if time.Since(mostRecentCreation(list)) < time.Second { + return ctrl.Result{RequeueAfter: time.Second}, nil // namespace probably just hasn't hit the cache yet + } + + // Recreate the namespace briefly so we can remove the finalizers. + // Any updates (including finalizer updates) will fail if the namespace doesn't exist. + ns.Name = req.Name + ns.Annotations = map[string]string{annoKey: annoValue} + err = c.client.Create(ctx, ns) + if err != nil { + return ctrl.Result{}, fmt.Errorf("creating namespace: %w", err) + } + logger.V(0).Info("recreating missing namespace to free orphaned symphony") + return ctrl.Result{}, nil +} + +const removeFinalizersPatch = `[{ "op": "remove", "path": "/metadata/finalizers" }]` + +func (c *namespaceController) cleanup(ctx context.Context, ns string) error { + logger := logr.FromContextOrDiscard(ctx).WithValues("symphonyNamespace", ns) + logger.V(0).Info("deleting any remaining symphonies in orphaned namespace") + err := c.client.DeleteAllOf(ctx, &apiv1.Symphony{}, client.InNamespace(ns)) + if err != nil { + return fmt.Errorf("deleting symphonies: %w", err) + } + + list := &apiv1.ResourceSliceList{} + err = c.client.List(ctx, list, client.InNamespace(ns)) + if err != nil { + return fmt.Errorf("listing resource slices: %w", err) + } + + for _, item := range list.Items { + if len(item.Finalizers) == 0 { + continue + } + err = c.client.Patch(ctx, &item, client.RawPatch(types.JSONPatchType, []byte(removeFinalizersPatch))) + if err != nil { + return fmt.Errorf("removing finalizers from resource slice %q: %w", item.Name, err) + } + logger := logger.WithValues("resourceSliceName", item.Name) + logger.V(0).Info("forcibly removed finalizers") + } + + return nil +} + +func mostRecentCreation(list *apiv1.SymphonyList) time.Time { + var max time.Time + for _, item := range list.Items { + if item.CreationTimestamp.After(max) { + max = item.CreationTimestamp.Time + } + } + return max +} diff --git a/internal/controllers/liveness/namespace_test.go b/internal/controllers/liveness/namespace_test.go new file mode 100644 index 00000000..b15e5cb2 --- /dev/null +++ b/internal/controllers/liveness/namespace_test.go @@ -0,0 +1,86 @@ +package liveness + +import ( + "testing" + + apiv1 "github.com/Azure/eno/api/v1" + "github.com/Azure/eno/internal/testutil" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" + "k8s.io/kubectl/pkg/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func TestMissingNamespace(t *testing.T) { + ctx := testutil.NewContext(t) + mgr := testutil.NewManager(t) + cli := mgr.GetClient() + + require.NoError(t, NewNamespaceController(mgr.Manager)) + mgr.Start(t) + + ns := &corev1.Namespace{} + ns.Name = "test" + require.NoError(t, cli.Create(ctx, ns)) + + sym := &apiv1.Symphony{} + sym.Name = "test-symphony" + sym.Namespace = ns.Name + sym.Finalizers = []string{"eno.azure.io/cleanup"} // this would normally be set by another controller + require.NoError(t, cli.Create(ctx, sym)) + + // Force delete the namespace + require.NoError(t, cli.Delete(ctx, ns)) + + conf := rest.CopyConfig(mgr.RestConfig) + conf.GroupVersion = &schema.GroupVersion{Version: "v1"} + conf.NegotiatedSerializer = serializer.WithoutConversionCodecFactory{CodecFactory: scheme.Codecs} + rc, err := rest.RESTClientFor(conf) + require.NoError(t, err) + + err = retry.RetryOnConflict(testutil.Backoff, func() error { + cli.Get(ctx, client.ObjectKeyFromObject(ns), ns) + ns.Spec.Finalizers = nil + + _, err = rc.Put(). + AbsPath("/api/v1/namespaces", ns.Name, "/finalize"). + Body(ns). + Do(ctx).Raw() + return err + }) + require.NoError(t, err) + + // The namespace should be completely gone + testutil.Eventually(t, func() bool { + return errors.IsNotFound(cli.Get(ctx, client.ObjectKeyFromObject(ns), ns)) + }) + + // But we should still be able to eventually remove the symphony's finalizer + require.NoError(t, cli.Delete(ctx, sym)) + testutil.Eventually(t, func() bool { + if sym.Finalizers != nil { + sym.Finalizers = nil + err = cli.Update(ctx, sym) + if err != nil { + t.Logf("error while removing finalizer from symphony: %s", err) + } + } + + missing := errors.IsNotFound(cli.Get(ctx, client.ObjectKeyFromObject(sym), sym)) + if !missing { + t.Logf("symphony still exists") + } + return missing + }) + + // Namespace should end up being deleted + testutil.Eventually(t, func() bool { + cli.Get(ctx, client.ObjectKeyFromObject(ns), ns) + return ns.DeletionTimestamp != nil + }) +} diff --git a/internal/controllers/reconciliation/helpers_test.go b/internal/controllers/reconciliation/helpers_test.go index 141078d7..487e795b 100644 --- a/internal/controllers/reconciliation/helpers_test.go +++ b/internal/controllers/reconciliation/helpers_test.go @@ -8,6 +8,7 @@ import ( apiv1 "github.com/Azure/eno/api/v1" "github.com/Azure/eno/internal/controllers/aggregation" "github.com/Azure/eno/internal/controllers/flowcontrol" + "github.com/Azure/eno/internal/controllers/liveness" "github.com/Azure/eno/internal/controllers/replication" "github.com/Azure/eno/internal/controllers/rollout" "github.com/Azure/eno/internal/controllers/synthesis" @@ -28,6 +29,7 @@ func registerControllers(t *testing.T, mgr *testutil.Manager) { require.NoError(t, rollout.NewController(mgr.Manager, time.Millisecond)) require.NoError(t, rollout.NewSynthesizerController(mgr.Manager)) require.NoError(t, flowcontrol.NewSynthesisConcurrencyLimiter(mgr.Manager, 10, 0)) + require.NoError(t, liveness.NewNamespaceController(mgr.Manager)) } func writeGenericComposition(t *testing.T, client client.Client) (*apiv1.Synthesizer, *apiv1.Composition) { diff --git a/internal/controllers/reconciliation/symphony_test.go b/internal/controllers/reconciliation/symphony_test.go index a752fced..210379b4 100644 --- a/internal/controllers/reconciliation/symphony_test.go +++ b/internal/controllers/reconciliation/symphony_test.go @@ -10,8 +10,11 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/client-go/rest" "k8s.io/client-go/util/retry" + "k8s.io/kubectl/pkg/scheme" ctrlcache "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" @@ -22,9 +25,6 @@ import ( // TestSymphonyIntegration proves that a basic symphony creation/deletion workflow works. func TestSymphonyIntegration(t *testing.T) { - scheme := runtime.NewScheme() - corev1.SchemeBuilder.AddToScheme(scheme) - ctx := testutil.NewContext(t) mgr := testutil.NewManager(t, testutil.WithCompositionNamespace(ctrlcache.AllNamespaces)) upstream := mgr.GetClient() @@ -155,3 +155,84 @@ func TestSymphonyIntegration(t *testing.T) { require.NoError(t, err) assert.Len(t, comps.Items, 0) } + +// TestOrphanedNamespaceSymphony covers a special behavior of symphonies: resilience against missing namespaces. +// Ripping the namespace out from under a symphony and its associated compositions/resource slices shouldn't prevent them from being deleted. +func TestOrphanedNamespaceSymphony(t *testing.T) { + ctx := testutil.NewContext(t) + mgr := testutil.NewManager(t, testutil.WithCompositionNamespace(ctrlcache.AllNamespaces)) + upstream := mgr.GetClient() + + // Create test namespace. + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "test"}} + require.NoError(t, upstream.Create(ctx, ns)) + + registerControllers(t, mgr) + testutil.WithFakeExecutor(t, mgr, func(ctx context.Context, s *apiv1.Synthesizer, input *krmv1.ResourceList) (*krmv1.ResourceList, error) { + output := &krmv1.ResourceList{} + output.Items = []*unstructured.Unstructured{{ + Object: map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]string{ + "name": "test-obj", + "namespace": "default", + }, + }, + }} + return output, nil + }) + + // Test subject + setupTestSubject(t, mgr) + mgr.Start(t) + + syn := &apiv1.Synthesizer{} + syn.Name = "test-syn" + syn.Spec.Image = "anything" + require.NoError(t, upstream.Create(ctx, syn)) + + symph := &apiv1.Symphony{} + symph.Name = "test-comp" + symph.Namespace = ns.Name + symph.Spec.Variations = []apiv1.Variation{{Synthesizer: apiv1.SynthesizerRef{Name: syn.Name}}} + require.NoError(t, upstream.Create(ctx, symph)) + + testutil.Eventually(t, func() bool { + upstream.Get(ctx, client.ObjectKeyFromObject(symph), symph) + return symph.Status.Reconciled != nil && symph.Status.ObservedGeneration == symph.Generation + }) + + // Establish that resource slices did at one point exist + slices := &apiv1.ResourceSliceList{} + require.NoError(t, upstream.List(ctx, slices)) + require.True(t, len(slices.Items) > 0) + + // Force delete the namespace + require.NoError(t, upstream.Delete(ctx, ns)) + + conf := rest.CopyConfig(mgr.RestConfig) + conf.GroupVersion = &schema.GroupVersion{Version: "v1"} + conf.NegotiatedSerializer = serializer.WithoutConversionCodecFactory{CodecFactory: scheme.Codecs} + rc, err := rest.RESTClientFor(conf) + require.NoError(t, err) + + err = retry.RetryOnConflict(testutil.Backoff, func() error { + upstream.Get(ctx, client.ObjectKeyFromObject(ns), ns) + ns.Spec.Finalizers = nil + + _, err = rc.Put(). + AbsPath("/api/v1/namespaces", ns.Name, "/finalize"). + Body(ns). + Do(ctx).Raw() + return err + }) + require.NoError(t, err) + + // The symphony and its associated resources should eventually be cleaned up + testutil.Eventually(t, func() bool { + return errors.IsNotFound(upstream.Get(ctx, client.ObjectKeyFromObject(symph), symph)) + }) + require.NoError(t, upstream.List(ctx, slices)) + require.Empty(t, slices.Items) +}