Skip to content

Commit

Permalink
Recreate missing resource slice (#255)
Browse files Browse the repository at this point in the history
Resolve #228 

Add a self-healing resource slice controller that triggers resynthesis to
re-create missing resource slices
  • Loading branch information
chihshenghuang authored Dec 4, 2024
1 parent 40cec4f commit 02e1b9d
Show file tree
Hide file tree
Showing 4 changed files with 719 additions and 8 deletions.
24 changes: 16 additions & 8 deletions cmd/eno-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/Azure/eno/internal/controllers/flowcontrol"
"github.com/Azure/eno/internal/controllers/replication"
"github.com/Azure/eno/internal/controllers/rollout"
"github.com/Azure/eno/internal/controllers/selfhealing"
"github.com/Azure/eno/internal/controllers/synthesis"
"github.com/Azure/eno/internal/controllers/watch"
"github.com/Azure/eno/internal/controllers/watchdog"
Expand All @@ -48,14 +49,15 @@ func main() {
func runController() error {
ctx := ctrl.SetupSignalHandler()
var (
debugLogging bool
watchdogThres time.Duration
rolloutCooldown time.Duration
dispatchCooldown time.Duration
taintToleration string
nodeAffinity string
concurrencyLimit int
synconf = &synthesis.Config{}
debugLogging bool
watchdogThres time.Duration
rolloutCooldown time.Duration
dispatchCooldown time.Duration
selfHealingGracePeriod time.Duration
taintToleration string
nodeAffinity string
concurrencyLimit int
synconf = &synthesis.Config{}

mgrOpts = &manager.Options{
Rest: ctrl.GetConfigOrDie(),
Expand All @@ -73,6 +75,7 @@ func runController() error {
flag.StringVar(&taintToleration, "taint-toleration", "", "Node NoSchedule taint to be tolerated by synthesizer pods e.g. taintKey=taintValue to match on value, just taintKey to match on presence of the taint")
flag.StringVar(&nodeAffinity, "node-affinity", "", "Synthesizer pods will be created with this required node affinity expression e.g. labelKey=labelValue to match on value, just labelKey to match on presence of the label")
flag.IntVar(&concurrencyLimit, "concurrency-limit", 10, "Upper bound on active syntheses. This effectively limits the number of running synthesizer pods spawned by Eno.")
flag.DurationVar(&selfHealingGracePeriod, "self-healing-grace-period", time.Minute*5, "How long before the self-healing controllers are allowed to start the resynthesis process.")
mgrOpts.Bind(flag.CommandLine)
flag.Parse()

Expand Down Expand Up @@ -113,6 +116,11 @@ func runController() error {
return fmt.Errorf("constructing rollout controller: %w", err)
}

err = selfhealing.NewSliceController(mgr, selfHealingGracePeriod)
if err != nil {
return fmt.Errorf("constructing self healing resource slice controller: %w", err)
}

err = synthesis.NewPodLifecycleController(mgr, synconf)
if err != nil {
return fmt.Errorf("constructing pod lifecycle controller: %w", err)
Expand Down
2 changes: 2 additions & 0 deletions internal/controllers/reconciliation/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/Azure/eno/internal/controllers/liveness"
"github.com/Azure/eno/internal/controllers/replication"
"github.com/Azure/eno/internal/controllers/rollout"
"github.com/Azure/eno/internal/controllers/selfhealing"
"github.com/Azure/eno/internal/controllers/synthesis"
"github.com/Azure/eno/internal/controllers/watch"
"github.com/Azure/eno/internal/controllers/watchdog"
Expand All @@ -32,6 +33,7 @@ func registerControllers(t *testing.T, mgr *testutil.Manager) {
require.NoError(t, flowcontrol.NewSynthesisConcurrencyLimiter(mgr.Manager, 10, 0))
require.NoError(t, liveness.NewNamespaceController(mgr.Manager, 3, time.Second))
require.NoError(t, watch.NewController(mgr.Manager))
require.NoError(t, selfhealing.NewSliceController(mgr.Manager, time.Minute*5))
}

func writeGenericComposition(t *testing.T, client client.Client) (*apiv1.Synthesizer, *apiv1.Composition) {
Expand Down
191 changes: 191 additions & 0 deletions internal/controllers/selfhealing/slice.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package selfhealing

import (
"context"
"fmt"
"time"

"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

apiv1 "github.com/Azure/eno/api/v1"
"github.com/Azure/eno/internal/manager"
"github.com/go-logr/logr"
)

// sliceController checks whether a resource slice has been deleted while it is still
// referenced by the composition's current synthesis status.
// If so, it sets the composition's PendingResynthesis status to trigger the re-synthesis process.
type sliceController struct {
// client is used to read compositions, synthesizers and resource slices, and to update composition status.
client client.Client
// noCacheReader is used to re-check a resource slice that client reported as not found (see isSliceMissing).
noCacheReader client.Reader
// selfHealingGracePeriod is the requeue interval used while a composition is not eligible for resynthesis.
selfHealingGracePeriod time.Duration
}

// NewSliceController builds the self-healing resource slice controller and
// registers it with mgr. The controller watches Compositions and
// ResourceSlices; selfHealingGracePeriod is stored on the reconciler and used
// as its requeue interval.
func NewSliceController(mgr ctrl.Manager, selfHealingGracePeriod time.Duration) error {
	const controllerName = "selfHealingSliceController"

	reconciler := &sliceController{
		client:                 mgr.GetClient(),
		noCacheReader:          mgr.GetAPIReader(),
		selfHealingGracePeriod: selfHealingGracePeriod,
	}

	bldr := ctrl.NewControllerManagedBy(mgr).Named(controllerName)
	bldr = bldr.Watches(&apiv1.Composition{}, newCompositionHandler())
	bldr = bldr.Watches(&apiv1.ResourceSlice{}, newSliceHandler())
	bldr = bldr.WithLogConstructor(manager.NewLogConstructor(mgr, controllerName))

	return bldr.Complete(reconciler)
}

// Reconcile checks whether any resource slice referenced by the composition's
// current synthesis has been deleted and, if so, sets the composition's
// PendingResynthesis status to trigger resynthesis.
//
// Compositions that are not eligible for resynthesis (see
// notEligibleForResynthesis) are requeued instead: after the remainder of the
// grace period when the last synthesis completed less than one grace period
// ago, otherwise after a full grace period.
func (s *sliceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := logr.FromContextOrDiscard(ctx)

	comp := &apiv1.Composition{}
	err := s.client.Get(ctx, req.NamespacedName, comp)
	if err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(fmt.Errorf("getting composition: %w", err))
	}

	// The synthesizer object itself is not used below; the lookup only gates
	// the rest of the reconcile on the synthesizer being readable.
	syn := &apiv1.Synthesizer{}
	syn.Name = comp.Spec.Synthesizer.Name
	err = s.client.Get(ctx, client.ObjectKeyFromObject(syn), syn)
	if err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(fmt.Errorf("getting synthesizer: %w", err))
	}

	logger = logger.WithValues("compositionGeneration", comp.Generation,
		"compositionName", comp.Name,
		"compositionNamespace", comp.Namespace,
		"synthesisID", comp.Status.GetCurrentSynthesisUUID())

	// Skip if the composition is not eligible for resynthesis, and check the synthesis result later.
	if notEligibleForResynthesis(comp) {
		logger.V(1).Info("not eligible for resynthesis when checking the missing resource slice")

		// Default to a full grace period. Only shorten it when the last
		// synthesis finished less than one grace period ago. The elapsed time
		// is sampled exactly once so the value that passed the check is the
		// value that is returned (sampling twice could yield a non-positive
		// RequeueAfter).
		requeueAfter := s.selfHealingGracePeriod
		if cur := comp.Status.CurrentSynthesis; cur != nil && cur.Synthesized != nil {
			if remaining := s.selfHealingGracePeriod - time.Since(cur.Synthesized.Time); remaining > 0 {
				requeueAfter = remaining
			}
		}
		return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
	}

	// Check if any resource slice referenced by the composition is deleted.
	for _, ref := range comp.Status.CurrentSynthesis.ResourceSlices {
		slice := &apiv1.ResourceSlice{}
		slice.Name = ref.Name
		slice.Namespace = comp.Namespace
		err := s.client.Get(ctx, client.ObjectKeyFromObject(slice), slice)
		if err == nil {
			continue
		}
		if !errors.IsNotFound(err) {
			return ctrl.Result{}, fmt.Errorf("getting resource slice: %w", err)
		}

		// Ensure the resource slice is really missing by re-checking with the non-cached reader.
		isMissing, err := s.isSliceMissing(ctx, slice)
		if err != nil {
			return ctrl.Result{}, err
		}
		if !isMissing {
			continue
		}

		// The resource slice should not be deleted if it is still referenced by the composition.
		// Update the composition status to trigger re-synthesis process. One missing slice is
		// enough, so the remaining references are not inspected.
		// compositionName is already attached to the logger via WithValues above.
		logger.V(1).Info("found missing resource slice and start resynthesis", "resourceSliceName", ref.Name)
		comp.Status.PendingResynthesis = ptr.To(metav1.Now())
		err = s.client.Status().Update(ctx, comp)
		if err != nil {
			return ctrl.Result{}, fmt.Errorf("updating composition pending resynthesis: %w", err)
		}
		return ctrl.Result{}, nil
	}

	return ctrl.Result{}, nil
}

// isSliceMissing looks the slice up through noCacheReader and reports whether
// that lookup returned a not-found error. Any other lookup error is wrapped
// and returned with a false result.
func (s *sliceController) isSliceMissing(ctx context.Context, slice *apiv1.ResourceSlice) (bool, error) {
	key := client.ObjectKeyFromObject(slice)

	switch err := s.noCacheReader.Get(ctx, key, slice); {
	case err == nil:
		return false, nil
	case errors.IsNotFound(err):
		return true, nil
	default:
		return false, fmt.Errorf("getting resource slice from non cache reader: %w", err)
	}
}

// notEligibleForResynthesis reports whether the composition must NOT be
// resynthesized right now. That is the case when any of the following holds:
//   - it has no current synthesis, or the current synthesis has no Synthesized timestamp
//   - it has a deletion timestamp
//   - it already has PendingResynthesis set
//
// Otherwise the composition may be resynthesized when a referenced resource slice is deleted.
func notEligibleForResynthesis(comp *apiv1.Composition) bool {
	current := comp.Status.CurrentSynthesis
	if current == nil || current.Synthesized == nil {
		return true
	}
	if comp.DeletionTimestamp != nil {
		return true
	}
	return comp.Status.PendingResynthesis != nil
}

// newCompositionHandler returns an event handler that enqueues a reconcile
// request for a composition whenever it is updated. Create and delete events
// are intentionally ignored.
func newCompositionHandler() handler.EventHandler {
	enqueue := func(ctx context.Context, queue workqueue.RateLimitingInterface, obj client.Object) {
		comp, isComposition := obj.(*apiv1.Composition)
		if !isComposition {
			logr.FromContextOrDiscard(ctx).V(0).Info("unexpected type given to newCompositionHandler")
			return
		}

		key := types.NamespacedName{Namespace: comp.Namespace, Name: comp.Name}
		queue.Add(reconcile.Request{NamespacedName: key})
	}

	return &handler.Funcs{
		UpdateFunc: func(ctx context.Context, ue event.UpdateEvent, queue workqueue.RateLimitingInterface) {
			// Only the new state of the composition is of interest.
			enqueue(ctx, queue, ue.ObjectNew)
		},
		CreateFunc: func(ctx context.Context, ce event.CreateEvent, queue workqueue.RateLimitingInterface) {
			// Composition creation is not handled for now.
		},
		DeleteFunc: func(ctx context.Context, de event.DeleteEvent, queue workqueue.RateLimitingInterface) {
			// Composition deletion is not handled for now.
		},
	}
}

// newSliceHandler returns an event handler that reacts only to resource slice
// deletions. For a deleted slice that has a controller owner reference, it
// enqueues a reconcile request named after that owner in the slice's namespace.
// Create and update events are intentionally ignored.
func newSliceHandler() handler.EventHandler {
	enqueueOwner := func(queue workqueue.RateLimitingInterface, deleted client.Object) {
		controllerRef := metav1.GetControllerOf(deleted)
		if controllerRef == nil {
			// A slice without a controller owner has nothing to reconcile.
			return
		}

		// The owner's name is used as the composition name to check for the missing slice.
		key := types.NamespacedName{
			Namespace: deleted.GetNamespace(),
			Name:      controllerRef.Name,
		}
		queue.Add(reconcile.Request{NamespacedName: key})
	}

	return &handler.Funcs{
		DeleteFunc: func(ctx context.Context, de event.DeleteEvent, queue workqueue.RateLimitingInterface) {
			enqueueOwner(queue, de.Object)
		},
		CreateFunc: func(ctx context.Context, ce event.CreateEvent, queue workqueue.RateLimitingInterface) {
			// Slice creation is not handled for now.
		},
		UpdateFunc: func(ctx context.Context, ue event.UpdateEvent, queue workqueue.RateLimitingInterface) {
			// Slice updates are not handled for now.
		},
	}
}
Loading

0 comments on commit 02e1b9d

Please sign in to comment.