Skip to content

Commit

Permalink
emit an event when an unhealthy appwrapper is being reset (#207)
Browse files: browse the repository at this point in the history
  • Loading branch information
dgrove-oss authored Jul 19, 2024
1 parent 374aa61 commit e76ea7e
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 14 deletions.
27 changes: 19 additions & 8 deletions internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/record"

ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand All @@ -54,8 +55,9 @@ const (
// AppWrapperReconciler reconciles an appwrapper.
// It embeds a controller-runtime client and carries the event recorder,
// scheme, and configuration that the reconciler is constructed with.
// NOTE(review): Scheme and Config are listed twice below; this looks like the
// removed and the re-aligned added lines of a diff rendered without +/- markers.
// Confirm against the actual file, where each field should be declared once.
type AppWrapperReconciler struct {
client.Client
Scheme *runtime.Scheme
Config *config.AppWrapperConfig
// Recorder is the event recorder used to emit events about the AppWrapper
// being reconciled (called as r.Recorder.Event / r.Recorder.Eventf).
Recorder record.EventRecorder
Scheme *runtime.Scheme
Config *config.AppWrapperConfig
}

type podStatusSummary struct {
Expand Down Expand Up @@ -210,12 +212,14 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
}
detailMsg := fmt.Sprintf("error creating components: %v", err)
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "CreateFailed",
Message: fmt.Sprintf("error creating components: %v", err),
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "CreateFailed: "+detailMsg)
if fatal {
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperFailed) // always move to failed on fatal error
} else {
Expand All @@ -240,25 +244,29 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}

// Detect externally deleted components and transition to Failed with no GracePeriod or retry
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
if compStatus.deployed != compStatus.expected {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected),
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperFailed)
}

// If a component's controller has put it into a failed state, we do not need
// to allow any further grace period. The situation will not self-correct.
detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed)
if compStatus.failed > 0 {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "FailedComponent",
Message: fmt.Sprintf("Found %v failed components", compStatus.failed),
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "FailedComponent: "+detailMsg)
return r.resetOrFail(ctx, aw, podStatus.terminalFailure, 1)
}

Expand Down Expand Up @@ -297,20 +305,22 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if now.Before(deadline) {
return ctrl.Result{RequeueAfter: deadline.Sub(now)}, r.Status().Update(ctx, aw)
} else {
r.Recorder.Eventf(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "FoundFailedPods: %v failed pods", podStatus.failed)
return r.resetOrFail(ctx, aw, podStatus.terminalFailure, 1)
}
}

// Initiate migration of workloads that are using resources that Autopilot has flagged as unhealthy
detailMsg = fmt.Sprintf("Workload contains pods using unhealthy resources on Nodes: %v", podStatus.unhealthyNodes)
if len(podStatus.unhealthyNodes) > 0 {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "AutopilotUnhealthy",
Message: fmt.Sprintf("Workload contains pods using unhealthy resources on Nodes: %v", podStatus.unhealthyNodes),
Message: detailMsg,
})
// Autopilot triggered evacuation does not increment retry count
return r.resetOrFail(ctx, aw, false, 0)
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), detailMsg)
return r.resetOrFail(ctx, aw, false, 0) // Autopilot triggered evacuation does not increment retry count
}

clearCondition(aw, workloadv1beta2.Unhealthy, "FoundNoFailedPods", "")
Expand Down Expand Up @@ -344,6 +354,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
Reason: "InsufficientPodsReady",
Message: podDetailsMessage,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "InsufficientPodsReady: "+podDetailsMessage)
return r.resetOrFail(ctx, aw, podStatus.terminalFailure, 1)
}

Expand Down
8 changes: 5 additions & 3 deletions internal/controller/appwrapper/appwrapper_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
Expand Down Expand Up @@ -61,9 +62,10 @@ var _ = Describe("AppWrapper Controller", func() {
awConfig.FaultTolerance.RetryPausePeriod = 0 * time.Second
awConfig.FaultTolerance.RetryLimit = 0
awReconciler = &AppWrapperReconciler{
Client: k8sClient,
Scheme: k8sClient.Scheme(),
Config: awConfig,
Client: k8sClient,
Recorder: &record.FakeRecorder{},
Scheme: k8sClient.Scheme(),
Config: awConfig,
}
kueuePodSets = (*workload.AppWrapper)(aw).PodSets()

Expand Down
7 changes: 4 additions & 3 deletions pkg/controller/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ func SetupControllers(mgr ctrl.Manager, awConfig *config.AppWrapperConfig) error
}

if err := (&appwrapper.AppWrapperReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Config: awConfig,
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("appwrappers"),
Scheme: mgr.GetScheme(),
Config: awConfig,
}).SetupWithManager(mgr); err != nil {
return fmt.Errorf("appwrapper controller: %w", err)
}
Expand Down

0 comments on commit e76ea7e

Please sign in to comment.