From 92bbd54e86d63419fba30f0a0bd75cca1114df9a Mon Sep 17 00:00:00 2001 From: Hannah DeFazio Date: Wed, 11 Dec 2024 23:26:36 -0500 Subject: [PATCH] Fix InferenceService state when Predictor pod in CrashLoopBackOff (#4003) * Requeue and then double check the Pending status Signed-off-by: Hannah DeFazio * Add test case, fix old tests Signed-off-by: Hannah DeFazio * Check the retun value for PropagateModelStatus, add knative failure case Signed-off-by: Hannah DeFazio --------- Signed-off-by: Hannah DeFazio Co-authored-by: Hannah DeFazio --- .../v1beta1/inference_service_status.go | 60 ++-- .../v1beta1/inference_service_status_test.go | 263 +++++++++++++++++- .../inferenceservice/components/predictor.go | 14 +- 3 files changed, 313 insertions(+), 24 deletions(-) diff --git a/pkg/apis/serving/v1beta1/inference_service_status.go b/pkg/apis/serving/v1beta1/inference_service_status.go index cce0812b795..0373c0e3d2f 100644 --- a/pkg/apis/serving/v1beta1/inference_service_status.go +++ b/pkg/apis/serving/v1beta1/inference_service_status.go @@ -565,24 +565,27 @@ func (ss *InferenceServiceStatus) SetModelFailureInfo(info *FailureInfo) bool { return true } -func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatusSpec, podList *v1.PodList, rawDeployment bool) { +func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatusSpec, podList *v1.PodList, rawDeployment bool, serviceStatus *knservingv1.ServiceStatus) bool { // Check at least one pod is running for the latest revision of inferenceservice totalCopies := len(podList.Items) if totalCopies == 0 { - ss.UpdateModelRevisionStates(Pending, totalCopies, nil) - return - } - // Update model state to 'Loaded' if inferenceservice status is ready. - // For serverless deployment, the latest created revision and the latest ready revision should be equal - if ss.IsReady() { - if rawDeployment { - ss.UpdateModelRevisionStates(Loaded, totalCopies, nil) - return - } else if statusSpec.LatestCreatedRevision == statusSpec.LatestReadyRevision { - ss.UpdateModelRevisionStates(Loaded, totalCopies, nil) - return + if !rawDeployment { + // Make sure we haven't scaled down to 0 because of an error + for _, knativeCond := range serviceStatus.Conditions { + if knativeCond.Status == "False" { + // If any of the knative statuses are False, the model failed + // Hopefully the lastFailureInfo already has the info we need, so we don't update it here + ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, nil) + return true + } + } } + + // If we made it here then hopefully there are 0 pods because we're just getting started and therefore Pending seems appropriate + ss.UpdateModelRevisionStates(Pending, totalCopies, nil) + return true } + // Update model state to 'Loading' if storage initializer is running. // If the storage initializer is terminated due to error or crashloopbackoff, update model // state to 'ModelLoadFailed' with failure info. @@ -590,25 +593,47 @@ func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatu if cs.Name == constants.StorageInitializerContainerName { switch { case cs.State.Running != nil: - ss.UpdateModelRevisionStates(Loading, totalCopies, nil) - return + // Double check that we aren't missing an error because the cs is looping between an error state and running + if cs.LastTerminationState.Terminated != nil { + // Requeue the Predictor reconcile loop if there was a previous error reported + // to make sure we aren't just temporarily in the cs.State.Running case before moving to the cs.State.Terminated case + return false + } else { + // If there is no previous error, we should be okay to move into the Loading state + ss.UpdateModelRevisionStates(Loading, totalCopies, nil) + return true + } + case cs.State.Terminated != nil && cs.State.Terminated.Reason == constants.StateReasonError: ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{ Reason: ModelLoadFailed, Message: cs.State.Terminated.Message, ExitCode: cs.State.Terminated.ExitCode, }) - return + return true case cs.State.Waiting != nil && cs.State.Waiting.Reason == constants.StateReasonCrashLoopBackOff: ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{ Reason: ModelLoadFailed, Message: cs.LastTerminationState.Terminated.Message, ExitCode: cs.LastTerminationState.Terminated.ExitCode, }) - return + return true } } } + + // Update model state to 'Loaded' if inferenceservice status is ready. + // For serverless deployment, the latest created revision and the latest ready revision should be equal + if ss.IsReady() { + if rawDeployment { + ss.UpdateModelRevisionStates(Loaded, totalCopies, nil) + return true + } else if statusSpec.LatestCreatedRevision == statusSpec.LatestReadyRevision { + ss.UpdateModelRevisionStates(Loaded, totalCopies, nil) + return true + } + } + // If the kserve container is terminated due to error or crashloopbackoff, update model // state to 'ModelLoadFailed' with failure info. for _, cs := range podList.Items[0].Status.ContainerStatuses { @@ -631,4 +656,5 @@ func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatu } } } + return true } diff --git a/pkg/apis/serving/v1beta1/inference_service_status_test.go b/pkg/apis/serving/v1beta1/inference_service_status_test.go index 60f4dae216f..1306cbc174d 100644 --- a/pkg/apis/serving/v1beta1/inference_service_status_test.go +++ b/pkg/apis/serving/v1beta1/inference_service_status_test.go @@ -360,9 +360,11 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { statusSpec ComponentStatusSpec podList *v1.PodList rawDeployment bool + serviceStatus *knservingv1.ServiceStatus expectedRevisionStates *ModelRevisionStates expectedTransitionStatus TransitionStatus expectedFailureInfo *FailureInfo + expectedReturnValue bool }{ "pod list is empty": { isvcStatus: &InferenceServiceStatus{ @@ -400,12 +402,79 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { Items: []v1.Pod{}, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: Pending, }, expectedTransitionStatus: InProgress, expectedFailureInfo: nil, + expectedReturnValue: true, + }, + "pod list is empty but knative has an error": { + isvcStatus: &InferenceServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: v1.ConditionFalse, + }, + }, + }, + Address: &duckv1.Addressable{}, + URL: &apis.URL{}, + Components: map[ComponentType]ComponentStatusSpec{ + PredictorComponent: { + LatestRolledoutRevision: "test-predictor-default-0001", + }, + }, + ModelStatus: ModelStatus{}, + }, + statusSpec: ComponentStatusSpec{ + LatestReadyRevision: "", + LatestCreatedRevision: "", + PreviousRolledoutRevision: "", + LatestRolledoutRevision: "", + Traffic: nil, + URL: nil, + RestURL: nil, + GrpcURL: nil, + Address: nil, + }, + podList: &v1.PodList{ // pod list is empty because the revision failed and scaled down to 0 + TypeMeta: metav1.TypeMeta{}, + ListMeta: metav1.ListMeta{}, + Items: []v1.Pod{}, + }, + rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "False", + Reason: "RevisionFailed", + Message: "For testing", + }, + }, + }, + }, + expectedRevisionStates: &ModelRevisionStates{ + ActiveModelState: "", + TargetModelState: FailedToLoad, + }, + expectedTransitionStatus: BlockedByFailedLoad, + expectedFailureInfo: nil, + expectedReturnValue: true, }, "kserve container in pending state": { isvcStatus: &InferenceServiceStatus{ @@ -466,12 +535,23 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: Pending, }, expectedTransitionStatus: InProgress, expectedFailureInfo: nil, + expectedReturnValue: true, }, "kserve container failed due to an error": { isvcStatus: &InferenceServiceStatus{ @@ -538,6 +618,16 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: FailedToLoad, @@ -548,6 +638,7 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { Message: "For testing", ExitCode: 1, }, + expectedReturnValue: true, }, "kserve container failed due to crash loopBackOff": { isvcStatus: &InferenceServiceStatus{ @@ -619,6 +710,16 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: FailedToLoad, @@ -629,6 +730,7 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { Message: "For testing", ExitCode: 1, }, + expectedReturnValue: true, }, "storage initializer failed due to an error": { isvcStatus: &InferenceServiceStatus{ @@ -695,6 +797,16 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: FailedToLoad, @@ -705,6 +817,7 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { Message: "For testing", ExitCode: 1, }, + expectedReturnValue: true, }, "storage initializer failed due to crash loopBackOff": { isvcStatus: &InferenceServiceStatus{ @@ -776,6 +889,16 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "Unknown", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: FailedToLoad, @@ -786,6 +909,7 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { Message: "For testing", ExitCode: 1, }, + expectedReturnValue: true, }, "storage initializer in running state": { isvcStatus: &InferenceServiceStatus{ @@ -850,12 +974,109 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: "", TargetModelState: Loading, }, expectedTransitionStatus: InProgress, expectedFailureInfo: nil, + expectedReturnValue: true, + }, + "storage initializer in running state but it has a previous error": { + isvcStatus: &InferenceServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: v1.ConditionFalse, + }, + }, + }, + Address: &duckv1.Addressable{}, + URL: &apis.URL{}, + Components: map[ComponentType]ComponentStatusSpec{ + PredictorComponent: { + LatestRolledoutRevision: "test-predictor-default-0001", + }, + }, + ModelStatus: ModelStatus{}, + }, + statusSpec: ComponentStatusSpec{ + LatestReadyRevision: "", + LatestCreatedRevision: "", + PreviousRolledoutRevision: "", + LatestRolledoutRevision: "", + Traffic: nil, + URL: nil, + RestURL: nil, + GrpcURL: nil, + Address: nil, + }, + podList: &v1.PodList{ + TypeMeta: metav1.TypeMeta{}, + ListMeta: metav1.ListMeta{}, + Items: []v1.Pod{ + { + TypeMeta: metav1.TypeMeta{}, + ObjectMeta: metav1.ObjectMeta{ + Name: constants.StorageInitializerContainerName, + }, + Spec: v1.PodSpec{}, + Status: v1.PodStatus{ + InitContainerStatuses: []v1.ContainerStatus{ + { + Name: constants.StorageInitializerContainerName, + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{ + StartedAt: metav1.Time{}, + }, + }, + LastTerminationState: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + Reason: constants.StateReasonCrashLoopBackOff, + Message: "For testing", + ExitCode: 1, + }, + }, + Ready: false, + RestartCount: 0, + Image: "", + ImageID: "", + ContainerID: "", + Started: proto.Bool(true), + }, + }, + }, + }, + }, + }, + rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "False", + Reason: "RevisionFailed", + Message: "For testing", + }, + }, + }, + }, + expectedRevisionStates: nil, // This field is not changed in this use case + expectedTransitionStatus: "", // This field is not changed in this use case + expectedFailureInfo: nil, // This field is not changed in this use case + expectedReturnValue: false, }, "kserve container is ready": { isvcStatus: &InferenceServiceStatus{ @@ -917,12 +1138,23 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: Loaded, TargetModelState: Loaded, }, expectedTransitionStatus: UpToDate, expectedFailureInfo: nil, + expectedReturnValue: true, }, "raw deployment is ready": { isvcStatus: &InferenceServiceStatus{ @@ -984,12 +1216,14 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, rawDeployment: true, + serviceStatus: &knservingv1.ServiceStatus{}, expectedRevisionStates: &ModelRevisionStates{ ActiveModelState: Loaded, TargetModelState: Loaded, }, expectedTransitionStatus: UpToDate, expectedFailureInfo: nil, + expectedReturnValue: true, }, "skip containers other than kserve": { isvcStatus: &InferenceServiceStatus{ @@ -1049,10 +1283,21 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, }, - rawDeployment: false, + rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: nil, expectedTransitionStatus: "", expectedFailureInfo: nil, + expectedReturnValue: true, }, "skip initcontainers other than storage initializer": { isvcStatus: &InferenceServiceStatus{ @@ -1112,17 +1357,29 @@ func TestInferenceServiceStatus_PropagateModelStatus(t *testing.T) { }, }, }, - rawDeployment: false, + rawDeployment: false, + serviceStatus: &knservingv1.ServiceStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + }, expectedRevisionStates: nil, expectedTransitionStatus: "", expectedFailureInfo: nil, + expectedReturnValue: true, }, } for name, scenario := range scenarios { t.Run(name, func(t *testing.T) { - scenario.isvcStatus.PropagateModelStatus(scenario.statusSpec, scenario.podList, scenario.rawDeployment) + rstatus := scenario.isvcStatus.PropagateModelStatus(scenario.statusSpec, scenario.podList, scenario.rawDeployment, scenario.serviceStatus) + g.Expect(rstatus).To(gomega.Equal(scenario.expectedReturnValue)) g.Expect(scenario.isvcStatus.ModelStatus.ModelRevisionStates).To(gomega.Equal(scenario.expectedRevisionStates)) g.Expect(scenario.isvcStatus.ModelStatus.TransitionStatus).To(gomega.Equal(scenario.expectedTransitionStatus)) g.Expect(scenario.isvcStatus.ModelStatus.LastFailureInfo).To(gomega.Equal(scenario.expectedFailureInfo)) diff --git a/pkg/controller/v1beta1/inferenceservice/components/predictor.go b/pkg/controller/v1beta1/inferenceservice/components/predictor.go index 6da92325e7e..f3192ab6026 100644 --- a/pkg/controller/v1beta1/inferenceservice/components/predictor.go +++ b/pkg/controller/v1beta1/inferenceservice/components/predictor.go @@ -332,6 +332,7 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro var podLabelValue string // Here we allow switch between knative and vanilla deployment + kstatus := &knservingv1.ServiceStatus{} if p.deploymentMode == constants.RawDeployment { rawDeployment = true podLabelKey = constants.RawDeploymentAppLabel @@ -371,11 +372,13 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro if err := controllerutil.SetControllerReference(isvc, r.Service, p.scheme); err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to set owner reference for predictor") } - status, err := r.Reconcile() + + var err error + kstatus, err = r.Reconcile() if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to reconcile predictor") } - isvc.Status.PropagateStatus(v1beta1.PredictorComponent, status) + isvc.Status.PropagateStatus(v1beta1.PredictorComponent, kstatus) } statusSpec := isvc.Status.Components[v1beta1.PredictorComponent] @@ -388,8 +391,11 @@ func (p *Predictor) Reconcile(isvc *v1beta1.InferenceService) (ctrl.Result, erro if err != nil { return ctrl.Result{}, errors.Wrapf(err, "fails to list inferenceservice pods by label") } - isvc.Status.PropagateModelStatus(statusSpec, predictorPods, rawDeployment) - return ctrl.Result{}, nil + if isvc.Status.PropagateModelStatus(statusSpec, predictorPods, rawDeployment, kstatus) { + return ctrl.Result{}, nil + } else { + return ctrl.Result{Requeue: true}, nil + } } func multiNodeProcess(sRuntime v1alpha1.ServingRuntimeSpec, isvc *v1beta1.InferenceService, podSpec *v1.PodSpec, annotations map[string]string, isvcGeneration string) (*v1.PodSpec, error) {