Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Cherry pick] Fix InferenceService state when Predictor pod in CrashLoopBackOff (… #465

Open
wants to merge 1 commit into
base: release-v0.14
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions pkg/apis/serving/v1beta1/inference_service_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,50 +565,75 @@ func (ss *InferenceServiceStatus) SetModelFailureInfo(info *FailureInfo) bool {
return true
}

func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatusSpec, podList *v1.PodList, rawDeployment bool) {
func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatusSpec, podList *v1.PodList, rawDeployment bool, serviceStatus *knservingv1.ServiceStatus) bool {
// Check at least one pod is running for the latest revision of inferenceservice
totalCopies := len(podList.Items)
if totalCopies == 0 {
ss.UpdateModelRevisionStates(Pending, totalCopies, nil)
return
}
// Update model state to 'Loaded' if inferenceservice status is ready.
// For serverless deployment, the latest created revision and the latest ready revision should be equal
if ss.IsReady() {
if rawDeployment {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return
} else if statusSpec.LatestCreatedRevision == statusSpec.LatestReadyRevision {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return
if !rawDeployment {
// Make sure we haven't scaled down to 0 because of an error
for _, knativeCond := range serviceStatus.Conditions {
if knativeCond.Status == "False" {
// If any of the knative statuses are False, the model failed
// Hopefully the lastFailureInfo already has the info we need, so we don't update it here
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, nil)
return true
}
}
}

// If we made it here then hopefully there are 0 pods because we're just getting started and therefore Pending seems appropriate
ss.UpdateModelRevisionStates(Pending, totalCopies, nil)
return true
}

// Update model state to 'Loading' if storage initializer is running.
// If the storage initializer is terminated due to error or crashloopbackoff, update model
// state to 'ModelLoadFailed' with failure info.
for _, cs := range podList.Items[0].Status.InitContainerStatuses {
if cs.Name == constants.StorageInitializerContainerName {
switch {
case cs.State.Running != nil:
ss.UpdateModelRevisionStates(Loading, totalCopies, nil)
return
// Double check that we aren't missing an error because the cs is looping between an error state and running
if cs.LastTerminationState.Terminated != nil {
// Requeue the Predictor reconcile loop if there was a previous error reported
// to make sure we aren't just temporarily in the cs.State.Running case before moving to the cs.State.Terminated case
return false
} else {
// If there is no previous error, we should be okay to move into the Loading state
ss.UpdateModelRevisionStates(Loading, totalCopies, nil)
return true
}

case cs.State.Terminated != nil && cs.State.Terminated.Reason == constants.StateReasonError:
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.State.Terminated.Message,
ExitCode: cs.State.Terminated.ExitCode,
})
return
return true
case cs.State.Waiting != nil && cs.State.Waiting.Reason == constants.StateReasonCrashLoopBackOff:
ss.UpdateModelRevisionStates(FailedToLoad, totalCopies, &FailureInfo{
Reason: ModelLoadFailed,
Message: cs.LastTerminationState.Terminated.Message,
ExitCode: cs.LastTerminationState.Terminated.ExitCode,
})
return
return true
}
}
}

// Update model state to 'Loaded' if inferenceservice status is ready.
// For serverless deployment, the latest created revision and the latest ready revision should be equal
if ss.IsReady() {
if rawDeployment {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return true
} else if statusSpec.LatestCreatedRevision == statusSpec.LatestReadyRevision {
ss.UpdateModelRevisionStates(Loaded, totalCopies, nil)
return true
}
}

// If the kserve container is terminated due to error or crashloopbackoff, update model
// state to 'ModelLoadFailed' with failure info.
for _, cs := range podList.Items[0].Status.ContainerStatuses {
Expand All @@ -631,4 +656,5 @@ func (ss *InferenceServiceStatus) PropagateModelStatus(statusSpec ComponentStatu
}
}
}
return true
}
Loading
Loading