Skip to content

Commit

Permalink
Reuse the core Kubernetes API reason constant for BackoffLimitExceeded
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <[email protected]>
  • Loading branch information
tenzen-y committed Oct 15, 2024
1 parent a869150 commit b89a379
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 7 deletions.
6 changes: 1 addition & 5 deletions pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,6 @@ const (
// From: k8s.io/kubernetes/pkg/apis/core/validation/events.go
eventMessageLimit = 1024

- // jobBackoffLimitExceededReason is the reason that the k8s job controller
- // uses when the backoff limit is exceeded.
- jobBackoffLimitExceededReason = "BackoffLimitExceeded"

openMPISlotsEnv = "OMPI_MCA_orte_set_default_slots"
intelMPISlotsEnv = "I_MPI_PERHOST"
)
Expand Down Expand Up @@ -1149,7 +1145,7 @@ func (c *MPIJobController) updateMPIJobFailedStatus(mpiJob *kubeflow.MPIJob, lau
if msg == "" {
msg = fmt.Sprintf("MPIJob %s/%s has failed", mpiJob.Namespace, mpiJob.Name)
}
- if reason == jobBackoffLimitExceededReason {
+ if reason == batchv1.JobReasonBackoffLimitExceeded {
// Concatenate the reason and message from the last failed Pod.
var lastFailedPod *corev1.Pod
for _, p := range launcherPods {
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/mpi_job_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ func TestLauncherFailed(t *testing.T) {
launcher.Status.Conditions = append(launcher.Status.Conditions, batchv1.JobCondition{
Type: batchv1.JobFailed,
Status: corev1.ConditionTrue,
- Reason: jobBackoffLimitExceededReason,
+ Reason: batchv1.JobReasonBackoffLimitExceeded,
Message: "Job has reached the specified backoff limit",
})
launcher.Status.Failed = 2
Expand Down Expand Up @@ -668,7 +668,7 @@ func TestLauncherFailed(t *testing.T) {
msg := fmt.Sprintf("MPIJob %s/%s is created.", mpiJob.Namespace, mpiJob.Name)
updateMPIJobConditions(mpiJobCopy, kubeflow.JobCreated, corev1.ConditionTrue, mpiJobCreatedReason, msg)
msg = "Job has reached the specified backoff limit: second message"
- updateMPIJobConditions(mpiJobCopy, kubeflow.JobFailed, corev1.ConditionTrue, jobBackoffLimitExceededReason+"/FailedReason2", msg)
+ updateMPIJobConditions(mpiJobCopy, kubeflow.JobFailed, corev1.ConditionTrue, batchv1.JobReasonBackoffLimitExceeded+"/FailedReason2", msg)

f.expectUpdateMPIJobStatusAction(mpiJobCopy)

Expand Down

0 comments on commit b89a379

Please sign in to comment.