Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add task status named ReleasingFailed #2922

Closed
wants to merge 13 commits into from
Closed
8 changes: 8 additions & 0 deletions cmd/scheduler/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ const (
defaultMinNodesToFind = 100
defaultPercentageOfNodesToFind = 100
defaultLockObjectNamespace = "volcano-system"
defaultGracePeriodSeconds = 30
defaultGracePeriodSecondsWait = 3
)

// ServerOption is the main context object for the controller manager.
Expand Down Expand Up @@ -75,6 +77,9 @@ type ServerOption struct {

NodeSelector []string
EnableCacheDumper bool

GracePeriodSeconds int64
GracePeriodSecondsWait int64
}

type DecryptFunc func(c *ServerOption) error
Expand Down Expand Up @@ -128,6 +133,9 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.EnableMetrics, "enable-metrics", false, "Enable the metrics function; it is false by default")
fs.StringSliceVar(&s.NodeSelector, "node-selector", nil, "volcano only work with the labeled node, like: --node-selector=volcano.sh/role:train --node-selector=volcano.sh/role:serving")
fs.BoolVar(&s.EnableCacheDumper, "cache-dumper", true, "Enable the cache dumper, it's true by default")

fs.Int64Var(&s.GracePeriodSeconds, "grace-period", defaultGracePeriodSeconds, "the default second grace period seconds from pod")
fs.Int64Var(&s.GracePeriodSecondsWait, "grace-period-wait", defaultGracePeriodSecondsWait, "wait time from pod send sig kill to delete pod")
}

// CheckOptionOrDie checks lock-object-namespace when LeaderElection is enabled.
Expand Down
21 changes: 18 additions & 3 deletions pkg/scheduler/api/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package api

import (
"fmt"
"time"
"volcano.sh/volcano/cmd/scheduler/app/options"

v1 "k8s.io/api/core/v1"
clientcache "k8s.io/client-go/tools/cache"
Expand All @@ -33,16 +35,29 @@ func PodKey(pod *v1.Pod) TaskID {
}

func getTaskStatus(pod *v1.Pod) TaskStatus {
opts := options.ServerOpts
waitTime := opts.GracePeriodSeconds
if pod.Spec.TerminationGracePeriodSeconds != nil {
// the pod's own termination grace period overrides the configured default
waitTime = *pod.Spec.TerminationGracePeriodSeconds
}
waitTime += opts.GracePeriodSecondsWait
switch pod.Status.Phase {
case v1.PodRunning:
if pod.DeletionTimestamp != nil {
if pod.DeletionTimestamp != nil &&
time.Now().Unix()-pod.DeletionTimestamp.Unix() <= waitTime {
return Releasing
} else if pod.DeletionTimestamp != nil {
return ReleasingFailed
}

return Running
case v1.PodPending:
if pod.DeletionTimestamp != nil {
if pod.DeletionTimestamp != nil &&
time.Now().Unix()-pod.DeletionTimestamp.Unix() <= waitTime {
return Releasing
} else if pod.DeletionTimestamp != nil {
return ReleasingFailed
}

if len(pod.Spec.NodeName) == 0 {
Expand All @@ -63,7 +78,7 @@ func getTaskStatus(pod *v1.Pod) TaskStatus {
// AllocatedStatus checks whether the given task status counts as an allocated status.
func AllocatedStatus(status TaskStatus) bool {
switch status {
case Bound, Binding, Running, Allocated:
case Bound, Binding, Running, Allocated, ReleasingFailed:
return true
default:
return false
Expand Down
1 change: 1 addition & 0 deletions pkg/scheduler/api/job_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,7 @@ func (ji *JobInfo) ReadyTaskNum() int32 {
occupied += len(ji.TaskStatusIndex[Bound])
occupied += len(ji.TaskStatusIndex[Binding])
occupied += len(ji.TaskStatusIndex[Running])
occupied += len(ji.TaskStatusIndex[ReleasingFailed])
occupied += len(ji.TaskStatusIndex[Allocated])
occupied += len(ji.TaskStatusIndex[Succeeded])

Expand Down
5 changes: 5 additions & 0 deletions pkg/scheduler/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ const (
// Releasing means a task/pod is deleted.
Releasing

// ReleasingFailed means a task/pod was marked for deletion but was not removed within its grace period.
ReleasingFailed

// Succeeded means that all containers in the pod have voluntarily terminated
// with a container exit code of 0, and the system is not going to restart any of these containers.
Succeeded
Expand Down Expand Up @@ -73,6 +76,8 @@ func (ts TaskStatus) String() string {
return "Running"
case Releasing:
return "Releasing"
case ReleasingFailed:
return "ReleasingFailed"
case Succeeded:
return "Succeeded"
case Failed:
Expand Down
5 changes: 5 additions & 0 deletions pkg/scheduler/framework/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ func updateQueueStatus(ssn *Session) {
for _, runningTask := range job.TaskStatusIndex[api.Running] {
allocatedResources[job.Queue].Add(runningTask.Resreq)
}
for _, runningTask := range job.TaskStatusIndex[api.ReleasingFailed] {
allocatedResources[job.Queue].Add(runningTask.Resreq)
}
}

// update queue status
Expand Down Expand Up @@ -277,6 +280,8 @@ func jobStatus(ssn *Session, jobInfo *api.JobInfo) scheduling.PodGroupStatus {
}
}

// TODO: decide whether status.Running should also count pods in the ReleasingFailed state.
status.Running = int32(len(jobInfo.TaskStatusIndex[api.Running]))
status.Failed = int32(len(jobInfo.TaskStatusIndex[api.Failed]))
status.Succeeded = int32(len(jobInfo.TaskStatusIndex[api.Succeeded]))
Expand Down