Skip to content

Commit

Permalink
[YUNIKORN-1994] fix release containers metrix in queue (#665)
Browse files Browse the repository at this point in the history
Closes: #665

Signed-off-by: Manikandan R <[email protected]>
  • Loading branch information
FrankYang0529 authored and manirajv06 committed Oct 3, 2023
1 parent 54c0bf6 commit 2e63209
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 4 deletions.
1 change: 1 addition & 0 deletions pkg/metrics/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type CoreQueueMetrics interface {
IncQueueApplicationsCompleted()
IncAllocatedContainer()
IncReleasedContainer()
AddReleasedContainers(value int)
SetQueueGuaranteedResourceMetrics(resourceName string, value float64)
SetQueueMaxResourceMetrics(resourceName string, value float64)
SetQueueAllocatedResourceMetrics(resourceName string, value float64)
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ func (m *QueueMetrics) IncReleasedContainer() {
m.appMetrics.With(prometheus.Labels{"state": "released"}).Inc()
}

func (m *QueueMetrics) AddReleasedContainers(value int) {
m.appMetrics.With(prometheus.Labels{"state": "released"}).Add(float64(value))
}

func (m *QueueMetrics) SetQueueGuaranteedResourceMetrics(resourceName string, value float64) {
m.ResourceMetrics.With(prometheus.Labels{"state": "guaranteed", "resource": resourceName}).Set(value)
}
Expand Down
8 changes: 8 additions & 0 deletions pkg/metrics/queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ func TestReleasedContainers(t *testing.T) {
verifyAppMetrics(t, "released")
}

func TestAddReleasedContainers(t *testing.T) {
cqm = getQueueMetrics()
defer unregisterQueueMetrics(t)

cqm.AddReleasedContainers(1)
verifyAppMetrics(t, "released")
}

func TestQueueGuaranteedResourceMetrics(t *testing.T) {
cqm = getQueueMetrics()
defer unregisterQueueMetrics(t)
Expand Down
6 changes: 2 additions & 4 deletions pkg/scheduler/partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -768,8 +768,6 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object
log.Log(log.SchedPartition).Warn("failed to release resources from queue",
zap.String("appID", alloc.GetApplicationID()),
zap.Error(err))
} else {
metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer()
}
// remove preempted resources
if alloc.IsPreempted() {
Expand All @@ -781,6 +779,7 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object

// the allocation is removed so add it to the list that we return
released = append(released, alloc)
metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer()
log.Log(log.SchedPartition).Info("allocation removed from node",
zap.String("nodeID", node.NodeID),
zap.String("allocationId", allocID))
Expand Down Expand Up @@ -1356,8 +1355,6 @@ func (pc *PartitionContext) removeAllocation(release *si.AllocationRelease) ([]*
zap.String("appID", appID),
zap.String("allocationId", uuid),
zap.Error(err))
} else {
metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer()
}
}
if resources.StrictlyGreaterThanZero(totalPreempting) {
Expand All @@ -1371,6 +1368,7 @@ func (pc *PartitionContext) removeAllocation(release *si.AllocationRelease) ([]*
}
// track the number of allocations, when we replace the result is no change
pc.updateAllocationCount(-len(released))
metrics.GetQueueMetrics(queue.GetQueuePath()).AddReleasedContainers(len(released))

// if the termination type is timeout, we don't notify the shim, because it's
// originated from that side
Expand Down

0 comments on commit 2e63209

Please sign in to comment.