diff --git a/pkg/metrics/init.go b/pkg/metrics/init.go index 341bb2306..e7cdeb56b 100644 --- a/pkg/metrics/init.go +++ b/pkg/metrics/init.go @@ -54,6 +54,7 @@ type CoreQueueMetrics interface { IncQueueApplicationsCompleted() IncAllocatedContainer() IncReleasedContainer() + AddReleasedContainers(value int) SetQueueGuaranteedResourceMetrics(resourceName string, value float64) SetQueueMaxResourceMetrics(resourceName string, value float64) SetQueueAllocatedResourceMetrics(resourceName string, value float64) diff --git a/pkg/metrics/queue.go b/pkg/metrics/queue.go index 3d55eb0b9..a4030d816 100644 --- a/pkg/metrics/queue.go +++ b/pkg/metrics/queue.go @@ -108,6 +108,10 @@ func (m *QueueMetrics) IncReleasedContainer() { m.appMetrics.With(prometheus.Labels{"state": "released"}).Inc() } +func (m *QueueMetrics) AddReleasedContainers(value int) { + m.appMetrics.With(prometheus.Labels{"state": "released"}).Add(float64(value)) +} + func (m *QueueMetrics) SetQueueGuaranteedResourceMetrics(resourceName string, value float64) { m.ResourceMetrics.With(prometheus.Labels{"state": "guaranteed", "resource": resourceName}).Set(value) } diff --git a/pkg/metrics/queue_test.go b/pkg/metrics/queue_test.go index 7ede358eb..ae17ed17f 100644 --- a/pkg/metrics/queue_test.go +++ b/pkg/metrics/queue_test.go @@ -86,6 +86,14 @@ func TestReleasedContainers(t *testing.T) { verifyAppMetrics(t, "released") } +func TestAddReleasedContainers(t *testing.T) { + cqm = getQueueMetrics() + defer unregisterQueueMetrics(t) + + cqm.AddReleasedContainers(1) + verifyAppMetrics(t, "released") +} + func TestQueueGuaranteedResourceMetrics(t *testing.T) { cqm = getQueueMetrics() defer unregisterQueueMetrics(t) diff --git a/pkg/scheduler/partition.go b/pkg/scheduler/partition.go index f4bde125d..f08558d13 100644 --- a/pkg/scheduler/partition.go +++ b/pkg/scheduler/partition.go @@ -768,8 +768,6 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object log.Log(log.SchedPartition).Warn("failed to release resources from queue", zap.String("appID", alloc.GetApplicationID()), zap.Error(err)) - } else { - metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer() } // remove preempted resources if alloc.IsPreempted() { @@ -781,6 +779,7 @@ func (pc *PartitionContext) removeNodeAllocations(node *objects.Node) ([]*object // the allocation is removed so add it to the list that we return released = append(released, alloc) + metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer() log.Log(log.SchedPartition).Info("allocation removed from node", zap.String("nodeID", node.NodeID), zap.String("allocationId", allocID)) @@ -1356,8 +1355,6 @@ func (pc *PartitionContext) removeAllocation(release *si.AllocationRelease) ([]* zap.String("appID", appID), zap.String("allocationId", uuid), zap.Error(err)) - } else { - metrics.GetQueueMetrics(queue.GetQueuePath()).IncReleasedContainer() } } if resources.StrictlyGreaterThanZero(totalPreempting) { @@ -1371,6 +1368,7 @@ func (pc *PartitionContext) removeAllocation(release *si.AllocationRelease) ([]* } // track the number of allocations, when we replace the result is no change pc.updateAllocationCount(-len(released)) + metrics.GetQueueMetrics(queue.GetQueuePath()).AddReleasedContainers(len(released)) // if the termination type is timeout, we don't notify the shim, because it's // originated from that side