Skip to content

Commit

Permalink
[YUNIKORN-522] Smoke test for dynamic queue clean up (#404)
Browse files Browse the repository at this point in the history
Closes: #404

Signed-off-by: Craig Condit <[email protected]>
  • Loading branch information
steinsgateted authored and craigcondit committed Apr 14, 2022
1 parent 7493411 commit 764606e
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pkg/scheduler/objects/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -1607,3 +1607,8 @@ func (sa *Application) GetAllPlaceholderData() []*PlaceholderData {
}
return placeholders
}

// SetCompletingTimeout assigns duration to completingTimeout.
//
// Test only: it lets a test shorten the timeout so it does not have to wait
// out the regular value, and restore it afterwards.
// NOTE(review): completingTimeout is declared outside this hunk; it is
// presumably a package-level variable read without synchronization, so this
// should not be called while other tests depend on it - confirm.
func SetCompletingTimeout(duration time.Duration) {
	completingTimeout = duration
}
177 changes: 177 additions & 0 deletions pkg/scheduler/tests/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1447,3 +1447,180 @@ func TestDupReleasesInGangScheduling(t *testing.T) {
ms.scheduler.MultiStepSchedule(5)
ms.mockRM.waitForAllocations(t, 1, 1000)
}

// TestDynamicQueueCleanUp checks that a queue created on demand by a placement
// rule is removed from the partition after its only application has completed.
//
// Flow: start the mock scheduler with a "fixed" placement rule that creates
// root.cleanup_test, register two nodes and one application, allocate two
// asks, release both allocations, wait for the application to move from
// Completing to Completed and finally wait for the queue to disappear.
//
//nolint:funlen
func TestDynamicQueueCleanUp(t *testing.T) {
	// The nesting matters: queues and placementrules belong to the partition entry.
	configData := `
partitions:
  - name: default
    queues:
      - name: root
        submitacl: "*"
    placementrules:
      - name: fixed
        value: cleanup_test
        create: true
`
	// Full path of the queue the placement rule is expected to create.
	leafName := "root.cleanup_test"

	// Register RM
	// Start all tests
	ms := &mockScheduler{}
	defer ms.Stop()

	err := ms.Init(configData, false)
	assert.NilError(t, err, "RegisterResourceManager failed")

	// Before any node is registered the partition total resource is expected to be nil.
	part := ms.scheduler.GetClusterContext().GetPartition(partition)
	assert.Assert(t, part.GetTotalPartitionResource() == nil, "partition info max resource nil")

	// Check the queue root
	root := part.GetQueue("root")
	assert.Assert(t, root.GetMaxResource() == nil, "root queue max resource should be nil")

	// Register two nodes
	err = ms.proxy.UpdateNode(&si.NodeRequest{
		Nodes: []*si.NodeInfo{
			{
				NodeID:     "node-1:1234",
				Attributes: map[string]string{},
				SchedulableResource: &si.Resource{
					Resources: map[string]*si.Quantity{
						"memory": {Value: 100000000},
						"vcore":  {Value: 20000},
					},
				},
				Action: si.NodeInfo_CREATE,
			},
			{
				NodeID:     "node-2:1234",
				Attributes: map[string]string{},
				SchedulableResource: &si.Resource{
					Resources: map[string]*si.Quantity{
						"memory": {Value: 100000000},
						"vcore":  {Value: 20000},
					},
				},
				Action: si.NodeInfo_CREATE,
			},
		},
		RmID: "rm:123",
	})
	assert.NilError(t, err, "NodeRequest failed")

	// Submit the application with an empty queue name: the queue is expected to
	// come from the placement rule, not from the request.
	err = ms.proxy.UpdateApplication(&si.ApplicationRequest{
		New:  newAddAppRequest(map[string]string{appID1: ""}),
		RmID: "rm:123",
	})
	assert.NilError(t, err, "ApplicationRequest failed")

	ms.mockRM.waitForAcceptedApplication(t, appID1, 1000)
	ms.mockRM.waitForAcceptedNode(t, "node-1:1234", 1000)
	ms.mockRM.waitForAcceptedNode(t, "node-2:1234", 1000)

	// Get the app
	app := ms.getApplication(appID1)

	// Get the dynamically created queue and fail cleanly if it is missing,
	// instead of hitting a nil pointer in the helpers below.
	leaf := part.GetQueue(leafName)
	assert.Assert(t, leaf != nil, "dynamic queue %s was not created", leafName)

	// Verify app initial state
	// NOTE(review): app and app01 presumably refer to the same application
	// object obtained through two different helpers - confirm before merging them.
	var app01 *objects.Application
	app01, err = getApplication(part, appID1)
	assert.NilError(t, err, "application not found")

	assert.Equal(t, app01.CurrentState(), objects.New.String())

	err = ms.proxy.UpdateAllocation(&si.AllocationRequest{
		Asks: []*si.AllocationAsk{
			{
				AllocationKey: "alloc-1",
				ResourceAsk: &si.Resource{
					Resources: map[string]*si.Quantity{
						"memory": {Value: 10000000},
						"vcore":  {Value: 1000},
					},
				},
				MaxAllocations: 2,
				ApplicationID:  appID1,
			},
		},
		RmID: "rm:123",
	})
	assert.NilError(t, err, "AllocationRequest failed")

	// Wait for the pending resource of the leaf queue, the root queue and the app.
	// Pending memory = ask memory 10000000 x MaxAllocations 2 = 20000000.
	waitForPendingQueueResource(t, leaf, 20000000, 1000)
	waitForPendingQueueResource(t, root, 20000000, 1000)
	waitForPendingAppResource(t, app, 20000000, 1000)
	assert.Equal(t, app01.CurrentState(), objects.Accepted.String())

	ms.scheduler.MultiStepSchedule(5)

	ms.mockRM.waitForAllocations(t, 2, 1000)

	// Make sure pending resource updated to 0
	waitForPendingQueueResource(t, leaf, 0, 1000)
	waitForPendingQueueResource(t, root, 0, 1000)
	waitForPendingAppResource(t, app, 0, 1000)

	// Check allocated resources of queues, apps
	assert.Equal(t, int(leaf.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "leaf allocated memory incorrect")
	assert.Equal(t, int(root.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "root allocated memory incorrect")
	assert.Equal(t, int(app.GetAllocatedResource().Resources[resources.MEMORY]), 20000000, "app allocated memory incorrect")

	// once we start to process allocation asks from this app, verify the state again
	assert.Equal(t, app01.CurrentState(), objects.Running.String())

	// Check allocated resources of nodes
	waitForAllocatedNodeResource(t, ms.scheduler.GetClusterContext(), ms.partitionName, []string{"node-1:1234", "node-2:1234"}, 20000000, 1000)

	// Build a release request covering every allocation the mock RM knows about.
	updateRequest := &si.AllocationRequest{
		Releases: &si.AllocationReleasesRequest{
			AllocationsToRelease: make([]*si.AllocationRelease, 0),
		},
		RmID: "rm:123",
	}
	for _, v := range ms.mockRM.getAllocations() {
		updateRequest.Releases.AllocationsToRelease = append(updateRequest.Releases.AllocationsToRelease, &si.AllocationRelease{
			UUID:          v.UUID,
			ApplicationID: v.ApplicationID,
			PartitionName: v.PartitionName,
		})
	}

	// Before release allocations, shorten the completingTimeout first, otherwise it will take 30 seconds for the app to become completed state.
	objects.SetCompletingTimeout(time.Millisecond * 100)
	defer objects.SetCompletingTimeout(time.Second * 30)

	// Release Allocations.
	err = ms.proxy.UpdateAllocation(updateRequest)
	assert.NilError(t, err, "AllocationRequest release failed")

	ms.mockRM.waitForAllocations(t, 0, 1000)

	// Check allocated resources of queues, apps should be 0 now
	assert.Equal(t, int(leaf.GetAllocatedResource().Resources[resources.MEMORY]), 0, "leaf allocated memory incorrect")
	assert.Equal(t, int(root.GetAllocatedResource().Resources[resources.MEMORY]), 0, "root allocated memory incorrect")
	assert.Equal(t, int(app.GetAllocatedResource().Resources[resources.MEMORY]), 0, "app allocated memory incorrect")

	// Check app to Completing status
	// NOTE(review): this assertion runs while the 100ms completing timeout set
	// above is already ticking, so it looks timing sensitive - confirm.
	assert.Equal(t, app01.CurrentState(), objects.Completing.String())
	// the app changes from completing state to completed state
	err = common.WaitFor(1*time.Millisecond, time.Millisecond*200, app.IsCompleted)
	assert.NilError(t, err, "App should be in Completed state")

	// partition manager should be able to clean up the dynamically created queue.
	// NOTE(review): the 11 second limit is presumably sized to outlast the
	// partition manager's cleanup interval - confirm.
	err = common.WaitFor(1*time.Millisecond, time.Second*11, func() bool {
		return part.GetQueue(leafName) == nil
	})
	assert.NilError(t, err, "timeout waiting for dynamic queue to be cleaned up")
}

0 comments on commit 764606e

Please sign in to comment.