Skip to content

Commit

Permalink
Fixing Module CR deletion during node reboot (rh-ecosystem-edge#1261)
Browse files Browse the repository at this point in the history
If a node is rebooted (for any reason) while the Module CR is being
deleted, the following sequence happens:
1) node becomes NotReady
2) module-nmc controller removes Spec from NMC
3) nmc controller does not schedule any unloader pod, since the node is
   not ready
4) node becomes ready
5) nmc controller still does not schedule an unloader pod: after a node
   reboot it skips unloading, since the kernel module is not loaded
   anyway
6) no unloader pod runs, so the Status in the NMC is never deleted, and
   the Module CR cannot be finalized while that Status is present

Solution:
When the nmc controller processes a module that has a Status but no Spec,
and the node has been rebooted, it deletes that Status from the NMC.
  • Loading branch information
yevgeny-shnaidman authored Dec 8, 2024
1 parent 4edd955 commit 4ee39c5
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
6 changes: 4 additions & 2 deletions internal/controllers/nmc_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,10 @@ func (h *nmcReconcilerHelperImpl) ProcessUnconfiguredModuleStatus(
it also fixes the scenario when node's kernel was upgraded, so unload pod will fail anyway
*/
if h.nodeAPI.NodeBecomeReadyAfter(node, status.LastTransitionTime) {
logger.Info("node was rebooted, no need to unload kernel module that is not present in kernel, will wait until NMC spec is updated")
return nil
logger.Info("node was rebooted and spec is missing: delete the status to allow Module CR unload, if needed")
patchFrom := client.MergeFrom(nmcObj.DeepCopy())
nmc.RemoveModuleStatus(&nmcObj.Status.Modules, status.Namespace, status.Name)
return h.client.Status().Patch(ctx, nmcObj, patchFrom)
}

pod, err := h.pm.GetWorkerPod(ctx, podName, status.Namespace)
Expand Down
9 changes: 8 additions & 1 deletion internal/controllers/nmc_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,8 @@ var _ = Describe("nmcReconcilerHelperImpl_ProcessUnconfiguredModuleStatus", func
podName = workerPodName(nmcName, name)

client *testclient.MockClient
sw *testclient.MockStatusWriter

pm *MockpodManager
nm *node.MockNode
helper nmcReconcilerHelper
Expand All @@ -789,6 +791,7 @@ var _ = Describe("nmcReconcilerHelperImpl_ProcessUnconfiguredModuleStatus", func
BeforeEach(func() {
ctrl := gomock.NewController(GinkgoT())
client = testclient.NewMockClient(ctrl)
sw = testclient.NewMockStatusWriter(ctrl)
pm = NewMockpodManager(ctrl)
nm = node.NewMockNode(ctrl)
helper = newNMCReconcilerHelper(client, pm, nil, nm)
Expand All @@ -808,7 +811,11 @@ var _ = Describe("nmcReconcilerHelperImpl_ProcessUnconfiguredModuleStatus", func
node := v1.Node{}

It("should do nothing , if the node has been rebooted/ready lately", func() {
nm.EXPECT().NodeBecomeReadyAfter(&node, status.LastTransitionTime).Return(true)
gomock.InOrder(
nm.EXPECT().NodeBecomeReadyAfter(&node, status.LastTransitionTime).Return(true),
client.EXPECT().Status().Return(sw),
sw.EXPECT().Patch(ctx, nmc, gomock.Any()),
)

Expect(
helper.ProcessUnconfiguredModuleStatus(ctx, nmc, status, &node),
Expand Down

0 comments on commit 4ee39c5

Please sign in to comment.