PWX-38103 Ignore unhealthy nodes if a healthy one is found
for the same k8s node. If one healthy node is found, then there is a PX node
which is healthy on that k8s node. The unhealthy ones are present because they
were previously running on that k8s node. Such nodes will either find a new
k8s node or, if they are storageless nodes, get auto-decommissioned.

Signed-off-by: Piyush Nimbalkar <[email protected]>
Piyush Nimbalkar authored and piyush-nimbalkar committed Aug 9, 2024
1 parent 8081f09 commit 344757d
Showing 2 changed files with 19 additions and 2 deletions.
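For context, the following is a minimal, standalone sketch of the selection behavior this commit introduces in getPodAvailability: when several StorageNode entries share the same scheduler (k8s) node name, a healthy entry is preferred over stale unhealthy ones. The storageNode struct, status values, and pickHealthyPerK8sNode helper below are simplified illustrations, not the operator's actual storageapi types or code.

package main

import "fmt"

// Simplified stand-ins for the operator's storageapi node and status types.
type status int

const (
	statusOK status = iota
	statusOffline
)

type storageNode struct {
	ID                string
	SchedulerNodeName string
	Status            status
}

// pickHealthyPerK8sNode keeps one entry per scheduler (k8s) node name,
// replacing a previously seen unhealthy entry if a healthy one shows up.
func pickHealthyPerK8sNode(nodes []storageNode) map[string]storageNode {
	nodeMap := make(map[string]storageNode)
	for _, n := range nodes {
		existing, ok := nodeMap[n.SchedulerNodeName]
		if !ok {
			nodeMap[n.SchedulerNodeName] = n
			continue
		}
		// Prefer the healthy node over a stale, unhealthy duplicate.
		if existing.Status != statusOK && n.Status == statusOK {
			nodeMap[n.SchedulerNodeName] = n
		}
	}
	return nodeMap
}

func main() {
	nodes := []storageNode{
		{ID: "old-px-node-1", SchedulerNodeName: "k8s-node-2", Status: statusOffline},
		{ID: "new-px-node", SchedulerNodeName: "k8s-node-2", Status: statusOK},
		{ID: "old-px-node-2", SchedulerNodeName: "k8s-node-2", Status: statusOffline},
	}
	// Prints "new-px-node": only the healthy duplicate is kept for k8s-node-2.
	fmt.Println(pickHealthyPerK8sNode(nodes)["k8s-node-2"].ID)
}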
7 changes: 6 additions & 1 deletion pkg/controller/storagecluster/controller_test.go
@@ -4449,7 +4449,12 @@ func TestUpdateStorageClusterBasedOnStorageNodeStatuses(t *testing.T) {
var storageNodes []*storageapi.StorageNode
storageNodes = append(storageNodes, createStorageNode("k8s-node-0", true))
storageNodes = append(storageNodes, createStorageNode("k8s-node-1", true))
// Create duplicate storage nodes for the same k8s node, two with unhealthy status and one with
// healthy status. Verify that the one with healthy status is the one considered when calculating
// which nodes to upgrade next.
storageNodes = append(storageNodes, createStorageNode("k8s-node-2", false))
storageNodes = append(storageNodes, createStorageNode("k8s-node-2", true))
storageNodes = append(storageNodes, createStorageNode("k8s-node-2", false))
storageNodes = append(storageNodes, createStorageNode("not-k8s-node", false))

driver.EXPECT().Validate(gomock.Any()).Return(nil).AnyTimes()
@@ -4508,7 +4513,7 @@ func TestUpdateStorageClusterBasedOnStorageNodeStatuses(t *testing.T) {
require.Empty(t, podControl.DeletePodName)

// TestCase: Mark the unhealthy storage node as healthy; the update should begin.
storageNodes[3].Status = storageapi.Status_STATUS_OK
storageNodes[5].Status = storageapi.Status_STATUS_OK
result, err = controller.Reconcile(context.TODO(), request)
require.NoError(t, err)
require.Empty(t, result)
14 changes: 13 additions & 1 deletion pkg/controller/storagecluster/update.go
@@ -661,7 +661,19 @@ func (c *Controller) getPodAvailability(
		if len(storageNode.SchedulerNodeName) == 0 {
			nonK8sStorageNodes = append(nonK8sStorageNodes, storageNode)
		} else {
			storageNodeMap[storageNode.SchedulerNodeName] = storageNode
			if existing, ok := storageNodeMap[storageNode.SchedulerNodeName]; ok {
				// During a k8s upgrade, a new node may start on the same k8s host using a scheduler
				// node name that was previously used by a different node. In that case, if one of the
				// storage nodes is healthy, we can ignore the old unhealthy node: it will either find
				// a new k8s node, or it will be a storageless node that gets auto-decommissioned.
				// It is unlikely that a storageless node will replace a storage node on the same k8s
				// node, but if it does, refer to https://purestorage.atlassian.net/browse/PWX-38505.
				if existing.Status != storageapi.Status_STATUS_OK && storageNode.Status == storageapi.Status_STATUS_OK {
					storageNodeMap[storageNode.SchedulerNodeName] = storageNode
				}
			} else {
				storageNodeMap[storageNode.SchedulerNodeName] = storageNode
			}
		}
	}
}
