longhorn · yangchiu · Nov 7, 2024 · Nov 7, 2024 · coderabbitai · Nov 7, 2024
@@ -69,6 +69,11 @@ Drain volume of ${workload_kind} ${workload_id} volume node
 Uncordon the drained node
     uncordon_node    ${drained_node}
 
+Cordon node ${node_id}
+    # Resolve the node name for the given index, cordon that node, then run
+    # check_node_is_not_schedulable against it.
+    ${node_name} =    get_node_by_index    ${node_id}
+    cordon_node    ${node_name}
+    check_node_is_not_schedulable    ${node_name}
+
 Cordon ${workload_kind} ${workload_id} volume node
     ${workload_name} =   generate_name_with_suffix    ${workload_kind}    ${workload_id}
     ${volume_name} =    get_workload_volume_name    ${workload_name}

@@ -153,6 +153,8 @@ def wait_for_replica_rebuilding_to_complete_on_node(self, volume_name, replica_l
 
     def wait_for_replica_rebuilding_to_complete(self, volume_name):
         for node_name in self.node.list_node_names_by_role("worker"):
+            if self.node.is_node_schedulable(node_name) == "False":
+                continue
-            if self.node.is_node_schedulable(node_name) == "False":
-                continue
+            if not self.node.is_node_schedulable(node_name):
+                logging(f"Skipping node {node_name} as it is not schedulable")
+                continue
-            if self.node.is_node_schedulable(node_name) == "False":
-                continue
+            if not self.node.is_node_schedulable(node_name):
+                logging(f"Skipping node {node_name} as it is not schedulable")
+                continue
             logging(f"Waiting for volume {volume_name}'s replica on node {node_name} rebuilding completed")
             self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name)
 

@@ -169,3 +169,7 @@ def check_node_schedulable(self, node_name, schedulable):
                 break
             time.sleep(self.retry_interval)
         assert node["conditions"]["Schedulable"]["status"] == schedulable
+
+    def is_node_schedulable(self, node_name):
+        node = get_longhorn_client().by_id_node(node_name)
+        return node["conditions"]["Schedulable"]["status"]
-    def is_node_schedulable(self, node_name):
-        node = get_longhorn_client().by_id_node(node_name)
-        return node["conditions"]["Schedulable"]["status"]
+    def is_node_schedulable(self, node_name: str) -> bool:
+        """Check if a node is schedulable without asserting the status.
+
+        Args:
+            node_name: The name of the node to check.
+
+        Returns:
+            bool: True if the node is schedulable, False otherwise.
+
+        Raises:
+            KeyError: If the node's conditions are not properly structured.
+        """
+        node = get_longhorn_client().by_id_node(node_name)
+        try:
+            return node["conditions"]["Schedulable"]["status"]
+        except KeyError as e:
+            raise KeyError(f"Failed to get schedulable status for node {node_name}: {e}")
-    def is_node_schedulable(self, node_name):
-        node = get_longhorn_client().by_id_node(node_name)
-        return node["conditions"]["Schedulable"]["status"]
+    def is_node_schedulable(self, node_name: str) -> bool:
+        """Check if a node is schedulable without asserting the status.
+
+        Args:
+            node_name: The name of the node to check.
+
+        Returns:
+            bool: True if the node is schedulable, False otherwise.
+
+        Raises:
+            KeyError: If the node's conditions are not properly structured.
+        """
+        node = get_longhorn_client().by_id_node(node_name)
+        try:
+            return node["conditions"]["Schedulable"]["status"]
+        except KeyError as e:
+            raise KeyError(f"Failed to get schedulable status for node {node_name}: {e}")
@@ -0,0 +1,53 @@
+*** Settings ***
+Documentation    Scheduling Test Cases
+
+Test Tags    regression
+
+# Keyword resource files used by the test cases in this suite.
+Resource    ../keywords/common.resource
+Resource    ../keywords/volume.resource
+Resource    ../keywords/setting.resource
+Resource    ../keywords/deployment.resource
+Resource    ../keywords/persistentvolumeclaim.resource
+Resource    ../keywords/workload.resource
+Resource    ../keywords/k8s.resource
+
+# Setup and teardown keywords applied to each test case in this suite.
+Test Setup    Set test environment
+Test Teardown    Cleanup test resources
+
+*** Variables ***
+# NOTE(review): LOOP_COUNT, RETRY_COUNT and RETRY_INTERVAL are not referenced
+# in this file; presumably they are read by the imported keywords - confirm.
+${LOOP_COUNT}    1
+${RETRY_COUNT}    300
+${RETRY_INTERVAL}    1
+# Passed as dataEngine when the test case creates its volume.
+${DATA_ENGINE}    v1
+
+*** Test Cases ***
+Test Soft Anti Affinity Scheduling
+    [Tags]    coretest
+    [Documentation]    Test that volumes with Soft Anti-Affinity work as expected.
+    ...
+    ...    With Soft Anti-Affinity, a new replica should still be scheduled on a node
+    ...    with an existing replica, which will result in "Healthy" state but limited
+    ...    redundancy.
+    ...
+    ...    1. Create a volume and attach to the current node
+    ...    2. Generate and write `data` to the volume.
+    ...    3. Set `soft anti-affinity` to true
+    ...    4. Disable current node's scheduling.
+    ...    5. Remove the replica on the current node
+    ...    6. Wait for the volume to complete rebuild. Volume should have 3 replicas.
+    ...    7. Verify `data`
+    Given Create volume 0 with    numberOfReplicas=3    dataEngine=${DATA_ENGINE}
+    And Attach volume 0
+    And Wait for volume 0 healthy
+    And Write data to volume 0
+
+    When Set setting replica-soft-anti-affinity to true
+    # Disabling scheduling on a node only sets the node status to "Disable", not "Unschedulable",
+    # so it does not alter the node["conditions"]["Schedulable"]["status"] field.
+    # Only cordoning a node sets it to "Unschedulable", which is why the node is cordoned here
+    # instead of having its scheduling disabled.
+    And Cordon node 1
+    And Delete volume 0 replica on node 1
+
+    Then Wait until volume 0 replicas rebuilding completed
+    And Wait for volume 0 healthy
+    And Check volume 0 data is intact