Skip to content

Commit

Permalink
test(robot): add test case Shutdown Volume Node And Test Auto Reattach To A New Node
Browse files Browse the repository at this point in the history

Signed-off-by: Yang Chiu <[email protected]>
  • Loading branch information
yangchiu committed Dec 6, 2024
1 parent 8af8d7b commit 8acea05
Show file tree
Hide file tree
Showing 35 changed files with 143 additions and 179 deletions.
6 changes: 5 additions & 1 deletion e2e/keywords/common.resource
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
*** Settings ***
Documentation Common keywords
Library Collections
Library OperatingSystem
Library ../libs/keywords/common_keywords.py
Library ../libs/keywords/deployment_keywords.py
Expand Down Expand Up @@ -38,7 +39,10 @@ Set test environment
END

Cleanup test resources
Run keyword And Ignore Error power_on_node_by_name ${powered_off_node}
FOR ${powered_off_node} IN @{powered_off_nodes}
Run keyword And Ignore Error power_on_node_by_name ${powered_off_node}
Remove Values From List ${powered_off_nodes} ${powered_off_node}
END
uncordon_all_nodes
cleanup_control_plane_network_latency
reset_node_schedule
Expand Down
11 changes: 7 additions & 4 deletions e2e/keywords/host.resource
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
*** Settings ***
Documentation Physical Node Keywords
Library Collections
Library ../libs/keywords/common_keywords.py
Library ../libs/keywords/host_keywords.py
Library ../libs/keywords/network_keywords.py
Expand Down Expand Up @@ -34,11 +35,13 @@ Restart cluster
reboot_all_nodes
setup_control_plane_network_latency

Power on off node
Run keyword And Ignore Error
... power_on_node_by_name ${powered_off_node}
Power on off nodes
    [Documentation]    Power on every node recorded in the powered_off_nodes list,
    ...    ignoring power-on errors, and drop each node from the list once handled.
    FOR    ${node_name}    IN    @{powered_off_nodes}
        Run Keyword And Ignore Error    power_on_node_by_name    ${node_name}
        Remove Values From List    ${powered_off_nodes}    ${node_name}
    END

Power off node ${node_id}
${powered_off_node} = get_node_by_index ${node_id}
Append to list ${powered_off_nodes} ${powered_off_node}
power_off_node_by_name ${powered_off_node}
Set Test Variable ${powered_off_node}
5 changes: 5 additions & 0 deletions e2e/keywords/sharemanager.resource
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ Delete sharemanager pod of deployment ${deployment_id} and wait for recreation
${volume_name} = get_workload_volume_name ${deployment_name}
delete_sharemanager_pod_and_wait_for_recreation ${volume_name}

Wait for sharemanager pod of deployment ${deployment_id} restart
    [Documentation]    Wait until the share-manager pod of the deployment's volume has restarted.
    ${workload_name} =    generate_name_with_suffix    deployment    ${deployment_id}
    ${volume} =    get_workload_volume_name    ${workload_name}
    wait_for_sharemanager_pod_restart    ${volume}

Wait for sharemanager pod of deployment ${deployment_id} running
${deployment_name} = generate_name_with_suffix deployment ${deployment_id}
${volume_name} = get_workload_volume_name ${deployment_name}
Expand Down
13 changes: 13 additions & 0 deletions e2e/keywords/variables.resource
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
*** Settings ***
Documentation Global Variables
*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${VOLUME_TYPE} RWO
${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

@{powered_off_nodes}=
11 changes: 10 additions & 1 deletion e2e/keywords/workload.resource
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,18 @@ Power off volume node of ${workload_kind} ${workload_id}
${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id}
${volume_name} = get_workload_volume_name ${workload_name}
${powered_off_node} = get_volume_node ${volume_name}
Append to list ${powered_off_nodes} ${powered_off_node}
${last_volume_node} = get_volume_node ${volume_name}
power_off_volume_node ${volume_name}
Set Test Variable ${powered_off_node}
Set Test Variable ${last_volume_node}

Power off volume node of ${workload_kind} ${workload_id} without waiting
    [Documentation]    Power off the node hosting the workload's volume and return
    ...    without waiting for the node to report stopped. The node is appended
    ...    to the powered_off_nodes list and published as the last_volume_node
    ...    test variable.
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    # Resolve the volume node once so that the node queued for power-on and
    # the last_volume_node test variable always refer to the same node.
    ${powered_off_node} =    get_volume_node    ${volume_name}
    Append to list    ${powered_off_nodes}    ${powered_off_node}
    ${last_volume_node} =    Set Variable    ${powered_off_node}
    power_off_volume_node    ${volume_name}    waiting=False
    Set Test Variable    ${last_volume_node}

Reboot volume node of ${workload_kind} ${workload_id}
Expand Down
9 changes: 5 additions & 4 deletions e2e/libs/host/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,15 @@ def reboot_all_worker_nodes(self, shut_down_time_in_sec=NODE_REBOOT_DOWN_TIME_SE
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")

def power_off_node(self, power_off_node_name):
def power_off_node(self, power_off_node_name, waiting=True):
instance_ids = [self.mapping[power_off_node_name]]
resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True)
assert resp['ResponseMetadata']['HTTPStatusCode'] == 200, f"Failed to stop instances {instance_ids} response: {resp}"
logging(f"Stopping instances {instance_ids}")
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
logging(f"Stopped instances")
if waiting:
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
logging(f"Stopped instances")

def power_on_node(self, power_on_node_name):
instance_ids = [self.mapping[power_on_node_name]]
Expand Down
2 changes: 1 addition & 1 deletion e2e/libs/host/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def reboot_all_worker_nodes(self, shut_down_time_in_sec):
return NotImplemented

@abstractmethod
def power_off_node(self, node_name):
def power_off_node(self, node_name, waiting):
return NotImplemented

@abstractmethod
Expand Down
5 changes: 4 additions & 1 deletion e2e/libs/host/harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def reboot_all_worker_nodes(self, shut_down_time_in_sec):
for node_name in node_names:
self.power_on_node(node_name)

def power_off_node(self, node_name):
def power_off_node(self, node_name, waiting=True):
vm_id = self.mapping[node_name]

url = f"{self.url}/{vm_id}"
Expand All @@ -68,6 +68,9 @@ def power_off_node(self, node_name):
logging(f"Stopping vm failed with error {e}")
logging(f"Stopping vm {vm_id}")

if not waiting:
return

stopped = False
for i in range(self.retry_count):
logging(f"Waiting for vm {vm_id} stopped ... ({i})")
Expand Down
6 changes: 3 additions & 3 deletions e2e/libs/keywords/host_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ def reboot_node_by_name(self, node_name, downtime_in_min=1):
logging(f'Rebooting node {node_name} with downtime {reboot_down_time_sec} seconds')
self.host.reboot_node(node_name, reboot_down_time_sec)

def power_off_volume_node(self, volume_name):
def power_off_volume_node(self, volume_name, waiting=True):
node_id = self.volume_keywords.get_node_id_by_replica_locality(volume_name, "volume node")
logging(f'Power off volume {volume_name} node {node_id}')
self.host.power_off_node(node_id)
logging(f'Power off volume {volume_name} node {node_id} with waiting = {waiting}')
self.host.power_off_node(node_id, waiting)

def power_on_node_by_name(self, node_name):
self.host.power_on_node(node_name)
Expand Down
20 changes: 20 additions & 0 deletions e2e/libs/keywords/sharemanager_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,32 @@ def delete_sharemanager_pod_and_wait_for_recreation(self, name):

assert False, f"sharemanager pod {sharemanager_pod_name} not recreated"

def wait_for_sharemanager_pod_restart(self, name):
    """Wait until the share-manager pod of volume ``name`` has been recreated.

    The creation timestamp of the current ``share-manager-<name>`` pod in
    the ``longhorn-system`` namespace is taken as the baseline. The pod is
    then polled until one with a newer creation timestamp is seen.

    Args:
        name: Volume name; the pod looked up is ``"share-manager-" + name``.

    Raises:
        AssertionError: If no newer pod is seen within the configured
            retry count.
    """
    sharemanager_pod_name = "share-manager-" + name
    sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system")
    # The polling loop below already treats a None result from get_pod as
    # "pod not there yet", so guard the baseline lookup the same way instead
    # of failing with AttributeError. NOTE(review): with no baseline pod,
    # the first pod that shows up is accepted as the restarted one.
    last_creation_time = (
        sharemanager_pod.metadata.creation_timestamp
        if sharemanager_pod is not None
        else None
    )

    retry_count, retry_interval = get_retry_count_and_interval()
    for i in range(retry_count):
        logging(f"Waiting for sharemanager for volume {name} restart ... ({i})")
        time.sleep(retry_interval)
        sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system")
        if sharemanager_pod is None:
            continue
        creation_time = sharemanager_pod.metadata.creation_timestamp
        logging(f"Getting new sharemanager which is created at {creation_time}, and old one is created at {last_creation_time}")
        if last_creation_time is None or creation_time > last_creation_time:
            return

    # Raise explicitly: a bare `assert False` is stripped under `python -O`,
    # which would turn a timeout into a silent success.
    raise AssertionError(f"sharemanager pod {sharemanager_pod_name} isn't restarted")


def wait_for_share_manager_pod_running(self, name):
sharemanager_pod_name = "share-manager-" + name
retry_count, retry_interval = get_retry_count_and_interval()
for i in range(retry_count):
sharemanager_pod = get_pod(sharemanager_pod_name, "longhorn-system")
logging(f"Waiting for sharemanager for volume {name} running, currently {sharemanager_pod.status.phase} ... ({i})")
if sharemanager_pod.status.phase == "Running":
return

Expand Down
10 changes: 1 addition & 9 deletions e2e/tests/negative/cluster_restart.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative cluster

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/deployment.resource
Resource ../keywords/longhorn.resource
Expand All @@ -16,15 +17,6 @@ Resource ../keywords/setting.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1


*** Test Cases ***
Restart Cluster While Workload Heavy Writing
Given Set setting rwx-volume-fast-failover to ${RWX_VOLUME_FAST_FAILOVER}
Expand Down
8 changes: 1 addition & 7 deletions e2e/tests/negative/component_resilience.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/volume.resource
Resource ../keywords/backing_image.resource
Expand All @@ -18,13 +19,6 @@ Resource ../keywords/sharemanager.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

*** Keywords ***
Delete instance-manager of volume ${volume_id} and wait for recover
When Delete instance-manager of volume ${volume_id}
Expand Down
8 changes: 1 addition & 7 deletions e2e/tests/negative/kubelet_restart.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/storageclass.resource
Resource ../keywords/persistentvolumeclaim.resource
Expand All @@ -14,13 +15,6 @@ Resource ../keywords/setting.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

*** Test Cases ***
Restart Volume Node Kubelet While Workload Heavy Writing
Given Set setting rwx-volume-fast-failover to ${RWX_VOLUME_FAST_FAILOVER}
Expand Down
9 changes: 1 addition & 8 deletions e2e/tests/negative/network_disconnect.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/volume.resource
Resource ../keywords/storageclass.resource
Resource ../keywords/statefulset.resource
Expand All @@ -14,14 +15,6 @@ Resource ../keywords/setting.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${LATENCY_IN_MS} 0
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

*** Test Cases ***
Disconnect Volume Node Network While Workload Heavy Writing
Given Set setting rwx-volume-fast-failover to ${RWX_VOLUME_FAST_FAILOVER}
Expand Down
8 changes: 1 addition & 7 deletions e2e/tests/negative/node_delete.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/host.resource
Resource ../keywords/storageclass.resource
Expand All @@ -15,13 +16,6 @@ Resource ../keywords/setting.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

*** Test Cases ***
Delete Volume Node While Replica Rebuilding
Given Set setting node-down-pod-deletion-policy to do-nothing
Expand Down
8 changes: 1 addition & 7 deletions e2e/tests/negative/node_drain.robot
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/storageclass.resource
Resource ../keywords/persistentvolumeclaim.resource
Expand All @@ -18,13 +19,6 @@ Resource ../keywords/node.resource
Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1

*** Test Cases ***
Force Drain Volume Node While Replica Rebuilding
Given Set setting rwx-volume-fast-failover to ${RWX_VOLUME_FAST_FAILOVER}
Expand Down
Loading

0 comments on commit 8acea05

Please sign in to comment.