test(robot): add replica rebuilding test
longhorn/longhorn-8731

Signed-off-by: Chris <[email protected]>
chriscchien authored and yangchiu committed Jun 18, 2024
1 parent 9bfdf5b commit d043545
Showing 11 changed files with 235 additions and 1 deletion.
1 change: 1 addition & 0 deletions e2e/keywords/common.resource
@@ -28,6 +28,7 @@ Set test environment

Cleanup test resources
cleanup_control_plane_network_latency
reset_node_schedule
cleanup_node_exec
cleanup_stress_helper
cleanup_recurringjobs
16 changes: 16 additions & 0 deletions e2e/keywords/node.resource
@@ -10,3 +10,19 @@ Add ${disk_type} type disk ${disk_path} for all worker nodes
FOR ${worker_node} IN @{worker_nodes}
add_disk ${worker_node} ${disk_type} ${disk_path}
END

Disable node ${node_id} scheduling
${node_name} = get_node_by_index ${node_id}
disable_node_scheduling ${node_name}

Enable node ${node_id} scheduling
${node_name} = get_node_by_index ${node_id}
enable_node_scheduling ${node_name}

Disable node ${node_id} default disk
${node_name} = get_node_by_index ${node_id}
disable_default_disk ${node_name}

Enable node ${node_id} default disk
${node_name} = get_node_by_index ${node_id}
enable_default_disk ${node_name}
36 changes: 36 additions & 0 deletions e2e/keywords/volume.resource
@@ -86,6 +86,25 @@ Wait for volume ${volume_id} healthy
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_volume_healthy ${volume_name}

Wait for volume ${volume_id} degraded
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_volume_degraded ${volume_name}

Check volume ${volume_id} replica on node ${node_id} kept in stopped
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
FOR ${i} IN RANGE ${LOOP_COUNT}
wait_for_replica_stopped ${volume_name} ${node_name}
Sleep ${RETRY_INTERVAL}
END

Check for volume ${volume_id} kept in degraded
${volume_name} = generate_name_with_suffix volume ${volume_id}
FOR ${i} IN RANGE ${LOOP_COUNT}
wait_for_volume_degraded ${volume_name}
Sleep ${RETRY_INTERVAL}
END

Wait for volume ${volume_id} migration ready
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_volume_migration_ready ${volume_name}
@@ -130,6 +149,23 @@ Crash volume ${volume_id} replica processes
${volume_name} = generate_name_with_suffix volume ${volume_id}
crash_replica_processes ${volume_name}

Crash volume ${volume_id} replica process on node ${node_id}
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
${crashed_replica_name} = crash_node_replica_process ${volume_name} ${node_name}
Set Test Variable ${crashed_replica_name}

Check volume ${volume_id} crashed replica reused on node ${node_id}
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
${current_replica_name} = get_replica_name_on_node ${volume_name} ${node_name}
Should Be Equal ${crashed_replica_name} ${current_replica_name}

Wait volume ${volume_id} replica on node ${node_id} stopped
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
wait_for_replica_stopped ${volume_name} ${node_name}

Check volume ${volume_id} data is intact
${volume_name} = generate_name_with_suffix volume ${volume_id}
check_data_checksum ${volume_name}
17 changes: 17 additions & 0 deletions e2e/libs/keywords/node_keywords.py
@@ -25,3 +25,20 @@ def cleanup_disks(self):
for node_name in nodes:
logging(f"Resetting node {node_name} disks to default")
self.node.reset_disks(node_name)

def disable_default_disk(self, node_name):
self.node.set_default_disk_scheduling(node_name, allowScheduling=False)

def enable_default_disk(self, node_name):
self.node.set_default_disk_scheduling(node_name, allowScheduling=True)

def disable_node_scheduling(self, node_name):
self.node.set_node_scheduling(node_name, allowScheduling=False)

def enable_node_scheduling(self, node_name):
self.node.set_node_scheduling(node_name, allowScheduling=True)

def reset_node_schedule(self):
nodes = self.node.list_node_names_by_role("worker")
for node_name in nodes:
self.enable_node_scheduling(node_name)
12 changes: 12 additions & 0 deletions e2e/libs/keywords/volume_keywords.py
@@ -192,6 +192,18 @@ async def wait_for_both_replica_rebuildings():
def crash_replica_processes(self, volume_name):
self.volume.crash_replica_processes(volume_name)

def crash_node_replica_process(self, volume_name, node_name):
return self.volume.crash_node_replica_process(volume_name, node_name)

def wait_for_replica_stopped(self, volume_name, node_name):
self.volume.wait_for_replica_stopped(volume_name, node_name)

def wait_for_replica_running(self, volume_name, node_name):
self.volume.wait_for_replica_running(volume_name, node_name)

def get_replica_name_on_node(self, volume_name, node_name):
return self.volume.get_replica_name_on_node(volume_name, node_name)

def wait_for_replica_rebuilding_to_stop_on_node(self, volume_name, replica_locality):
node_id = self.get_node_id_by_replica_locality(volume_name, replica_locality)
retry_count, retry_interval = get_retry_count_and_interval()
37 changes: 36 additions & 1 deletion e2e/libs/node/node.py
@@ -7,7 +7,8 @@
from utility.utility import get_longhorn_client
from utility.utility import get_retry_count_and_interval
from utility.utility import logging

from utility.constant import DISK_BEING_SYNCING
from utility.constant import NODE_UPDATE_RETRY_INTERVAL

class Node:

@@ -45,6 +46,7 @@ def reset_disks(self, node_name):
for disk_name, disk in iter(node.disks.items()):
if disk.path == self.DEFAULT_DISK_PATH:
disks[disk_name] = disk
disk.allowScheduling = True
else:
logging(f"Try to remove disk {disk_name} from node {node_name}")
self.update_disks(node_name, disks)
@@ -105,3 +107,36 @@ def filter_nodes(nodes, condition):
return control_plane_nodes
elif role == "worker":
return worker_nodes

def set_node_scheduling(self, node_name, allowScheduling=True, retry=False):
node = self.longhorn_client.by_id_node(node_name)

if node.tags is None:
node.tags = []

if not retry:
self.longhorn_client.update(node, allowScheduling=allowScheduling)

# Retry when the node update fails because its disks are still being synced (DISK_BEING_SYNCING).
for _ in range(self.retry_count):
try:
node = self.longhorn_client.update(node, allowScheduling=allowScheduling,
tags=node.tags)
except Exception as e:
if DISK_BEING_SYNCING in str(e.error.message):
time.sleep(NODE_UPDATE_RETRY_INTERVAL)
continue
print(e)
raise
else:
break

return node

def set_default_disk_scheduling(self, node_name, allowScheduling):
node = self.longhorn_client.by_id_node(node_name)

for disk_name, disk in iter(node.disks.items()):
if disk.path == self.DEFAULT_DISK_PATH:
disk.allowScheduling = allowScheduling
self.update_disks(node_name, node.disks)
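For orientation, a minimal usage sketch of the retry path added above, assuming the e2e environment is already initialized; the node name "worker-1" and the import path are assumptions for illustration only, not part of this change:

    from node import Node  # import path assumed to mirror the e2e keyword libraries

    node = Node()
    # With retry=True, an update rejected with "being syncing and please retry later"
    # is re-attempted every NODE_UPDATE_RETRY_INTERVAL seconds instead of failing at once.
    node.set_node_scheduling("worker-1", allowScheduling=False, retry=True)

The disable_node_scheduling/enable_node_scheduling helpers in node_keywords.py call the same method with the default retry=False.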
3 changes: 3 additions & 0 deletions e2e/libs/utility/constant.py
@@ -13,3 +13,6 @@
STREAM_EXEC_TIMEOUT = 300

LONGHORN_NAMESPACE = 'longhorn-system'

DISK_BEING_SYNCING = "being syncing and please retry later"
NODE_UPDATE_RETRY_INTERVAL = 6
9 changes: 9 additions & 0 deletions e2e/libs/volume/crd.py
@@ -232,6 +232,12 @@ def wait_for_volume_state(self, volume_name, desired_state):
time.sleep(self.retry_interval)
assert volume["status"]["state"] == desired_state

def is_replica_running(self, volume_name, node_name, is_running):
return Rest(self.node_exec).is_replica_running(volume_name, node_name, is_running)

def get_replica_name_on_node(self, volume_name, node_name):
return Rest(self.node_exec).get_replica_name_on_node(volume_name, node_name)

def wait_for_volume_keep_in_state(self, volume_name, desired_state):
self.wait_for_volume_state(volume_name, desired_state)

@@ -395,6 +401,9 @@ def is_replica_rebuilding_in_progress(self, volume_name, node_name):
def crash_replica_processes(self, volume_name):
return Rest(self.node_exec).crash_replica_processes(volume_name)

def crash_node_replica_process(self, volume_name, node_name):
return Rest(self.node_exec).crash_node_replica_process(volume_name, node_name)

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
return Rest(self.node_exec).wait_for_replica_rebuilding_complete(volume_name, node_name)

30 changes: 30 additions & 0 deletions e2e/libs/volume/rest.py
@@ -181,6 +181,36 @@ def crash_replica_processes(self, volume_name):
'--name ' + r_name
pod_exec(rm_name, LONGHORN_NAMESPACE, delete_command)

def crash_node_replica_process(self, volume_name, node_name):
logging(f"Crashing volume {volume_name} replica process on node {node_name}")
volume = self.longhorn_client.by_id_volume(volume_name)
r_name = None
for r in volume.replicas:
if r.hostId == node_name:
rm_name = r.instanceManagerName
r_name = r.name
delete_command = 'longhorn-instance-manager process delete ' + \
'--name ' + r_name
pod_exec(rm_name, LONGHORN_NAMESPACE, delete_command)

return r_name

def is_replica_running(self, volume_name, node_name, is_running):
for i in range(self.retry_count):
volume = self.longhorn_client.by_id_volume(volume_name)
for r in volume.replicas:
if r.hostId == node_name and r.running == is_running:
return

assert False, f"Volume {volume_name} replica on node {node_name} running state is not {is_running}"

def get_replica_name_on_node(self, volume_name, node_name):
for i in range(self.retry_count):
volume = self.longhorn_client.by_id_volume(volume_name)
for r in volume.replicas:
if r.hostId == node_name:
return r.name

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
completed = False
for i in range(self.retry_count):
12 changes: 12 additions & 0 deletions e2e/libs/volume/volume.py
@@ -110,6 +110,18 @@ def is_replica_rebuilding_in_progress(self, volume_name, node_name):
def crash_replica_processes(self, volume_name):
return self.volume.crash_replica_processes(volume_name)

def crash_node_replica_process(self, volume_name, node_name):
return self.volume.crash_node_replica_process(volume_name, node_name)

def wait_for_replica_stopped(self, volume_name, node_name):
return self.volume.is_replica_running(volume_name, node_name, is_running=False)

def wait_for_replica_running(self, volume_name, node_name):
return self.volume.is_replica_running(volume_name, node_name, is_running=True)

def get_replica_name_on_node(self, volume_name, node_name):
return self.volume.get_replica_name_on_node(volume_name, node_name)

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
return self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name)

63 changes: 63 additions & 0 deletions e2e/tests/test_cases/replica_rebuilding.robot
@@ -0,0 +1,63 @@
*** Settings ***
Documentation Replica Rebuilding
Test Tags manual_test_case

Resource ../keywords/common.resource
Resource ../keywords/deployment.resource
Resource ../keywords/volume.resource
Resource ../keywords/node.resource

Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 30
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1

*** Test Cases ***
Replica Rebuilding
[Documentation] -- Manual test plan --
... 1. Create and attach a volume.
... 2. Write a large amount of data to the volume.
... 3. Disable the disk scheduling and the node scheduling for the node holding one replica.
... 4. Crash the replica process on that node. Verify:
... - the corresponding replica is not in the running state.
... - the volume robustness stays Degraded.
... 5. Enable the disk scheduling. Verify nothing changes.
... 6. Enable the node scheduling. Verify:
... - the failed replica is reused by Longhorn.
... - the data content is correct after rebuilding.
... - volume r/w works fine.
...
... == Not implemented ==
... 7. Directly delete one replica via the UI (a keyword-level sketch follows the test case below). Verify:
... - a new replica is replenished immediately.
... - the rebuilding progress shown on the UI page looks correct.
... - the data content is correct after rebuilding.
... - volume r/w works fine.
When Create volume 0 with 10 GB and 3 replicas
And Attach volume 0 to node 0
And Wait for volume 0 healthy

And Write 1 GB data to volume 0

And Disable node 1 scheduling
And Disable node 1 default disk

And Crash volume 0 replica process on node 1
Then Wait volume 0 replica on node 1 stopped
And Wait for volume 0 degraded

And Enable node 1 default disk
Then Check volume 0 replica on node 1 kept in stopped
And Check for volume 0 kept in degraded

And Enable node 1 scheduling
Then Wait until volume 0 replica rebuilding started on node 1
And Wait for volume 0 healthy
And Check volume 0 crashed replica reused on node 1

And Check volume 0 data is intact
And Check volume 0 works
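
As a rough illustration of the unimplemented step 7 from the test plan, a sketch in the same keyword style could look like the following; "Delete volume 0 replica on node 1" is a hypothetical keyword not provided by this commit, and the UI rebuilding-progress check is omitted because it has no robot equivalent here.

Replica Rebuilding After Replica Deletion (sketch only, not part of this commit)
    When Create volume 0 with 10 GB and 3 replicas
    And Attach volume 0 to node 0
    And Wait for volume 0 healthy
    And Write 1 GB data to volume 0

    # Hypothetical keyword: would require a delete-replica wrapper in the volume
    # library comparable to crash_node_replica_process.
    And Delete volume 0 replica on node 1
    Then Wait until volume 0 replica rebuilding started on node 1
    And Wait for volume 0 healthy

    And Check volume 0 data is intact
    And Check volume 0 works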
