test(robot): migrate test_replica_rebuild_per_volume_limit
Signed-off-by: Yang Chiu <[email protected]>
yangchiu committed Nov 11, 2024
1 parent 95b2041 commit b5d2aee
Showing 6 changed files with 160 additions and 28 deletions.
12 changes: 12 additions & 0 deletions e2e/keywords/volume.resource
@@ -99,6 +99,10 @@ Delete volume ${volume_id} replica on ${replica_locality}
${volume_name} = generate_name_with_suffix volume ${volume_id}
delete_replica_on_node ${volume_name} ${replica_locality}

Delete ${count} replicas of volume ${volume_id}
${volume_name} = generate_name_with_suffix volume ${volume_id}
delete_replicas ${volume_name} ${count}

Wait for volume ${volume_id} healthy
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_volume_healthy ${volume_name}
@@ -178,6 +182,10 @@ Wait until volume ${volume_id} replicas rebuilding completed
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_replica_rebuilding_to_complete ${volume_name}

Monitor only one replica rebuilding will start at a time for volume ${volume_id}
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_replica_rebuilding_to_complete ${volume_name}

Wait until volume ${volume_id} replica rebuilding stopped on ${replica_locality}
${volume_name} = generate_name_with_suffix volume ${volume_id}
wait_for_replica_rebuilding_to_stop_on_node ${volume_name} ${replica_locality}
@@ -192,6 +200,10 @@ Both volume ${volume_id_0} and volume ${volume_id_1} replica rebuilding on ${rep
${volume_name_1} = generate_name_with_suffix volume ${volume_id_1}
both_replica_rebuildings_will_start_at_the_same_time_on_node ${volume_name_0} ${volume_name_1} ${replica_locality}

Only one replica rebuilding will start at a time for volume ${volume_id}
${volume_name} = generate_name_with_suffix volume ${volume_id}
only_one_replica_rebuilding_will_start_at_a_time ${volume_name}

Crash volume ${volume_id} replica processes
${volume_name} = generate_name_with_suffix volume ${volume_id}
crash_replica_processes ${volume_name}
28 changes: 23 additions & 5 deletions e2e/libs/keywords/volume_keywords.py
@@ -128,6 +128,13 @@ def delete_replica_on_nodes(self, volume_name, replica_locality):
logging(f"Deleting volume {volume_name}'s replica on node {node_id}")
self.volume.delete_replica(volume_name, node_id)

def delete_replicas(self, volume_name, count):
replica_list = self.replica.get(volume_name, node_name="")
replica_names = [replica['metadata']['name'] for replica in replica_list]
for i in range(int(count)):
logging(f"Deleting volume {volume_name} replica volume {replica_names[i]}")
self.volume.delete_replica_by_name(volume_name, replica_names[i])
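
As a rough standalone sketch (not part of this commit), the same "delete the first N replicas of a volume" step could be done directly against the Longhorn replica CRs with the Kubernetes Python client; the "longhornvolume" label selector used to find a volume's replicas is an assumption here:

from kubernetes import client, config

def delete_first_n_replicas(volume_name, count, namespace="longhorn-system"):
    # assumption: replica CRs carry the label "longhornvolume=<volume-name>"
    config.load_kube_config()
    api = client.CustomObjectsApi()
    replicas = api.list_namespaced_custom_object(
        group="longhorn.io", version="v1beta2", namespace=namespace,
        plural="replicas", label_selector=f"longhornvolume={volume_name}")
    for replica in replicas["items"][:int(count)]:
        # delete_namespaced_custom_object removes one replica CR by name
        api.delete_namespaced_custom_object(
            group="longhorn.io", version="v1beta2", namespace=namespace,
            plural="replicas", name=replica["metadata"]["name"])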

def set_annotation(self, volume_name, annotation_key, annotation_value):
self.volume.set_annotation(volume_name, annotation_key, annotation_value)

@@ -152,11 +159,7 @@ def wait_for_replica_rebuilding_to_complete_on_node(self, volume_name, replica_l
self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name)

def wait_for_replica_rebuilding_to_complete(self, volume_name):
for node_name in self.node.list_node_names_by_role("worker"):
if self.node.is_node_schedulable(node_name) == "False":
continue
logging(f"Waiting for volume {volume_name}'s replica on node {node_name} rebuilding completed")
self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name)
self.volume.wait_for_replica_rebuilding_complete(volume_name)

async def only_one_replica_rebuilding_will_start_at_a_time_on_node(self, volume_name_0, volume_name_1, replica_locality):

@@ -203,6 +206,21 @@ async def wait_for_both_replica_rebuildings():
assert self.volume.is_replica_rebuilding_in_progress(volume_name_0, node_id) and self.volume.is_replica_rebuilding_in_progress(volume_name_1, node_id), \
f"Expect {volume_name_0} and {volume_name_1} replica rebuilding at the same time"

async def only_one_replica_rebuilding_will_start_at_a_time(self, volume_name):

async def wait_for_replica_rebuilding():
tasks = [
asyncio.create_task(self.volume.wait_for_replica_rebuilding_start(volume_name), name=volume_name),
]

done, pending = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
logging(f"Observed {done.pop().get_name()} started replica rebuilding")

await wait_for_replica_rebuilding()

assert self.volume.is_replica_rebuilding_in_progress(volume_name), \
f"Expect {volume_name} replica rebuilding in progress"

def crash_replica_processes(self, volume_name):
self.volume.crash_replica_processes(volume_name)

19 changes: 18 additions & 1 deletion e2e/libs/volume/crd.py
@@ -420,6 +420,23 @@ def delete_replica(self, volume_name, node_name):
name=replica_list['items'][0]['metadata']['name']
)

def delete_replica_by_name(self, volume_name, replica_name):
replica = self.obj_api.get_namespaced_custom_object(
group="longhorn.io",
version="v1beta2",
namespace="longhorn-system",
plural="replicas",
name=replica_name
)
logging(f"Deleting replica {replica['metadata']['name']}")
self.obj_api.delete_namespaced_custom_object(
group="longhorn.io",
version="v1beta2",
namespace="longhorn-system",
plural="replicas",
name=replica['metadata']['name']
)

def wait_for_replica_rebuilding_start(self, volume_name, node_name):
return Rest().wait_for_replica_rebuilding_start(volume_name, node_name)

@@ -432,7 +449,7 @@ def crash_replica_processes(self, volume_name):
def crash_node_replica_process(self, volume_name, node_name):
return Rest().crash_node_replica_process(volume_name, node_name)

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
def wait_for_replica_rebuilding_complete(self, volume_name, node_name=None):
return Rest().wait_for_replica_rebuilding_complete(volume_name, node_name)

def check_data_checksum(self, volume_name, data_id):
69 changes: 50 additions & 19 deletions e2e/libs/volume/rest.py
@@ -118,29 +118,42 @@ def keep_writing_data(self, volume_name, size):
def delete_replica(self, volume_name, node_name):
return NotImplemented

async def wait_for_replica_rebuilding_start(self, volume_name, node_name):
async def wait_for_replica_rebuilding_start(self, volume_name, node_name=None):
rebuilding_replica_name = None
for i in range(self.retry_count):
try:
v = get_longhorn_client().by_id_volume(volume_name)
logging(f"Trying to get volume {volume_name} rebuilding replicas ... ({i})")
for replica in v.replicas:
if replica.hostId == node_name:
if node_name and replica.hostId == node_name and replica.mode == "WO":
rebuilding_replica_name = replica.name
break
elif replica.mode == "WO":
rebuilding_replica_name = replica.name
node_name = replica.hostId
break
if rebuilding_replica_name:
break
except Exception as e:
logging(f"Failed to get volume {volume_name} with error: {e}")
await asyncio.sleep(self.retry_interval)
assert rebuilding_replica_name != None
assert rebuilding_replica_name != None, f"Waiting for replica rebuilding start for volume {volume_name} on node {node_name} failed: replicas = {v.replicas}"
logging(f"Got volume {volume_name} rebuilding replica = {rebuilding_replica_name} on node {node_name}")

started = False
for i in range(self.retry_count):
try:
v = get_longhorn_client().by_id_volume(volume_name)
logging(f"Got volume {volume_name} rebuild status = {v.rebuildStatus}")

# While monitoring the replica rebuilding,
# also watch for unexpected concurrent replica rebuildings
rebuilding_count = 0
for replica in v.replicas:
if replica.mode == "WO":
rebuilding_count +=1
assert rebuilding_count <= 1, f"Unexpected concurrent replica rebuilding = {rebuilding_count}, replicas = {v.replicas}"

for status in v.rebuildStatus:
for replica in v.replicas:
if status.replica == replica.name and \
@@ -156,7 +169,7 @@ async def wait_for_replica_rebuilding_start(self, volume_name, node_name):
await asyncio.sleep(self.retry_interval)
assert started, f"wait for replica on node {node_name} rebuilding timeout: {v}"

def is_replica_rebuilding_in_progress(self, volume_name, node_name):
def is_replica_rebuilding_in_progress(self, volume_name, node_name=None):
in_progress = False
for i in range(self.retry_count):
try:
@@ -165,8 +178,9 @@ def is_replica_rebuilding_in_progress(self, volume_name, node_name):
for status in v.rebuildStatus:
for replica in v.replicas:
if status.replica == replica.name and \
replica.hostId == node_name and \
(node_name is None or replica.hostId == node_name) and \
status.state == "in_progress":
node_name = replica.hostId if not node_name else node_name
logging(f"Volume {volume_name} replica rebuilding {replica.name} in progress on {node_name}")
in_progress = True
break
@@ -217,31 +231,48 @@ def get_replica_name_on_node(self, volume_name, node_name):
if r.hostId == node_name:
return r.name

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
def wait_for_replica_rebuilding_complete(self, volume_name, node_name=None):
completed = False
for i in range(self.retry_count):
logging(f"wait for {volume_name} replica rebuilding completed on {node_name} ... ({i})")
logging(f"wait for {volume_name} replica rebuilding completed on {'all nodes' if not node_name else node_name} ... ({i})")
try:
v = get_longhorn_client().by_id_volume(volume_name)

# While monitoring the replica rebuilding,
# also watch for unexpected concurrent replica rebuildings
rebuilding_count = 0
for replica in v.replicas:
# use replica.mode (RW or WO) to check whether this replica
# has been rebuilt or not,
# because rebuildStatus is not reliable:
# when the rebuild progress reaches 100%,
# the entry is removed from rebuildStatus immediately
# and you will just get an empty rebuildStatus [],
# so there is no way to distinguish "rebuilding not started yet"
# from "rebuilding already completed" using rebuildStatus
if replica.hostId == node_name and replica.mode == "RW":
if replica.mode == "WO":
rebuilding_count +=1
assert rebuilding_count <= 1, f"Unexpected concurrent replica rebuilding = {rebuilding_count}, replicas = {v.replicas}"

if node_name:
for replica in v.replicas:
# use replica.mode (RW or WO) to check whether this replica
# has been rebuilt or not,
# because rebuildStatus is not reliable:
# when the rebuild progress reaches 100%,
# the entry is removed from rebuildStatus immediately
# and you will just get an empty rebuildStatus [],
# so there is no way to distinguish "rebuilding not started yet"
# from "rebuilding already completed" using rebuildStatus
if replica.hostId == node_name and replica.mode == "RW":
completed = True
break
else:
rw_replica_count = 0
for replica in v.replicas:
if replica.mode == "RW":
rw_replica_count += 1
if rw_replica_count == v.numberOfReplicas:
completed = True
break
if completed:
break
except Exception as e:
logging(f"Failed to get volume {volume_name} with error: {e}")
time.sleep(self.retry_interval)
logging(f"Completed volume {volume_name} replica rebuilding on {node_name}")
assert completed, f"Expect volume {volume_name} replica rebuilding completed on {node_name}"
logging(f"Completed volume {volume_name} replica rebuilding on {'all nodes' if not node_name else node_name}")
assert completed, f"Expect volume {volume_name} replica rebuilding completed on {'all nodes' if not node_name else node_name}"

def check_data_checksum(self, volume_name, data_id):
return NotImplemented
9 changes: 6 additions & 3 deletions e2e/libs/volume/volume.py
@@ -101,10 +101,13 @@ def keep_writing_data(self, volume_name):
def delete_replica(self, volume_name, node_name):
return self.volume.delete_replica(volume_name, node_name)

def wait_for_replica_rebuilding_start(self, volume_name, node_name):
def delete_replica_by_name(self, volume_name, replica_name):
return self.volume.delete_replica_by_name(volume_name, replica_name)

def wait_for_replica_rebuilding_start(self, volume_name, node_name=None):
return self.volume.wait_for_replica_rebuilding_start(volume_name, node_name)

def is_replica_rebuilding_in_progress(self, volume_name, node_name):
def is_replica_rebuilding_in_progress(self, volume_name, node_name=None):
return self.volume.is_replica_rebuilding_in_progress(volume_name, node_name)

def crash_replica_processes(self, volume_name):
@@ -122,7 +125,7 @@ def wait_for_replica_running(self, volume_name, node_name):
def get_replica_name_on_node(self, volume_name, node_name):
return self.volume.get_replica_name_on_node(volume_name, node_name)

def wait_for_replica_rebuilding_complete(self, volume_name, node_name):
def wait_for_replica_rebuilding_complete(self, volume_name, node_name=None):
return self.volume.wait_for_replica_rebuilding_complete(volume_name, node_name)

def check_data_checksum(self, volume_name, data_id):
51 changes: 51 additions & 0 deletions e2e/tests/regression/test_replica.robot
@@ -0,0 +1,51 @@
*** Settings ***
Documentation Replica Test Cases
Test Tags regression

Resource ../keywords/common.resource
Resource ../keywords/volume.resource
Resource ../keywords/setting.resource
Resource ../keywords/deployment.resource
Resource ../keywords/persistentvolumeclaim.resource
Resource ../keywords/workload.resource

Test Setup Set test environment
Test Teardown Cleanup test resources

*** Variables ***
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${DATA_ENGINE} v1

*** Test Cases ***
Test Replica Rebuilding Per Volume Limit
[Tags] coretest
[Documentation] Test that the volume always has only one replica scheduled for rebuilding
...
... 1. Set soft anti-affinity to `true`.
... 2. Create a volume with 1 replica.
... 3. Attach the volume and write a few hundreds MB data to it.
... 4. Scale the volume replica to 5.
... 5. Monitor the volume replica list to make sure there is only 1 replica in WO state.
... 6. Wait for the volume to complete rebuilding. Then remove 4 of the 5 replicas.
... 7. Monitor the volume replica list again.
... 8. Once the rebuilding completes again, verify the data checksum.
Given Set setting replica-soft-anti-affinity to true
And Create volume 0 with numberOfReplicas=3 dataEngine=${DATA_ENGINE}
And Attach volume 0
And Wait for volume 0 healthy
And Write data to volume 0

When Update volume 0 replica count to 5
Then Only one replica rebuilding will start at a time for volume 0
And Monitor only one replica rebuilding will start at a time for volume 0
And Wait until volume 0 replicas rebuilding completed

When Delete 4 replicas of volume 0
Then Only one replica rebuilding will start at a time for volume 0
And Monitor only one replica rebuilding will start at a time for volume 0
And Wait until volume 0 replicas rebuilding completed
And Wait for volume 0 healthy
And Check volume 0 data is intact
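
One possible way to run just this suite from Python, overriding its variables (the file path comes from this commit; the option names follow Robot Framework's programmatic robot.run API, and the tag filter is an assumption):

import robot

# run only the coretest-tagged cases in the new suite, with explicit variable overrides
robot.run(
    "e2e/tests/regression/test_replica.robot",
    variable=["DATA_ENGINE:v1", "RETRY_COUNT:300", "RETRY_INTERVAL:1"],
    include=["coretest"],
)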
