Skip to content

Commit

Permalink
test(robot): add test case Migration Confirmation After Migration Nod…
Browse files Browse the repository at this point in the history
…e Down

Signed-off-by: Yang Chiu <[email protected]>
  • Loading branch information
yangchiu committed Dec 10, 2024
1 parent 8acea05 commit e739c0d
Show file tree
Hide file tree
Showing 17 changed files with 192 additions and 12 deletions.
49 changes: 49 additions & 0 deletions e2e/keywords/migration.resource
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
*** Settings ***
Documentation Migration Keywords
Library ../libs/keywords/common_keywords.py
Library ../libs/keywords/engine_keywords.py
Library ../libs/keywords/replica_keywords.py

*** Keywords ***
# Look up the engine name of volume ${volume_id} and store it in the test
# variable ${engine_name} so later keywords can compare against it.
Get volume ${volume_id} engine name
${volume_name} = generate_name_with_suffix volume ${volume_id}
${engine_name} = get_engine_name ${volume_name}
Set Test Variable ${engine_name}

# Pass when the volume's current engine name equals the previously stored
# ${engine_name}. Requires ${engine_name} to have been set beforehand.
Volume ${volume_id} engine should be the same
${volume_name} = generate_name_with_suffix volume ${volume_id}
${new_engine_name} = get_engine_name ${volume_name}
Should Be Equal ${engine_name} ${new_engine_name}

# Pass when the volume's current engine name differs from the previously
# stored ${engine_name}. Requires ${engine_name} to have been set beforehand.
Volume ${volume_id} engine should be different
${volume_name} = generate_name_with_suffix volume ${volume_id}
${new_engine_name} = get_engine_name ${volume_name}
Should Not Be Equal ${engine_name} ${new_engine_name}

# Look up the replica names of volume ${volume_id} and store them in the test
# variable ${replica_names} so later keywords can compare against them.
Get volume ${volume_id} replica names
${volume_name} = generate_name_with_suffix volume ${volume_id}
${replica_names} = get_replica_names ${volume_name}
Set Test Variable ${replica_names}

# Pass when the current replica names, compared as strings, equal the stored
# ${replica_names}. Requires ${replica_names} to have been set beforehand.
Volume ${volume_id} replicas should be the same
${volume_name} = generate_name_with_suffix volume ${volume_id}
${new_replica_names} = get_replica_names ${volume_name}
Should Be Equal As Strings ${replica_names} ${new_replica_names}

# Pass when the current replica names, compared as strings, differ from the
# stored ${replica_names}. Requires ${replica_names} to have been set beforehand.
Volume ${volume_id} replicas should be different
${volume_name} = generate_name_with_suffix volume ${volume_id}
${new_replica_names} = get_replica_names ${volume_name}
Should Not Be Equal As Strings ${replica_names} ${new_replica_names}

# Snapshot both the engine name and the replica names of the volume into
# test variables by running the two "Get volume ..." keywords in turn.
Get volume ${volume_id} engine and replica names
Get volume ${volume_id} engine name
Get volume ${volume_id} replica names

# A failed or rolled-back migration is checked as: engine and replicas are
# both unchanged compared with the stored snapshot.
Volume ${volume_id} migration should fail or rollback
Volume ${volume_id} engine should be the same
Volume ${volume_id} replicas should be the same

# A successful migration is checked as: engine and replicas both differ
# from the stored snapshot.
Volume ${volume_id} migration should succeed
Volume ${volume_id} engine should be different
Volume ${volume_id} replicas should be different
1 change: 1 addition & 0 deletions e2e/keywords/variables.resource
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ ${VOLUME_TYPE} RWO
${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0
${RWX_VOLUME_FAST_FAILOVER} false
${DATA_ENGINE} v1
${VOLUME_STATE_CHECK_TIMEOUT} 120

@{powered_off_nodes}=
13 changes: 10 additions & 3 deletions e2e/keywords/volume.resource
Original file line number Diff line number Diff line change
Expand Up @@ -141,18 +141,25 @@ Wait for volume ${volume_id} degraded
Check volume ${volume_id} replica on node ${node_id} kept in stopped
${volume_name} = generate_name_with_suffix volume ${volume_id}
${node_name} = get_node_by_index ${node_id}
FOR ${i} IN RANGE ${LOOP_COUNT}
FOR ${i} IN RANGE ${VOLUME_STATE_CHECK_TIMEOUT}
wait_for_replica_stopped ${volume_name} ${node_name}
Sleep ${RETRY_INTERVAL}
END

Check for volume ${volume_id} kept in degraded
Check volume ${volume_id} kept in degraded
${volume_name} = generate_name_with_suffix volume ${volume_id}
FOR ${i} IN RANGE ${LOOP_COUNT}
FOR ${i} IN RANGE ${VOLUME_STATE_CHECK_TIMEOUT}
wait_for_volume_degraded ${volume_name}
Sleep ${RETRY_INTERVAL}
END

# Repeatedly confirm the volume is in the attaching state: for
# ${VOLUME_STATE_CHECK_TIMEOUT} iterations, call wait_for_volume_attaching
# and then sleep ${RETRY_INTERVAL}.
# NOTE(review): ${VOLUME_STATE_CHECK_TIMEOUT} is used as an iteration count
# here, not as a duration; total elapsed time also depends on ${RETRY_INTERVAL}.
Check volume ${volume_id} kept in attaching
${volume_name} = generate_name_with_suffix volume ${volume_id}
FOR ${i} IN RANGE ${VOLUME_STATE_CHECK_TIMEOUT}
wait_for_volume_attaching ${volume_name}
Sleep ${RETRY_INTERVAL}
END

Check volume ${volume_id} kept in detached
${volume_name} = generate_name_with_suffix volume ${volume_id}
FOR ${i} IN RANGE ${LOOP_COUNT}
Expand Down
10 changes: 8 additions & 2 deletions e2e/libs/engine/crd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import logging

from kubernetes import client

from engine.base import Base
Expand Down Expand Up @@ -61,3 +59,11 @@ def validate_engine_setting(self, volume_name, setting_name, value):
for engine in engines:
assert str(engine["spec"][setting_name]) == value, \
f"Expected volume {volume_name} engine setting {setting_name} is {value}, but it's {str(engine['spec'][setting_name])}"

def get_engine_name(self, volume_name):
    """Return the metadata name of the only engine of ``volume_name``.

    Raises:
        AssertionError: if the volume does not have exactly one engine.
    """
    logging(f"Getting volume {volume_name} engine name")
    engine_list = self.get_engines(volume_name)
    engine_count = len(engine_list)
    assert engine_count == 1, f"Expect volume {volume_name} only has one engine, but there are {engine_list}"
    sole_engine = engine_list[0]
    name = sole_engine["metadata"]["name"]
    logging(f"Got volume {volume_name} engine name {name}")
    return name
3 changes: 3 additions & 0 deletions e2e/libs/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,8 @@ def get_engine_state(self, volume_name, node_name):
engines_states[engine_name] = engine_state
return engines_states

# Return the engine name of `volume_name` by delegating to the wrapped
# engine implementation held in `self.engine`.
def get_engine_name(self, volume_name):
return self.engine.get_engine_name(volume_name)

def validate_engine_setting(self, volume_name, setting_name, value):
return self.engine.validate_engine_setting(volume_name, setting_name, value)
2 changes: 2 additions & 0 deletions e2e/libs/host/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def power_off_node(self, power_off_node_name, waiting=True):
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
logging(f"Stopped instances")
self.node.wait_for_node_down(power_off_node_name)

def power_on_node(self, power_on_node_name):
instance_ids = [self.mapping[power_on_node_name]]
Expand All @@ -86,3 +87,4 @@ def power_on_node(self, power_on_node_name):
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")
self.node.wait_for_node_up(power_on_node_name)
4 changes: 4 additions & 0 deletions e2e/libs/host/harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ def power_off_node(self, node_name, waiting=True):
time.sleep(self.retry_interval)
assert stopped, f"Expected vm {vm_id} to be stopped but it's not"

# Use this method's own `node_name` parameter; `power_off_node_name` is not
# defined in this scope and would raise NameError.
self.node.wait_for_node_down(node_name)

def power_on_node(self, node_name):
vm_id = self.mapping[node_name]

Expand Down Expand Up @@ -111,3 +113,5 @@ def power_on_node(self, node_name):
logging(f"Getting vm status failed with error {e}")
time.sleep(self.retry_interval)
assert started, f"Expected vm {vm_id} to be started but it's not"

# Use this method's own `node_name` parameter; `power_on_node_name` is not
# defined in this scope and would raise NameError.
self.node.wait_for_node_up(node_name)
3 changes: 3 additions & 0 deletions e2e/libs/keywords/engine_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@ def __init__(self):
# Keyword wrapper: return whatever `self.engine` reports as the engine
# instance manager name for `volume_name`.
def get_engine_instance_manager_name(self, volume_name):
return self.engine.get_engine_instance_manager_name(volume_name)

# Keyword wrapper: return the engine name of `volume_name` as reported by
# `self.engine`.
def get_engine_name(self, volume_name):
return self.engine.get_engine_name(volume_name)

def validate_engine_setting(self, volume_name, setting_name, value):
return self.engine.validate_engine_setting(volume_name, setting_name, value)
3 changes: 3 additions & 0 deletions e2e/libs/keywords/replica_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ def validate_replica_setting(self, volume_name, setting_name, value):

# Keyword wrapper: forward the optional volume/node/disk filters to
# `self.replica.get` and return its result unchanged.
def get_replicas(self, volume_name=None, node_name=None, disk_uuid=None):
return self.replica.get(volume_name, node_name, disk_uuid)

# Keyword wrapper: return the replica names of `volume_name` from
# `self.replica`. `numberOfReplicas` (default 3) is passed through as the
# expected replica count.
def get_replica_names(self, volume_name, numberOfReplicas=3):
return self.replica.get_replica_names(volume_name, numberOfReplicas)
28 changes: 28 additions & 0 deletions e2e/libs/node/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ def set_node(self, node_name: str, allowScheduling: bool, evictionRequested: boo
def set_node_scheduling(self, node_name, allowScheduling=True, retry=False):
node = get_longhorn_client().by_id_node(node_name)

logging(f"Setting node {node_name} allowScheduling to {allowScheduling}")

if node.tags is None:
node.tags = []

Expand Down Expand Up @@ -246,3 +248,29 @@ def wait_for_disk_not_in_pressure(self, node_name, disk_name):
# Fetch the Longhorn node `node_name` through the Longhorn client and return
# the `diskUUID` recorded for its disk `disk_name`.
def get_disk_uuid(self, node_name, disk_name):
node = get_longhorn_client().by_id_node(node_name)
return node["disks"][disk_name]["diskUUID"]

def wait_for_node_down(self, node_name):
    """Poll until the k8s node has a "Ready" condition whose status is not "True".

    Makes up to ``self.retry_count`` attempts, sleeping
    ``self.retry_interval`` seconds after each unsuccessful one.

    Raises:
        AssertionError: if the node is never observed as down.
    """
    down = False
    for attempt in range(self.retry_count):
        logging(f"Waiting for k8s node {node_name} down ... ({attempt})")
        node = self.get_node_by_name(node_name)
        down = any(
            cond.type == "Ready" and cond.status != "True"
            for cond in node.status.conditions
        )
        if down:
            break
        time.sleep(self.retry_interval)
    assert down, f"Waiting for node {node_name} down failed: {node.status.conditions}"

def wait_for_node_up(self, node_name):
    """Poll until the k8s node has a "Ready" condition whose status is "True".

    Makes up to ``self.retry_count`` attempts, sleeping
    ``self.retry_interval`` seconds after each unsuccessful one.

    Raises:
        AssertionError: if the node is never observed as up.
    """
    up = False
    for attempt in range(self.retry_count):
        logging(f"Waiting for k8s node {node_name} up ... ({attempt})")
        node = self.get_node_by_name(node_name)
        up = any(
            cond.type == "Ready" and cond.status == "True"
            for cond in node.status.conditions
        )
        if up:
            break
        time.sleep(self.retry_interval)
    assert up, f"Waiting for node {node_name} up failed: {node.status.conditions}"
8 changes: 8 additions & 0 deletions e2e/libs/replica/crd.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ def get(self, volume_name=None, node_name=None, disk_uuid=None):
)
return replicas["items"]

def get_replica_names(self, volume_name, numberOfReplicas):
    """Return the metadata names of all replicas of ``volume_name``.

    Raises:
        AssertionError: if the number of replicas found differs from
            ``numberOfReplicas``.
    """
    logging(f"Getting volume {volume_name} replica names")
    found = self.get(volume_name)
    assert len(found) == numberOfReplicas, f"Expect volume {volume_name} has {numberOfReplicas} replicas, but there are {found}"
    names = []
    for item in found:
        names.append(item['metadata']['name'])
    logging(f"Got volume {volume_name} replica names {names}")
    return names

def delete(self, volume_name, node_name):
if volume_name == "" or node_name == "":
logging(f"Deleting all replicas")
Expand Down
3 changes: 3 additions & 0 deletions e2e/libs/replica/replica.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ def delete(self, volume_name="", node_name=""):
# Forward the volume/node/disk filters to the wrapped replica implementation
# in `self.replica` and return its result unchanged.
def get(self, volume_name, node_name, disk_uuid=None):
return self.replica.get(volume_name, node_name, disk_uuid)

# Return the replica names of `volume_name` by delegating to the wrapped
# replica implementation; `numberOfReplicas` is passed through unchanged.
def get_replica_names(self, volume_name, numberOfReplicas):
return self.replica.get_replica_names(volume_name, numberOfReplicas)

def wait_for_rebuilding_start(self, volume_name, node_name):
return self.replica.wait_for_rebuilding_start(volume_name,node_name)

Expand Down
27 changes: 22 additions & 5 deletions e2e/libs/volume/crd.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,12 @@ def wait_for_volume_state(self, volume_name, desired_state):
time.sleep(self.retry_interval)
assert volume["status"]["state"] == desired_state

def wait_for_volume_attaching(self, volume_name):
    """Wait for the volume to be "attaching" and verify its node fields.

    After the state wait returns, the volume is re-read and must have a
    non-empty ``spec.nodeID`` together with an empty
    ``status.currentNodeID``.

    Raises:
        AssertionError: if either node-field expectation does not hold.
    """
    self.wait_for_volume_state(volume_name, "attaching")
    volume = self.get(volume_name)
    spec_node_id = volume["spec"]["nodeID"]
    current_node_id = volume["status"]["currentNodeID"]
    # Messages make a failure self-explanatory instead of a bare AssertionError.
    assert spec_node_id != "", \
        f"Expected attaching volume {volume_name} to have spec.nodeID set, but it's empty"
    assert current_node_id == "", \
        f"Expected attaching volume {volume_name} to have empty status.currentNodeID, but it's {current_node_id}"

def is_replica_running(self, volume_name, node_name, is_running):
return Rest().is_replica_running(volume_name, node_name, is_running)

Expand Down Expand Up @@ -307,43 +313,54 @@ def wait_for_volume_migration_to_be_ready(self, volume_name):
logging(f"Waiting for volume {volume_name} migration to be ready ({i}) ...")
try:
engines = self.engine.get_engines(volume_name)
volume = self.get(volume_name)
ready = len(engines) == 2
for engine in engines:
ready = ready and engine['status']['endpoint']
# Chain with `ready and ...` so the engine count/endpoint result computed
# above and each node-ID check all have to hold; plain reassignment would
# keep only the last check.
ready = ready and volume['spec']['migrationNodeID'] and volume['spec']['migrationNodeID'] == volume['status']['currentMigrationNodeID']
ready = ready and volume['spec']['nodeID'] and volume['spec']['nodeID'] == volume['status']['currentNodeID']
if ready:
break
except Exception as e:
logging(f"Getting volume {volume_name} engines error: {e}")
time.sleep(self.retry_interval)
assert ready
assert ready, f"Waiting for volume {volume_name} migration to be ready failed: engines = {engines}, volume = {volume}"

def wait_for_volume_migration_complete(self, volume_name, node_name):
    """Wait until the volume has finished migrating to ``node_name``.

    Makes up to ``self.retry_count`` attempts, sleeping
    ``self.retry_interval`` seconds after each unsuccessful one. An attempt
    succeeds when all of the following hold:

    * exactly one engine exists, it has an endpoint, and its
      ``status.ownerID`` is ``node_name``;
    * ``spec.migrationNodeID`` and ``status.currentMigrationNodeID`` are
      both empty;
    * ``spec.nodeID`` is ``node_name`` and equals ``status.currentNodeID``.

    Errors raised while fetching or inspecting the objects are logged and
    the attempt is retried.

    Raises:
        AssertionError: if the conditions never hold within the retries.
    """
    complete = False
    # Pre-bind so the failure message below cannot raise UnboundLocalError
    # when every attempt failed before these were fetched.
    engines = None
    volume = None
    for i in range(self.retry_count):
        logging(f"Waiting for volume {volume_name} migration to node {node_name} complete ({i}) ...")
        try:
            engines = self.engine.get_engines(volume_name)
            volume = self.get(volume_name)
            engine_check = len(engines) == 1 and engines[0]['status']['endpoint'] and engines[0]['status']['ownerID'] == node_name
            migration_node_check = volume['spec']['migrationNodeID'] == "" and volume['status']['currentMigrationNodeID'] == ""
            node_check = volume['spec']['nodeID'] == node_name and volume['spec']['nodeID'] == volume['status']['currentNodeID']
            complete = engine_check and migration_node_check and node_check
            if complete:
                break
        except Exception as e:
            logging(f"Getting volume {volume_name} engines error: {e}")
        time.sleep(self.retry_interval)
    assert complete, f"Waiting for volume {volume_name} migration complete failed: engines = {engines}, volume = {volume}"

def wait_for_volume_migration_to_rollback(self, volume_name, node_name):
    """Wait until the volume migration has rolled back to ``node_name``.

    Makes up to ``self.retry_count`` attempts, sleeping
    ``self.retry_interval`` seconds after each unsuccessful one. An attempt
    succeeds when all of the following hold:

    * exactly one engine exists, it has an endpoint, and its
      ``status.ownerID`` is ``node_name``;
    * ``spec.migrationNodeID`` and ``status.currentMigrationNodeID`` are
      both empty;
    * ``spec.nodeID`` is ``node_name`` and equals ``status.currentNodeID``.

    Errors raised while fetching or inspecting the objects are logged and
    the attempt is retried.

    Raises:
        AssertionError: if the conditions never hold within the retries.
    """
    rollback = False
    # Pre-bind so the failure message below cannot raise UnboundLocalError
    # when every attempt failed before these were fetched.
    engines = None
    volume = None
    for i in range(self.retry_count):
        logging(f"Waiting for volume {volume_name} migration to rollback to node {node_name} ({i}) ...")
        try:
            engines = self.engine.get_engines(volume_name)
            volume = self.get(volume_name)
            engine_check = len(engines) == 1 and engines[0]['status']['endpoint'] and engines[0]['status']['ownerID'] == node_name
            migration_node_check = volume['spec']['migrationNodeID'] == "" and volume['status']['currentMigrationNodeID'] == ""
            node_check = volume['spec']['nodeID'] == node_name and volume['spec']['nodeID'] == volume['status']['currentNodeID']
            rollback = engine_check and migration_node_check and node_check
            if rollback:
                break
        except Exception as e:
            logging(f"Getting volume {volume_name} engines error: {e}")
        time.sleep(self.retry_interval)
    assert rollback, f"Waiting for volume {volume_name} migration rollback failed: engines = {engines}, volume = {volume}"

def wait_for_volume_restoration_completed(self, volume_name, backup_name):
completed = False
Expand Down
2 changes: 1 addition & 1 deletion e2e/libs/volume/volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def wait_for_volume_detached(self, volume_name):
self.volume.wait_for_volume_state(volume_name, "detached")

def wait_for_volume_attaching(self, volume_name):
    """Wait for ``volume_name`` to be attaching via the wrapped volume implementation.

    Only the dedicated ``wait_for_volume_attaching`` of ``self.volume`` is
    called; a separate ``wait_for_volume_state(..., "attaching")`` call
    beforehand is redundant because the CRD implementation
    (e2e/libs/volume/crd.py) performs that state wait itself.
    """
    self.volume.wait_for_volume_attaching(volume_name)

def wait_for_volume_stuck_attaching(self, volume_name):
self.volume.wait_for_volume_keep_in_state(volume_name, "attaching")
Expand Down
41 changes: 41 additions & 0 deletions e2e/tests/negative/live_migration.robot
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
*** Settings ***
Documentation Negative Test Cases
Test Tags negative

Resource ../keywords/variables.resource
Resource ../keywords/common.resource
Resource ../keywords/volume.resource
Resource ../keywords/host.resource
Resource ../keywords/migration.resource

Test Setup Set test environment
Test Teardown Cleanup test resources


*** Test Cases ***
Migration Confirmation After Migration Node Down
Given Create volume 0 with migratable=True accessMode=RWX dataEngine=${DATA_ENGINE}
And Attach volume 0 to node 0
And Wait for volume 0 healthy
And Write data to volume 0
And Get volume 0 engine and replica names

And Attach volume 0 to node 1
And Wait for volume 0 migration to be ready

# power off migration node
When Power off node 1
# migration confirmation by detaching from the original node

Check failure on line 29 in e2e/tests/negative/live_migration.robot

View workflow job for this annotation

GitHub Actions / codespell

deatching ==> detaching
And Detach volume 0 from node 0

# volume stuck in attaching status and waiting for migration node to come back
Then Check volume 0 kept in attaching
And Volume 0 migration should fail or rollback

# power on migration node
When Power on off nodes

Then Wait for volume 0 to migrate to node 1
And Wait for volume 0 healthy
And Check volume 0 data is intact
2 changes: 1 addition & 1 deletion e2e/tests/regression/test_basic.robot
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ Replica Rebuilding

And Enable node 1 default disk
Then Check volume 0 replica on node 1 kept in stopped
And Check for volume 0 kept in degraded
And Check volume 0 kept in degraded

And Enable node 1 scheduling
Then Wait until volume 0 replica rebuilding started on node 1
Expand Down
5 changes: 5 additions & 0 deletions e2e/tests/regression/test_migration.robot
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Resource ../keywords/persistentvolumeclaim.resource
Resource ../keywords/recurringjob.resource
Resource ../keywords/statefulset.resource
Resource ../keywords/volume.resource
Resource ../keywords/migration.resource

Test Setup Set test environment
Test Teardown Cleanup test resources
Expand All @@ -32,11 +33,13 @@ Test Migration Confirm
Given Create volume 0 with migratable=True accessMode=RWX dataEngine=${DATA_ENGINE}
When Attach volume 0 to node 0
And Wait for volume 0 healthy
And Get volume 0 engine and replica names
And Write data to volume 0
And Attach volume 0 to node 1
Then Wait for volume 0 migration to be ready
And Detach volume 0 from node 0
And Wait for volume 0 to migrate to node 1
And Volume 0 migration should succeed
And Wait for volume 0 healthy
And Check volume 0 data is intact

Expand All @@ -57,10 +60,12 @@ Test Migration Rollback
Given Create volume 0 with migratable=True accessMode=RWX dataEngine=${DATA_ENGINE}
When Attach volume 0 to node 0
And Wait for volume 0 healthy
And Get volume 0 engine and replica names
And Write data to volume 0
And Attach volume 0 to node 1
Then Wait for volume 0 migration to be ready
And Detach volume 0 from node 1
And Wait for volume 0 to stay on node 0
And Volume 0 migration should fail or rollback
And Wait for volume 0 healthy
And Check volume 0 data is intact

0 comments on commit e739c0d

Please sign in to comment.