diff --git a/e2e/keywords/migration.resource b/e2e/keywords/migration.resource
new file mode 100644
index 000000000..7379c6f87
--- /dev/null
+++ b/e2e/keywords/migration.resource
@@ -0,0 +1,49 @@
+*** Settings ***
+Documentation    Migration Keywords
+
+Library    ../libs/keywords/common_keywords.py
+Library    ../libs/keywords/engine_keywords.py
+Library    ../libs/keywords/replica_keywords.py
+
+*** Keywords ***
+Get volume ${volume_id} engine name
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${engine_name} =    get_engine_name    ${volume_name}
+    Set Test Variable    ${engine_name}
+
+Volume ${volume_id} engine should be the same
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${new_engine_name} =    get_engine_name    ${volume_name}
+    Should Be Equal    ${engine_name}    ${new_engine_name}
+
+Volume ${volume_id} engine should be different
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${new_engine_name} =    get_engine_name    ${volume_name}
+    Should Not Be Equal    ${engine_name}    ${new_engine_name}
+
+Get volume ${volume_id} replica names
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${replica_names} =    get_replica_names    ${volume_name}
+    Set Test Variable    ${replica_names}
+
+Volume ${volume_id} replicas should be the same
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${new_replica_names} =    get_replica_names    ${volume_name}
+    Should Be Equal As Strings    ${replica_names}    ${new_replica_names}
+
+Volume ${volume_id} replicas should be different
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${new_replica_names} =    get_replica_names    ${volume_name}
+    Should Not Be Equal As Strings    ${replica_names}    ${new_replica_names}
+
+Get volume ${volume_id} engine and replica names
+    Get volume ${volume_id} engine name
+    Get volume ${volume_id} replica names
+
+Volume ${volume_id} migration should fail or rollback
+    Volume ${volume_id} engine should be the same
+    Volume ${volume_id} replicas should be the same
+
+Volume ${volume_id} migration should succeed
+    Volume ${volume_id} engine should be different
+    Volume ${volume_id} replicas should be different
diff --git a/e2e/keywords/variables.resource b/e2e/keywords/variables.resource
index c213dcabc..e6ad82d28 100644
--- a/e2e/keywords/variables.resource
+++ b/e2e/keywords/variables.resource
@@ -9,5 +9,6 @@
 ${VOLUME_TYPE}    RWO
 ${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}    0
 ${RWX_VOLUME_FAST_FAILOVER}    false
 ${DATA_ENGINE}    v1
+${VOLUME_STATE_CHECK_TIMEOUT}    120
 @{powered_off_nodes}=
diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource
index 921de837c..ee23739fc 100644
--- a/e2e/keywords/volume.resource
+++ b/e2e/keywords/volume.resource
@@ -141,18 +141,25 @@ Wait for volume ${volume_id} degraded
 Check volume ${volume_id} replica on node ${node_id} kept in stopped
     ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
     ${node_name} =    get_node_by_index    ${node_id}
-    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+    FOR    ${i}    IN RANGE    ${VOLUME_STATE_CHECK_TIMEOUT}
         wait_for_replica_stopped    ${volume_name}    ${node_name}
         Sleep    ${RETRY_INTERVAL}
     END
 
-Check for volume ${volume_id} kept in degraded
+Check volume ${volume_id} kept in degraded
     ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
-    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+    FOR    ${i}    IN RANGE    ${VOLUME_STATE_CHECK_TIMEOUT}
         wait_for_volume_degraded    ${volume_name}
         Sleep    ${RETRY_INTERVAL}
     END
 
+Check volume ${volume_id} kept in attaching
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    FOR    ${i}    IN RANGE    ${VOLUME_STATE_CHECK_TIMEOUT}
+        wait_for_volume_attaching    ${volume_name}
+        Sleep    ${RETRY_INTERVAL}
+    END
+
 Check volume ${volume_id} kept in detached
     ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
     FOR    ${i}    IN RANGE    ${LOOP_COUNT}
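The new keywords are meant to be used in pairs: snapshot the engine and replica names before a migration is triggered, then assert on them once the volume settles. Should Be Equal As Strings is used for the replica check because get_replica_names returns a Python list, so the comparison is done on the rendered lists. A minimal Robot sketch of the intended flow, assuming a volume that is already attached and healthy (it mirrors the assertions added to test_migration.robot further down):

    Get volume 0 engine and replica names
    And Attach volume 0 to node 1
    Then Wait for volume 0 migration to be ready
    And Detach volume 0 from node 0
    And Wait for volume 0 to migrate to node 1
    And Volume 0 migration should succeed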
diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py
index fdaa94f62..302ab151d 100644
--- a/e2e/libs/engine/crd.py
+++ b/e2e/libs/engine/crd.py
@@ -1,5 +1,3 @@
-import logging
-
 from kubernetes import client
 
 from engine.base import Base
@@ -61,3 +59,11 @@ def validate_engine_setting(self, volume_name, setting_name, value):
         for engine in engines:
             assert str(engine["spec"][setting_name]) == value, \
                 f"Expected volume {volume_name} engine setting {setting_name} is {value}, but it's {str(engine['spec'][setting_name])}"
+
+    def get_engine_name(self, volume_name):
+        logging(f"Getting volume {volume_name} engine name")
+        engines = self.get_engines(volume_name)
+        assert len(engines) == 1, f"Expect volume {volume_name} only has one engine, but there are {engines}"
+        engine_name = engines[0]["metadata"]["name"]
+        logging(f"Got volume {volume_name} engine name {engine_name}")
+        return engine_name
diff --git a/e2e/libs/engine/engine.py b/e2e/libs/engine/engine.py
index d57f06777..b881181d2 100644
--- a/e2e/libs/engine/engine.py
+++ b/e2e/libs/engine/engine.py
@@ -42,5 +42,8 @@ def get_engine_state(self, volume_name, node_name):
             engines_states[engine_name] = engine_state
         return engines_states
 
+    def get_engine_name(self, volume_name):
+        return self.engine.get_engine_name(volume_name)
+
     def validate_engine_setting(self, volume_name, setting_name, value):
         return self.engine.validate_engine_setting(volume_name, setting_name, value)
diff --git a/e2e/libs/host/aws.py b/e2e/libs/host/aws.py
index 180610f97..4407a3252 100644
--- a/e2e/libs/host/aws.py
+++ b/e2e/libs/host/aws.py
@@ -80,6 +80,7 @@ def power_off_node(self, power_off_node_name, waiting=True):
         waiter = self.aws_client.get_waiter('instance_stopped')
         waiter.wait(InstanceIds=instance_ids)
         logging(f"Stopped instances")
+        self.node.wait_for_node_down(power_off_node_name)
 
     def power_on_node(self, power_on_node_name):
         instance_ids = [self.mapping[power_on_node_name]]
@@ -89,3 +90,4 @@ def power_on_node(self, power_on_node_name):
         waiter = self.aws_client.get_waiter('instance_running')
         waiter.wait(InstanceIds=instance_ids)
         logging(f"Started instances")
+        self.node.wait_for_node_up(power_on_node_name)
diff --git a/e2e/libs/host/harvester.py b/e2e/libs/host/harvester.py
index 4e23c88ea..908bbbb4d 100644
--- a/e2e/libs/host/harvester.py
+++ b/e2e/libs/host/harvester.py
@@ -88,6 +88,8 @@ def power_off_node(self, node_name, waiting=True):
                 time.sleep(self.retry_interval)
         assert stopped, f"Expected vm {vm_id} to be stopped but it's not"
 
+        self.node.wait_for_node_down(node_name)
+
     def power_on_node(self, node_name):
         vm_id = self.mapping[node_name]
 
@@ -115,3 +117,5 @@ def power_on_node(self, node_name):
                 logging(f"Getting vm status failed with error {e}")
                 time.sleep(self.retry_interval)
         assert started, f"Expected vm {vm_id} to be started but it's not"
+
+        self.node.wait_for_node_up(node_name)
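Both hosts now block until Kubernetes actually reflects the power state: wait_for_node_down and wait_for_node_up (added to node.py further down) poll the node's Ready condition instead of trusting the cloud API alone. In harvester.py the methods take node_name, so the calls use that parameter; power_off_node_name/power_on_node_name only exist in the AWS variant. A standalone sketch of the condition check, assuming the kubernetes Python client with kubeconfig credentials (get_node_by_name is assumed to wrap a similar read_node call):

    from kubernetes import client, config

    def node_is_ready(node_name: str) -> bool:
        # Read the node object and inspect its Ready condition.
        config.load_kube_config()
        node = client.CoreV1Api().read_node(node_name)
        for condition in node.status.conditions:
            if condition.type == "Ready":
                # "False" and "Unknown" (typical after a power-off) both count as down.
                return condition.status == "True"
        return False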
diff --git a/e2e/libs/keywords/engine_keywords.py b/e2e/libs/keywords/engine_keywords.py
index c9eea58eb..082dc608a 100644
--- a/e2e/libs/keywords/engine_keywords.py
+++ b/e2e/libs/keywords/engine_keywords.py
@@ -9,5 +9,8 @@ def __init__(self):
     def get_engine_instance_manager_name(self, volume_name):
         return self.engine.get_engine_instance_manager_name(volume_name)
 
+    def get_engine_name(self, volume_name):
+        return self.engine.get_engine_name(volume_name)
+
     def validate_engine_setting(self, volume_name, setting_name, value):
         return self.engine.validate_engine_setting(volume_name, setting_name, value)
diff --git a/e2e/libs/keywords/replica_keywords.py b/e2e/libs/keywords/replica_keywords.py
index f02b757c6..0ef1534c6 100644
--- a/e2e/libs/keywords/replica_keywords.py
+++ b/e2e/libs/keywords/replica_keywords.py
@@ -11,3 +11,6 @@ def validate_replica_setting(self, volume_name, setting_name, value):
 
     def get_replicas(self, volume_name=None, node_name=None, disk_uuid=None):
         return self.replica.get(volume_name, node_name, disk_uuid)
+
+    def get_replica_names(self, volume_name, numberOfReplicas=3):
+        return self.replica.get_replica_names(volume_name, numberOfReplicas)
diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py
index 7e48fe5d2..a45f15554 100644
--- a/e2e/libs/node/node.py
+++ b/e2e/libs/node/node.py
@@ -175,6 +175,8 @@ def set_node(self, node_name: str, allowScheduling: bool, evictionRequested: boo
     def set_node_scheduling(self, node_name, allowScheduling=True, retry=False):
         node = get_longhorn_client().by_id_node(node_name)
 
+        logging(f"Setting node {node_name} allowScheduling to {allowScheduling}")
+
         if node.tags is None:
             node.tags = []
 
@@ -250,3 +252,29 @@ def wait_for_disk_not_in_pressure(self, node_name, disk_name):
     def get_disk_uuid(self, node_name, disk_name):
         node = get_longhorn_client().by_id_node(node_name)
         return node["disks"][disk_name]["diskUUID"]
+
+    def wait_for_node_down(self, node_name):
+        down = False
+        for i in range(self.retry_count):
+            logging(f"Waiting for k8s node {node_name} down ... ({i})")
+            node = self.get_node_by_name(node_name)
+            for condition in node.status.conditions:
+                if condition.type == "Ready" and condition.status != "True":
+                    down = True
+            if down:
+                break
+            time.sleep(self.retry_interval)
+        assert down, f"Waiting for node {node_name} down failed: {node.status.conditions}"
+
+    def wait_for_node_up(self, node_name):
+        up = False
+        for i in range(self.retry_count):
+            logging(f"Waiting for k8s node {node_name} up ... ({i})")
+            node = self.get_node_by_name(node_name)
+            for condition in node.status.conditions:
+                if condition.type == "Ready" and condition.status == "True":
+                    up = True
+            if up:
+                break
+            time.sleep(self.retry_interval)
+        assert up, f"Waiting for node {node_name} up failed: {node.status.conditions}"
({i})") + node = self.get_node_by_name(node_name) + for condition in node.status.conditions: + if condition.type == "Ready" and condition.status == "True": + up = True + if up: + break + time.sleep(self.retry_interval) + assert up, f"Waiting for node {node_name} up failed: {node.status.conditions}" diff --git a/e2e/libs/replica/crd.py b/e2e/libs/replica/crd.py index f2e4a0712..954d3d61c 100644 --- a/e2e/libs/replica/crd.py +++ b/e2e/libs/replica/crd.py @@ -28,6 +28,14 @@ def get(self, volume_name=None, node_name=None, disk_uuid=None): ) return replicas["items"] + def get_replica_names(self, volume_name, numberOfReplicas): + logging(f"Getting volume {volume_name} replica names") + replicas = self.get(volume_name) + assert len(replicas) == numberOfReplicas, f"Expect volume {volume_name} has {numberOfReplicas} replicas, but there are {replicas}" + replica_names = [ replica['metadata']['name'] for replica in replicas ] + logging(f"Got volume {volume_name} replica names {replica_names}") + return replica_names + def delete(self, volume_name, node_name): if volume_name == "" or node_name == "": logging(f"Deleting all replicas") diff --git a/e2e/libs/replica/replica.py b/e2e/libs/replica/replica.py index 31312a8e8..3d86327a8 100644 --- a/e2e/libs/replica/replica.py +++ b/e2e/libs/replica/replica.py @@ -19,6 +19,9 @@ def delete(self, volume_name="", node_name=""): def get(self, volume_name, node_name, disk_uuid=None): return self.replica.get(volume_name, node_name, disk_uuid) + def get_replica_names(self, volume_name, numberOfReplicas): + return self.replica.get_replica_names(volume_name, numberOfReplicas) + def wait_for_rebuilding_start(self, volume_name, node_name): return self.replica.wait_for_rebuilding_start(volume_name,node_name) diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 5539cc57c..c5d7607ed 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -255,6 +255,12 @@ def wait_for_volume_state(self, volume_name, desired_state): time.sleep(self.retry_interval) assert volume["status"]["state"] == desired_state + def wait_for_volume_attaching(self, volume_name): + self.wait_for_volume_state(volume_name, "attaching") + volume = self.get(volume_name) + assert volume["spec"]["nodeID"] != "" + assert volume["status"]["currentNodeID"] == "" + def is_replica_running(self, volume_name, node_name, is_running): return Rest().is_replica_running(volume_name, node_name, is_running) @@ -306,15 +312,18 @@ def wait_for_volume_migration_to_be_ready(self, volume_name): logging(f"Waiting for volume {volume_name} migration to be ready ({i}) ...") try: engines = self.engine.get_engines(volume_name) + volume = self.get(volume_name) ready = len(engines) == 2 for engine in engines: ready = ready and engine['status']['endpoint'] + ready = volume['spec']['migrationNodeID'] and volume['spec']['migrationNodeID'] == volume['status']['currentMigrationNodeID'] + ready = volume['spec']['nodeID'] and volume['spec']['nodeID'] == volume['status']['currentNodeID'] if ready: break except Exception as e: logging(f"Getting volume {volume_name} engines error: {e}") time.sleep(self.retry_interval) - assert ready + assert ready, f"Waiting for volume {volume_name} migration to be ready failed: engines = {engines}, volume = {volume}" def wait_for_volume_migration_complete(self, volume_name, node_name): complete = False @@ -322,13 +331,17 @@ def wait_for_volume_migration_complete(self, volume_name, node_name): logging(f"Waiting for volume {volume_name} migration to node {node_name} complete ({i}) ...") 
diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py
index 8ba8eec4a..dcefa23a6 100644
--- a/e2e/libs/volume/volume.py
+++ b/e2e/libs/volume/volume.py
@@ -56,7 +56,7 @@ def wait_for_volume_detached(self, volume_name):
         self.volume.wait_for_volume_state(volume_name, "detached")
 
     def wait_for_volume_attaching(self, volume_name):
-        self.volume.wait_for_volume_state(volume_name, "attaching")
+        self.volume.wait_for_volume_attaching(volume_name)
 
     def wait_for_volume_stuck_attaching(self, volume_name):
         self.volume.wait_for_volume_keep_in_state(volume_name, "attaching")
diff --git a/e2e/tests/negative/live_migration.robot b/e2e/tests/negative/live_migration.robot
new file mode 100644
index 000000000..9753214f9
--- /dev/null
+++ b/e2e/tests/negative/live_migration.robot
@@ -0,0 +1,41 @@
+*** Settings ***
+Documentation    Negative Test Cases
+
+Test Tags    negative
+
+Resource    ../keywords/variables.resource
+Resource    ../keywords/common.resource
+Resource    ../keywords/volume.resource
+Resource    ../keywords/host.resource
+Resource    ../keywords/migration.resource
+
+Test Setup    Set test environment
+Test Teardown    Cleanup test resources
+
+
+*** Test Cases ***
+Migration Confirmation After Migration Node Down
+    Given Create volume 0 with    migratable=True    accessMode=RWX    dataEngine=${DATA_ENGINE}
+    And Attach volume 0 to node 0
+    And Wait for volume 0 healthy
+    And Write data to volume 0
+    And Get volume 0 engine and replica names
+
+    And Attach volume 0 to node 1
+    And Wait for volume 0 migration to be ready
+
+    # power off migration node
+    When Power off node 1
+    # migration confirmation by detaching from the original node
+    And Detach volume 0 from node 0
+
+    # volume stuck in attaching status and waiting for migration node to come back
+    Then Check volume 0 kept in attaching
+    And Volume 0 migration should fail or rollback
+
+    # power on migration node
+    When Power on off nodes
+
+    Then Wait for volume 0 to migrate to node 1
+    And Wait for volume 0 healthy
+    And Check volume 0 data is intact
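While the migration target node is powered off, the confirmed migration cannot finish, so the volume sits in exactly the state that wait_for_volume_attaching (added to volume/crd.py above) checks for. As a predicate over a dict shaped like the volume CR (name illustrative):

    def stuck_attaching(volume):
        # Attach has been requested (spec.nodeID set) but not yet realized
        # (status.currentNodeID still empty) while the target node is down.
        return (volume["status"]["state"] == "attaching"
                and volume["spec"]["nodeID"] != ""
                and volume["status"]["currentNodeID"] == "")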
diff --git a/e2e/tests/regression/test_basic.robot b/e2e/tests/regression/test_basic.robot
index c8fa7d506..9f8924de1 100644
--- a/e2e/tests/regression/test_basic.robot
+++ b/e2e/tests/regression/test_basic.robot
@@ -147,7 +147,7 @@ Replica Rebuilding
     And Enable node 1 default disk
 
     Then Check volume 0 replica on node 1 kept in stopped
-    And Check for volume 0 kept in degraded
+    And Check volume 0 kept in degraded
 
     And Enable node 1 scheduling
     Then Wait until volume 0 replica rebuilding started on node 1
diff --git a/e2e/tests/regression/test_migration.robot b/e2e/tests/regression/test_migration.robot
index 850f27082..cf81e179a 100644
--- a/e2e/tests/regression/test_migration.robot
+++ b/e2e/tests/regression/test_migration.robot
@@ -10,6 +10,7 @@ Resource    ../keywords/persistentvolumeclaim.resource
 Resource    ../keywords/recurringjob.resource
 Resource    ../keywords/statefulset.resource
 Resource    ../keywords/volume.resource
+Resource    ../keywords/migration.resource
 
 Test Setup    Set test environment
 Test Teardown    Cleanup test resources
@@ -32,11 +33,13 @@ Test Migration Confirm
     Given Create volume 0 with    migratable=True    accessMode=RWX    dataEngine=${DATA_ENGINE}
     When Attach volume 0 to node 0
     And Wait for volume 0 healthy
+    And Get volume 0 engine and replica names
     And Write data to volume 0
 
     And Attach volume 0 to node 1
     Then Wait for volume 0 migration to be ready
     And Detach volume 0 from node 0
     And Wait for volume 0 to migrate to node 1
+    And Volume 0 migration should succeed
     And Wait for volume 0 healthy
     And Check volume 0 data is intact
@@ -57,10 +60,12 @@ Test Migration Rollback
     Given Create volume 0 with    migratable=True    accessMode=RWX    dataEngine=${DATA_ENGINE}
     When Attach volume 0 to node 0
     And Wait for volume 0 healthy
+    And Get volume 0 engine and replica names
     And Write data to volume 0
     And Attach volume 0 to node 1
     Then Wait for volume 0 migration to be ready
     And Detach volume 0 from node 1
     And Wait for volume 0 to stay on node 0
+    And Volume 0 migration should fail or rollback
     And Wait for volume 0 healthy
     And Check volume 0 data is intact