From beebd8c63fa722bdbff1437caff43682dbea2ebe Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 7 Oct 2024 11:09:59 +0800 Subject: [PATCH] test(robot): add test case Test Longhorn components recovery longhorn/longhorn#9536 Signed-off-by: Chris --- e2e/keywords/backing_image.resource | 6 + e2e/keywords/longhorn.resource | 16 ++ e2e/keywords/sharemanager.resource | 10 + e2e/keywords/workload.resource | 21 +- e2e/libs/backing_image/backing_image.py | 18 +- e2e/libs/backing_image/base.py | 16 ++ e2e/libs/backing_image/crd.py | 91 ++++++++ e2e/libs/backing_image/rest.py | 12 ++ e2e/libs/k8s/k8s.py | 27 ++- e2e/libs/keywords/backing_image_keywords.py | 20 ++ e2e/libs/keywords/k8s_keywords.py | 4 + e2e/libs/keywords/sharemanager_keywords.py | 12 ++ e2e/libs/keywords/workload_keywords.py | 12 +- e2e/libs/sharemanager/base.py | 16 ++ e2e/libs/sharemanager/crd.py | 52 ++++- e2e/libs/sharemanager/rest.py | 12 ++ e2e/libs/sharemanager/sharemanager.py | 12 ++ e2e/libs/workload/workload.py | 4 +- e2e/tests/negative/component_resilience.robot | 203 ++++++++++++++++++ 19 files changed, 548 insertions(+), 16 deletions(-) create mode 100644 e2e/libs/backing_image/crd.py create mode 100644 e2e/tests/negative/component_resilience.robot diff --git a/e2e/keywords/backing_image.resource b/e2e/keywords/backing_image.resource index 0e974f3ae0..5131889ba8 100644 --- a/e2e/keywords/backing_image.resource +++ b/e2e/keywords/backing_image.resource @@ -22,3 +22,9 @@ Clean up backing image ${backing_image_name} from a disk Delete backing image ${backing_image_name} delete_backing_image ${backing_image_name} + +Delete backing image managers and wait for recreation + delete_all_backing_image_managers_and_wait_for_recreation + +Wait backing image managers running + wait_all_backing_image_managers_running diff --git a/e2e/keywords/longhorn.resource b/e2e/keywords/longhorn.resource index 163baf9806..f413066528 100644 --- a/e2e/keywords/longhorn.resource +++ b/e2e/keywords/longhorn.resource @@ -66,3 
+66,19 @@ Check all Longhorn CRD removed Install Longhorn install_longhorn_system + +Delete instance-manager of volume ${volume_id} + ${volume_name} = generate_name_with_suffix volume ${volume_id} + ${node_name} = get_volume_node ${volume_name} + ${pod_name} = get_instance_manager_on_node ${node_name} + delete_pod ${pod_name} longhorn-system + +Delete instance-manager of deployment ${deployment_id} volume + ${deployment_name} = generate_name_with_suffix deployment ${deployment_id} + ${volume_name} = get_workload_volume_name ${deployment_name} + ${node_name} = get_volume_node ${volume_name} + ${pod_name} = get_instance_manager_on_node ${node_name} + delete_pod ${pod_name} longhorn-system + +Wait for Longhorn components all running + wait_for_namespace_pods_running longhorn-system diff --git a/e2e/keywords/sharemanager.resource b/e2e/keywords/sharemanager.resource index fd883a9542..6fe84fda83 100644 --- a/e2e/keywords/sharemanager.resource +++ b/e2e/keywords/sharemanager.resource @@ -20,3 +20,13 @@ Check sharemanager ${condition} using headless service Wait for all sharemanager to be deleted wait_for_sharemanagers_deleted + +Delete sharemanager of deployment ${deployment_id} and wait for recreation + ${deployment_name} = generate_name_with_suffix deployment ${deployment_id} + ${volume_name} = get_workload_volume_name ${deployment_name} + delete_sharemanager_and_wait_for_recreation ${volume_name} + +Wait for sharemanager of deployment ${deployment_id} running + ${deployment_name} = generate_name_with_suffix deployment ${deployment_id} + ${volume_name} = get_workload_volume_name ${deployment_name} + wait_for_share_manager_running ${volume_name} diff --git a/e2e/keywords/workload.resource b/e2e/keywords/workload.resource index e3dce4a006..c3eb835b6c 100644 --- a/e2e/keywords/workload.resource +++ b/e2e/keywords/workload.resource @@ -186,10 +186,6 @@ Check ${workload_kind} ${workload_id} pod is ${expect_state} on another node ${node_name} = get_pod_node ${pod} Should Not 
Be Equal ${node_name} ${last_volume_node} -Delete Longhorn ${workload_kind} ${workload_name} pod on node ${node_id} - ${node_name} = get_node_by_index ${node_id} - delete_workload_pod_on_node ${workload_name} ${node_name} longhorn-system - Trim ${workload_kind} ${workload_id} volume should ${condition} ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} @@ -200,3 +196,20 @@ Trim ${workload_kind} ${workload_id} volume should ${condition} ELSE Fail "Invalid condition value: ${condition}" END + +Delete Longhorn ${workload_kind} ${workload_name} pod on node ${node_id} + ${node_name} = get_node_by_index ${node_id} + + IF '${workload_name}' == 'engine-image' + ${label_selector} = Set Variable longhorn.io/component=engine-image + ELSE IF '${workload_name}' == 'instance-manager' + ${label_selector} = Set Variable longhorn.io/component=instance-manager + ELSE + ${label_selector} = Set Variable ${EMPTY} + END + delete_workload_pod_on_node ${workload_name} ${node_name} longhorn-system ${label_selector} + +Delete Longhorn ${workload_kind} ${workload_name} pod + ${pod_name} = get_workload_pod_name ${workload_name} longhorn-system + Log ${pod_name} + delete_pod ${pod_name} longhorn-system diff --git a/e2e/libs/backing_image/backing_image.py b/e2e/libs/backing_image/backing_image.py index a2432dde29..12666deee8 100644 --- a/e2e/libs/backing_image/backing_image.py +++ b/e2e/libs/backing_image/backing_image.py @@ -1,6 +1,6 @@ from backing_image.base import Base from backing_image.rest import Rest - +from backing_image.crd import CRD from strategy import LonghornOperationStrategy @@ -30,3 +30,19 @@ def delete(self, bi_name): def cleanup_backing_images(self): return self.backing_image.cleanup_backing_images() + + def delete_backing_image_manager(self, name): + self.backing_image = CRD() + return self.backing_image.delete_backing_image_manager(name) + + def wait_all_backing_image_managers_running(self): + self.backing_image = CRD() + return 
self.backing_image.wait_all_backing_image_managers_running() + + def wait_backing_image_manager_restart(self, name, last_creation_time): + self.backing_image = CRD() + self.backing_image.wait_backing_image_manager_restart(name, last_creation_time) + + def list_backing_image_manager(self): + self.backing_image = CRD() + return self.backing_image.list_backing_image_manager() diff --git a/e2e/libs/backing_image/base.py b/e2e/libs/backing_image/base.py index 4ed7cf6f49..09ae461179 100644 --- a/e2e/libs/backing_image/base.py +++ b/e2e/libs/backing_image/base.py @@ -30,3 +30,19 @@ def delete(self, bi_name): @abstractmethod def cleanup_backing_images(self): return NotImplemented + + @abstractmethod + def wait_all_backing_image_managers_running(self): + return NotImplemented + + @abstractmethod + def list_backing_image_manager(self): + return NotImplemented + + @abstractmethod + def delete_backing_image_manager(self, name): + return NotImplemented + + @abstractmethod + def wait_backing_image_manager_restart(self, name, last_creation_time): + return NotImplemented diff --git a/e2e/libs/backing_image/crd.py b/e2e/libs/backing_image/crd.py new file mode 100644 index 0000000000..2410510fc1 --- /dev/null +++ b/e2e/libs/backing_image/crd.py @@ -0,0 +1,91 @@ +from kubernetes import client +from datetime import datetime +from backing_image.base import Base + +from utility.utility import logging +from utility.utility import get_retry_count_and_interval +import time + +class CRD(Base): + def __init__(self): + self.obj_api = client.CustomObjectsApi() + self.retry_count, self.retry_interval = get_retry_count_and_interval() + + def create(self, bi_name, source_type, url, expected_checksum): + return NotImplemented + + def get(self, bi_name): + return NotImplemented + + def all_disk_file_status_are_ready(self, bi_name): + return NotImplemented + def clean_up_backing_image_from_a_random_disk(self, bi_name): + return NotImplemented + + def delete(self, bi_name): + return NotImplemented + 
+ def wait_for_backing_image_disk_cleanup(self, bi_name, disk_id): + return NotImplemented + + def wait_for_backing_image_delete(self, bi_name): + return NotImplemented + + def cleanup_backing_images(self): + return NotImplemented + + def list_backing_image_manager(self): + label_selector = 'longhorn.io/component=backing-image-manager' + return self.obj_api.list_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="backingimagemanagers", + label_selector=label_selector) + + def delete_backing_image_manager(self, name): + logging(f"deleting backing image manager {name} ...") + self.obj_api.delete_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="backingimagemanagers", + name=name + ) + + def wait_all_backing_image_managers_running(self): + for i in range(self.retry_count): + all_running = True + backing_image_managers = self.list_backing_image_manager() + for backing_image_manager in backing_image_managers["items"]: + current_state = backing_image_manager["status"]["currentState"] + name = backing_image_manager["metadata"]["name"] + logging(f"backing image manager {name} currently in {current_state} state") + if current_state != "running": + all_running = False + if all_running is True: + return + time.sleep(self.retry_interval) + assert False, f"Waiting all backing image manager in running state timeout" + + def wait_backing_image_manager_restart(self, name, last_creation_time): + for i in range(self.retry_count): + time.sleep(self.retry_interval) + try: + backing_image_manager = self.obj_api.get_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="backingimagemanagers", + name=name + ) + except Exception as e: + logging(f"Finding backing image manager {name} failed with error {e}") + continue + + creation_time = backing_image_manager["metadata"]["creationTimestamp"] + fmt = 
"%Y-%m-%dT%H:%M:%SZ" + if datetime.strptime(creation_time, fmt) > datetime.strptime(last_creation_time, fmt): + return + + assert False, f"Wait backing image manager {name} restart failed ..." diff --git a/e2e/libs/backing_image/rest.py b/e2e/libs/backing_image/rest.py index 6b451d935c..ee562cb1ba 100644 --- a/e2e/libs/backing_image/rest.py +++ b/e2e/libs/backing_image/rest.py @@ -110,3 +110,15 @@ def cleanup_backing_images(self): break time.sleep(self.retry_interval) assert len(get_longhorn_client().list_backing_image()) == 0 + + def delete_backing_image_manager(self, name): + return NotImplemented + + def wait_all_backing_image_managers_running(self): + return NotImplemented + + def wait_backing_image_manager_restart(self, name, last_creation_time): + return NotImplemented + + def list_backing_image_manager(self): + return NotImplemented diff --git a/e2e/libs/k8s/k8s.py b/e2e/libs/k8s/k8s.py index faff415d7c..5fcb3bb404 100644 --- a/e2e/libs/k8s/k8s.py +++ b/e2e/libs/k8s/k8s.py @@ -1,12 +1,12 @@ import time -import subprocess import asyncio -import os from kubernetes import client from kubernetes.client.rest import ApiException from workload.pod import create_pod from workload.pod import delete_pod from workload.pod import new_pod_manifest +from workload.pod import wait_for_pod_status +from workload.pod import get_pod from workload.constant import IMAGE_UBUNTU from utility.utility import subprocess_exec_cmd from utility.utility import logging @@ -95,6 +95,7 @@ def check_instance_manager_pdb_not_exist(instance_manager): exec_cmd = ["kubectl", "get", "pdb", "-n", "longhorn-system"] res = subprocess_exec_cmd(exec_cmd) assert instance_manager not in res.decode('utf-8') + def wait_namespaced_job_complete(job_label, namespace): retry_count, retry_interval = get_retry_count_and_interval() api = client.BatchV1Api() @@ -170,3 +171,25 @@ def delete_namespace(namespace): api.delete_namespace(name=namespace) except ApiException as e: assert e.status == 404 + +def 
wait_for_namespace_pods_running(namespace): + retry_count, retry_interval = get_retry_count_and_interval() + + for i in range(retry_count): + time.sleep(retry_interval) + pod_list = list_namespace_pods(namespace) + all_running = True + + for pod in pod_list.items: + pod_name = pod.metadata.name + pod_status = pod.status.phase + + if pod_status != "Running": + logging(f"Pod {pod_name} is in {pod_status} state, waiting...") + all_running = False + + if all_running: + logging(f"All pods in namespace {namespace} are in Running state!") + return + + assert False, f"wait all pod in namespace {namespace} running failed" diff --git a/e2e/libs/keywords/backing_image_keywords.py b/e2e/libs/keywords/backing_image_keywords.py index f6526e297a..08927ee276 100644 --- a/e2e/libs/keywords/backing_image_keywords.py +++ b/e2e/libs/keywords/backing_image_keywords.py @@ -20,3 +20,23 @@ def delete_backing_image(self, bi_name): def cleanup_backing_images(self): self.backing_image.cleanup_backing_images() + + def delete_backing_image_manager(self, name): + self.backing_image.delete_backing_image_manager(name) + + def wait_all_backing_image_managers_running(self): + self.backing_image.wait_all_backing_image_managers_running() + + def wait_backing_image_manager_restart(self, name, last_creation_time): + self.backing_image.wait_backing_image_manager_restart(name, last_creation_time) + + def list_backing_image_manager(self): + return self.backing_image.list_backing_image_manager() + + def delete_all_backing_image_managers_and_wait_for_recreation(self): + backing_image_managers = self.backing_image.list_backing_image_manager() + for backing_image in backing_image_managers["items"]: + name = backing_image["metadata"]["name"] + last_creation_time = backing_image["metadata"]["creationTimestamp"] + self.backing_image.delete_backing_image_manager(name) + self.backing_image.wait_backing_image_manager_restart(name, last_creation_time) diff --git a/e2e/libs/keywords/k8s_keywords.py 
b/e2e/libs/keywords/k8s_keywords.py index ed5b0b7f84..781abc4523 100644 --- a/e2e/libs/keywords/k8s_keywords.py +++ b/e2e/libs/keywords/k8s_keywords.py @@ -9,6 +9,7 @@ from k8s.k8s import check_node_cordoned from k8s.k8s import get_instance_manager_on_node from k8s.k8s import check_instance_manager_pdb_not_exist +from k8s.k8s import wait_for_namespace_pods_running from utility.utility import logging from node import Node @@ -78,3 +79,6 @@ def get_instance_manager_on_node(self, node_name): def check_instance_manager_pdb_not_exist(self, instance_manager): return check_instance_manager_pdb_not_exist(instance_manager) + + def wait_for_namespace_pods_running(self, namespace): + return wait_for_namespace_pods_running(namespace) diff --git a/e2e/libs/keywords/sharemanager_keywords.py b/e2e/libs/keywords/sharemanager_keywords.py index 95c8a4b861..cafc00fbca 100644 --- a/e2e/libs/keywords/sharemanager_keywords.py +++ b/e2e/libs/keywords/sharemanager_keywords.py @@ -47,3 +47,15 @@ def wait_for_sharemanagers_deleted(self, name=[]): time.sleep(retry_interval) assert AssertionError, f"Failed to wait for all sharemanagers to be deleted" + + def delete_sharemanager(self, name): + return self.sharemanager.delete(name) + + def delete_sharemanager_and_wait_for_recreation(self, name): + sharemanager = self.sharemanager.get(name) + last_creation_time = sharemanager["metadata"]["creationTimestamp"] + self.sharemanager.delete(name) + self.sharemanager.wait_for_restart(name, last_creation_time) + + def wait_for_share_manager_running(self, name): + return self.sharemanager.wait_for_running(name) diff --git a/e2e/libs/keywords/workload_keywords.py b/e2e/libs/keywords/workload_keywords.py index d27845d91a..1a28f5f9c5 100644 --- a/e2e/libs/keywords/workload_keywords.py +++ b/e2e/libs/keywords/workload_keywords.py @@ -46,9 +46,9 @@ def create_pod(self, pod_name, claim_name): logging(f'Creating pod {pod_name} using pvc {claim_name}') create_pod(new_busybox_manifest(pod_name, claim_name)) - def 
delete_pod(self, pod_name): + def delete_pod(self, pod_name, namespace='default'): logging(f'Deleting pod {pod_name}') - delete_pod(pod_name) + delete_pod(pod_name, namespace) def cleanup_pods(self): cleanup_pods() @@ -61,15 +61,15 @@ def check_pod_data_checksum(self, expected_checksum, pod_name, file_name): logging(f'Checking checksum for file {file_name} in pod {pod_name}') check_pod_data_checksum(expected_checksum, pod_name, file_name) - def delete_workload_pod_on_node(self, workload_name, node_name, namespace="default"): - pods = get_workload_pods(workload_name, namespace=namespace) + def delete_workload_pod_on_node(self, workload_name, node_name, namespace="default", label_selector=""): + pods = get_workload_pods(workload_name, namespace=namespace, label_selector=label_selector) for pod in pods: if pod.spec.node_name == node_name: logging(f'Deleting pod {pod.metadata.name} on node {node_name}') delete_pod(pod.metadata.name, namespace=namespace) - def get_workload_pod_name(self, workload_name): - return get_workload_pod_names(workload_name)[0] + def get_workload_pod_name(self, workload_name, namespace="default"): + return get_workload_pod_names(workload_name, namespace)[0] def get_workload_persistent_volume_claim_name(self, workload_name): return get_workload_persistent_volume_claim_name(workload_name) diff --git a/e2e/libs/sharemanager/base.py b/e2e/libs/sharemanager/base.py index ff1fac613c..f0ed4aa9b5 100644 --- a/e2e/libs/sharemanager/base.py +++ b/e2e/libs/sharemanager/base.py @@ -5,3 +5,19 @@ class Base(ABC): @abstractmethod def list(self): return NotImplemented + + @abstractmethod + def get(self, name): + return NotImplemented + + @abstractmethod + def delete(self, name): + return NotImplemented + + @abstractmethod + def wait_for_running(self, name): + return NotImplemented + + @abstractmethod + def wait_for_restart(self, name, last_creation_time): + return NotImplemented diff --git a/e2e/libs/sharemanager/crd.py b/e2e/libs/sharemanager/crd.py index 
0955d6b9c9..8c481e69f0 100644 --- a/e2e/libs/sharemanager/crd.py +++ b/e2e/libs/sharemanager/crd.py @@ -1,12 +1,16 @@ from kubernetes import client +from datetime import datetime from sharemanager.base import Base - +from utility.utility import logging +from utility.utility import get_retry_count_and_interval +import time class CRD(Base): def __init__(self): self.obj_api = client.CustomObjectsApi() + self.retry_count, self.retry_interval = get_retry_count_and_interval() def list(self, label_selector=None): return self.obj_api.list_namespaced_custom_object( @@ -16,3 +20,49 @@ def list(self, label_selector=None): plural="sharemanagers", label_selector=label_selector ) + + def get(self, name): + return self.obj_api.get_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="sharemanagers", + name=name + ) + + def delete(self, name): + logging(f"deleting sharemanager {name} ...") + return self.obj_api.delete_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="sharemanagers", + name=name + ) + + def wait_for_running(self, name): + for i in range(self.retry_count): + sharemanager = self.get(name) + current_status = sharemanager["status"]["state"] + logging(f"wait sharemanager {name} running, current status = {current_status}") + if current_status == "running": + return + time.sleep(self.retry_interval) + + assert False, f"Failed to wait sharemanager {name} in running state" + + def wait_for_restart(self, name, last_creation_time): + for i in range(self.retry_count): + time.sleep(self.retry_interval) + try: + sharemanager = self.get(name) + except Exception as e: + logging(f"Finding sharemanager {name} failed with error {e}") + continue + + creation_time = sharemanager["metadata"]["creationTimestamp"] + fmt = "%Y-%m-%dT%H:%M:%SZ" + if datetime.strptime(creation_time, fmt) > datetime.strptime(last_creation_time, fmt): + return + + assert False, f"Wait share 
manager {name} restart failed ..." diff --git a/e2e/libs/sharemanager/rest.py b/e2e/libs/sharemanager/rest.py index dbd4d83e22..ba93cb9118 100644 --- a/e2e/libs/sharemanager/rest.py +++ b/e2e/libs/sharemanager/rest.py @@ -10,3 +10,15 @@ def __init__(self): def list(self): return self.longhorn_client.list_share_manager() + + def get(self, name): + return NotImplemented + + def delete(self, name): + return NotImplemented + + def wait_for_running(self, name): + return NotImplemented + + def wait_for_restart(self, name, last_creation_time): + return NotImplemented diff --git a/e2e/libs/sharemanager/sharemanager.py b/e2e/libs/sharemanager/sharemanager.py index fe133f0c88..c617ca3541 100644 --- a/e2e/libs/sharemanager/sharemanager.py +++ b/e2e/libs/sharemanager/sharemanager.py @@ -17,3 +17,15 @@ def __init__(self): def list(self): return self.sharemanager.list() + + def delete(self, name): + return self.sharemanager.delete(name) + + def wait_for_running(self, name): + return self.sharemanager.wait_for_running(name) + + def get(self, name): + return self.sharemanager.get(name) + + def wait_for_restart(self, name, last_creation_time): + return self.sharemanager.wait_for_restart(name, last_creation_time) diff --git a/e2e/libs/workload/workload.py b/e2e/libs/workload/workload.py index ca0fd9bfe1..0311fa342f 100644 --- a/e2e/libs/workload/workload.py +++ b/e2e/libs/workload/workload.py @@ -14,8 +14,8 @@ from workload.pod import wait_for_pod_status -def get_workload_pod_names(workload_name): - pod_list = get_workload_pods(workload_name) +def get_workload_pod_names(workload_name, namespace="default"): + pod_list = get_workload_pods(workload_name, namespace) pod_names = [] for pod in pod_list: pod_names.append(pod.metadata.name) diff --git a/e2e/tests/negative/component_resilience.robot b/e2e/tests/negative/component_resilience.robot new file mode 100644 index 0000000000..4c5cc50596 --- /dev/null +++ b/e2e/tests/negative/component_resilience.robot @@ -0,0 +1,203 @@ +*** Settings 
*** +Documentation Negative Test Cases + +Test Tags negative + +Resource ../keywords/common.resource +Resource ../keywords/volume.resource +Resource ../keywords/backing_image.resource +Resource ../keywords/storageclass.resource +Resource ../keywords/persistentvolumeclaim.resource +Resource ../keywords/k8s.resource +Resource ../keywords/deployment.resource +Resource ../keywords/workload.resource +Resource ../keywords/setting.resource +Resource ../keywords/longhorn.resource +Resource ../keywords/sharemanager.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 +${RWX_VOLUME_FAST_FAILOVER} false +${DATA_ENGINE} v1 + +*** Keywords *** +Delete instance-manager of volume ${volume_id} and wait for recover + When Delete instance-manager of volume ${volume_id} + And Wait for volume ${volume_id} degraded + And Wait for volume ${volume_id} healthy + And Check volume ${volume_id} data is intact + +Delete instance-manager of deployment ${deployment_id} volume and wait for recover + When Delete instance-manager of deployment ${deployment_id} volume + And Wait for volume of deployment ${deployment_id} attached and degraded + And Wait for volume of deployment ${deployment_id} healthy + And Wait for deployment ${deployment_id} pods stable + And Check deployment ${deployment_id} data in file data.txt is intact + +*** Test Cases *** +Test Longhorn components recovery + [Documentation] -- Manual test plan -- + ... Test data setup: + ... Deploy Longhorn on a 3 nodes cluster. + ... Create volume 0 using Longhorn API. + ... Create volume 1 with backing image. + ... Create a RWO volume using the Longhorn storage class(deployment 0) + ... Create a RWX volume using the Longhorn storage class(deployment 1) + ... + ... Write some data in all the volumes created and record the data. + ... Have all the volumes in attached state. + ... + ... Test steps: + ... 
Delete one pod of all the Longhorn components like longhorn-manager, ui, csi components etc and verify they are able to recover. + Given Create volume 0 with dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data to volume 0 + + When Create storageclass longhorn-test with dataEngine=${DATA_ENGINE} + And Create persistentvolumeclaim 0 using RWO volume with longhorn-test storageclass + And Create deployment 0 with persistentvolumeclaim 0 + And Write 100 MB data to file data.txt in deployment 0 + + IF '${DATA_ENGINE}' == 'v1' + When Create backing image bi with url=https://longhorn-backing-image.s3-us-west-1.amazonaws.com/parrot.qcow2 + And Create volume 1 with backingImage=bi dataEngine=${DATA_ENGINE} + And Attach volume 1 + And Wait for volume 1 healthy + And Write data to volume 1 + + When Create storageclass longhorn-test-1 with dataEngine=${DATA_ENGINE} + And Create persistentvolumeclaim 1 using RWX volume with longhorn-test-1 storageclass + And Create deployment 1 with persistentvolumeclaim 1 + And Write 100 MB data to file data.txt in deployment 1 + END + + When Delete Longhorn DaemonSet longhorn-csi-plugin pod on node 1 + And Delete Longhorn Deployment csi-attacher pod on node 1 + And Delete Longhorn Deployment csi-provisioner pod on node 1 + And Delete Longhorn Deployment csi-resizer pod on node 1 + And Delete Longhorn Deployment csi-snapshotter pod on node 1 + And Delete Longhorn DaemonSet longhorn-manager pod on node 1 + And Delete Longhorn DaemonSet engine-image pod on node 1 + And Delete Longhorn component instance-manager pod on node 1 + And Delete Longhorn Deployment longhorn-ui pod + And Delete Longhorn Deployment longhorn-driver-deployer pod + + Then Wait for Longhorn components all running + And Wait for volume 0 healthy + And Check volume 0 data is intact + And Wait for deployment 0 pods stable + And Check deployment 0 data in file data.txt is intact + IF '${DATA_ENGINE}' == 'v1' + And Check volume 1 data is 
intact + And Wait for deployment 1 pods stable + And Check deployment 1 data in file data.txt is intact + END + +Test Longhorn volume recovery + [Documentation] -- Manual test plan -- + ... Test data setup: + ... Deploy Longhorn on a 3 nodes cluster. + ... Create volume 0 using Longhorn API. + ... + ... Write some data in the volume created and compute the md5sum. + ... Have the volume in attached state. + ... + ... Test steps: + ... Delete the IM of the volume and make sure volume recovers. Check the data as well. + ... Start replica rebuilding for the aforementioned volume, and delete the IM-e while it is rebuilding. Verify the recovered volumes. + Given Create volume 0 with dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data to volume 0 + Then Delete instance-manager of volume 0 and wait for recover + + When Delete volume 0 replica on replica node + And Wait until volume 0 replica rebuilding started on replica node + Then Delete instance-manager of volume 0 and wait for recover + +Test Longhorn backing image volume recovery + [Documentation] -- Manual test plan -- + ... Test data setup: + ... Deploy Longhorn on a 3 nodes cluster. + ... Create volume 0 with backing image. + ... + ... Write some data in the volume created and compute the md5sum. + ... Have the volume in attached state. + ... + ... Test steps: + ... Delete the IM of the volume and make sure volume recovers. Check the data as well. + ... Start replica rebuilding for the aforementioned volume, and delete the IM-e while it is rebuilding. Verify the recovered volume. + ... Delete the backing image manager pod and verify the pod gets recreated. 
+ IF '${DATA_ENGINE}' == 'v1' + When Create backing image bi with url=https://longhorn-backing-image.s3-us-west-1.amazonaws.com/parrot.qcow2 + And Create volume 0 with backingImage=bi dataEngine=${DATA_ENGINE} + And Attach volume 0 + And Wait for volume 0 healthy + And Write data to volume 0 + Then Delete instance-manager of volume 0 and wait for recover + + When Delete volume 0 replica on replica node + And Wait until volume 0 replica rebuilding started on replica node + Then Delete instance-manager of volume 0 and wait for recover + + When Delete backing image managers and wait for recreation + Then Wait backing image managers running + END + +Test Longhorn dynamic provisioned RWX volume recovery + [Documentation] -- Manual test plan -- + ... Test data setup: + ... Deploy Longhorn on a 3 nodes cluster. + ... Create a RWX volume using the Longhorn storage class + ... + ... Write some data in the volume created and compute the md5sum. + ... Have the volume in attached state. + ... + ... Test steps: + ... Delete the IM of the volume and make sure volume recovers. Check the data as well. + ... Start replica rebuilding for the aforementioned volume, and delete the IM-e while it is rebuilding. Verify the recovered volume. + ... Delete the Share-manager pod and verify the RWX volume is able to recover. Verify the data too. 
+ IF '${DATA_ENGINE}' == 'v1' + When Create storageclass longhorn-test with dataEngine=${DATA_ENGINE} + And Create persistentvolumeclaim 0 using RWX volume with longhorn-test storageclass + And Create deployment 0 with persistentvolumeclaim 0 + And Write 500 MB data to file data.txt in deployment 0 + Then Delete instance-manager of deployment 0 volume and wait for recover + + When Delete replica of deployment 0 volume on replica node + And Wait until volume of deployment 0 replica rebuilding started on replica node + Then Delete instance-manager of deployment 0 volume and wait for recover + + When Delete sharemanager of deployment 0 and wait for recreation + And Wait for sharemanager of deployment 0 running + And Wait for deployment 0 pods stable + And Check deployment 0 data in file data.txt is intact + END + +Test Longhorn dynamic provisioned RWO volume recovery + [Documentation] -- Manual test plan -- + ... Test data setup: + ... Deploy Longhorn on a 3 nodes cluster. + ... Create a RWO volume using the Longhorn storage class + ... + ... Write some data in the volume created and compute the md5sum. + ... Have the volume in attached state. + ... + ... Test steps: + ... Delete the IM of the volume and make sure volume recovers. Check the data as well. + ... Start replica rebuilding for the aforementioned volume, and delete the IM-e while it is rebuilding. Verify the recovered volume. + When Create storageclass longhorn-test with dataEngine=${DATA_ENGINE} + And Create persistentvolumeclaim 0 using RWO volume with longhorn-test storageclass + And Create deployment 0 with persistentvolumeclaim 0 + And Write 500 MB data to file data.txt in deployment 0 + Then Delete instance-manager of deployment 0 volume and wait for recover + + When Delete replica of deployment 0 volume on replica node + And Wait until volume of deployment 0 replica rebuilding started on replica node + Then Delete instance-manager of deployment 0 volume and wait for recover