test: add different nodes reboot test cases
Signed-off-by: Yang Chiu <[email protected]>
yangchiu authored and David Ko committed Sep 21, 2023
1 parent 304a99d commit 7ae2289
Showing 18 changed files with 330 additions and 139 deletions.
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
@@ -14,6 +14,7 @@ Library ../libs/keywords/workload_keywords.py
Set test environment
init_k8s_api_client
init_node_exec ${TEST NAME}
init_storageclasses
@{volume_list} = Create List
Set Test Variable ${volume_list}
@{deployment_list} = Create List
@@ -27,3 +28,4 @@ Cleanup test resources
cleanup_volumes ${volume_list}
cleanup_deployments ${deployment_list}
cleanup_statefulsets ${statefulset_list}
cleanup_storageclasses
24 changes: 23 additions & 1 deletion e2e/keywords/node.resource
@@ -24,8 +24,30 @@ Reboot volume ${idx} replica node
wait for volume_attached ${item}
END

Reboot node ${idx}
power_off_node ${idx}

Reboot all worker nodes
power_off_all_worker_nodes

Power off node ${idx} for ${power_off_time_in_min} mins
power_off_node ${idx} ${power_off_time_in_min}

Power off all worker nodes for ${power_off_time_in_min} mins
power_off_all_worker_nodes ${power_off_time_in_min}

Wait for longhorn ready
wait_for_all_instance_manager_running
FOR ${deployment} IN @{deployment_list}
wait_for_workload_pod_stable ${deployment}
END
FOR ${statefulset} IN @{statefulset_list}
wait_for_workload_pod_stable ${statefulset}
END

Restart cluster
restart_all_nodes
reboot_all_nodes
wait_for_all_instance_manager_running
FOR ${deployment} IN @{deployment_list}
wait_for_workload_pod_stable ${deployment}
END
8 changes: 8 additions & 0 deletions e2e/keywords/workload.resource
@@ -13,6 +13,14 @@ Create statefulset ${idx} with ${volume_type} volume
${statefulset_name} = create_statefulset ${volume_type}
Insert Into List ${statefulset_list} ${idx} ${statefulset_name}

Create deployment ${idx} with ${volume_type} and ${option} volume
${deployment_name} = create_deployment ${volume_type} ${option}
Insert Into List ${deployment_list} ${idx} ${deployment_name}

Create statefulset ${idx} with ${volume_type} and ${option} volume
${statefulset_name} = create_statefulset ${volume_type} ${option}
Insert Into List ${statefulset_list} ${idx} ${statefulset_name}

Keep writing data to deployment ${idx}
${pod_name} = get_workload_pod_name ${deployment_list}[${idx}]
keep_writing_pod_data ${pod_name}
22 changes: 16 additions & 6 deletions e2e/libs/keywords/node_keywords.py
@@ -1,4 +1,6 @@
from utility.utility import get_test_pod_running_node
from utility.utility import get_node
from utility.utility import wait_for_all_instance_manager_running
from robot.libraries.BuiltIn import BuiltIn
from node import Node
import logging
@@ -10,16 +12,24 @@ def __init__(self):
self.node = Node()

def reboot_volume_node(self, volume_name):
test_pod_running_node = get_test_pod_running_node()
volume_keywords = BuiltIn().get_library_instance('volume_keywords')
volume_node = volume_keywords.get_volume_node(volume_name)
self.node.reboot_node(test_pod_running_node, volume_node)
self.node.reboot_node(volume_node)

def reboot_replica_node(self, volume_name):
test_pod_running_node = get_test_pod_running_node()
volume_keywords = BuiltIn().get_library_instance('volume_keywords')
replica_node = volume_keywords.get_replica_node(volume_name)
self.node.reboot_node(test_pod_running_node, replica_node)
self.node.reboot_node(replica_node)

def restart_all_nodes(self):
self.node.restart_all_nodes()
def power_off_node(self, idx, power_off_time_in_min=1):
node_name = get_node(idx)
self.node.reboot_node(node_name, int(power_off_time_in_min) * 60)

def power_off_all_worker_nodes(self, power_off_time_in_min=1):
self.node.reboot_all_worker_nodes(int(power_off_time_in_min) * 60)

def reboot_all_nodes(self):
self.node.reboot_all_nodes()

def wait_for_all_instance_manager_running(self):
wait_for_all_instance_manager_running()
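The keyword methods above back the new node.resource keywords. A minimal usage sketch, assuming the module is imported directly (in practice Robot Framework instantiates the class and dispatches keyword calls to these methods):

    from node_keywords import node_keywords

    keywords = node_keywords()
    keywords.power_off_node(0, power_off_time_in_min=2)   # stop worker node 0, wait ~2 minutes, start it again
    keywords.power_off_all_worker_nodes(1)                 # same stop/wait/start flow for every worker node
    keywords.reboot_all_nodes()                            # stop and restart the whole cluster, then wait for it to be ready
    keywords.wait_for_all_instance_manager_running()       # block until every node's instance manager is running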
21 changes: 13 additions & 8 deletions e2e/libs/keywords/workload_keywords.py
@@ -6,16 +6,21 @@ class workload_keywords:
def __init__(self):
logging.warn("initialize workload_keywords class")

def create_deployment(self, volume_type="rwo"):
pvc_filepath = f"./templates/workload/{volume_type}_pvc.yaml"
deployment_filepath = f"./templates/workload/deployment_with_{volume_type}_volume.yaml"
pvc_name = create_pvc(pvc_filepath)
deployment_name = create_deployment(deployment_filepath)
def init_storageclasses(self):
create_storageclass('longhorn-test')
create_storageclass('strict-local')

def cleanup_storageclasses(self):
delete_storageclass('longhorn-test')
delete_storageclass('strict-local')

def create_deployment(self, volume_type="rwo", option=""):
pvc_name = create_pvc(volume_type, option)
deployment_name = create_deployment(volume_type, option)
return deployment_name

def create_statefulset(self, volume_type="rwo"):
statefulset_filepath = f"./templates/workload/statefulset_with_{volume_type}_volume.yaml"
statefulset_name = create_statefulset(statefulset_filepath)
def create_statefulset(self, volume_type="rwo", option=""):
statefulset_name = create_statefulset(volume_type, option)
return statefulset_name

def get_workload_pod_name(self, workload_name):
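A rough sketch of how the reworked workload keywords fit together, using only names that appear in this diff (the Robot resource files drive these as keywords; the direct calls here are for illustration only):

    from workload_keywords import workload_keywords

    wk = workload_keywords()
    wk.init_storageclasses()                                     # create the 'longhorn-test' and 'strict-local' storage classes
    deployment = wk.create_deployment("rwx", "")                 # renders pvc.yaml and deployment.yaml with a '-rwx' name suffix
    statefulset = wk.create_statefulset("rwo", "strict-local")   # '-rwo-strict-local' suffix; '-strict-local' appended to the storageClassName
    wk.cleanup_storageclasses()                                  # delete both storage classes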
67 changes: 36 additions & 31 deletions e2e/libs/node/node.py
@@ -4,6 +4,7 @@
import logging
from utility.utility import apply_cr_from_yaml, get_cr
from utility.utility import wait_for_cluster_ready
from utility.utility import list_nodes
import boto3

RETRY_COUNT = 180
@@ -17,53 +18,57 @@ def __init__(self):
self.aws_client = boto3.client('ec2')
#logging.warn(f"describe_instances = {self.aws_client.describe_instances()}")

def restart_all_nodes(self):
def reboot_all_nodes(self, shut_down_time_in_sec=60):
instance_ids = [value for value in self.mapping.values()]
print(instance_ids)

resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"all instances stopped")
time.sleep(60)

time.sleep(shut_down_time_in_sec)

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
wait_for_cluster_ready()
print(f"all instances running")

def reboot_node(self, running_on_node_name, reboot_node_name, shut_down_time_in_sec=10):
with open('/tmp/instance_mapping', 'r') as f:
mapping = yaml.safe_load(f)
reboot_node_id = mapping[reboot_node_name]
def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60):
instance_ids = [self.mapping[reboot_node_name]]
print(instance_ids)

filepath = './templates/litmus/reboot-node.yaml'
with open(filepath, 'r') as f:
data = yaml.safe_load(f)
data['spec']['components']['runner']['nodeSelector']['kubernetes.io/hostname'] = running_on_node_name
data['spec']['experiments'][0]['spec']['components']['nodeSelector']['kubernetes.io/hostname'] = running_on_node_name
data['spec']['experiments'][0]['spec']['components']['env'][1]['value'] = str(shut_down_time_in_sec)
data['spec']['experiments'][0]['spec']['components']['env'][2]['value'] = reboot_node_id
resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} stopped")

with open(filepath, 'w') as file:
yaml.dump(data,file,sort_keys=False)
time.sleep(shut_down_time_in_sec)

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} running")

def reboot_all_worker_nodes(self, shut_down_time_in_sec=60):
instance_ids = [self.mapping[value] for value in list_nodes()]
print(instance_ids)

resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} stopped")

apply_cr_from_yaml(filepath)
time.sleep(shut_down_time_in_sec)

for i in range(RETRY_COUNT):
results = get_cr('litmuschaos.io',
'v1alpha1',
'default',
'chaosresults',
'reboot-node-ec2-terminate-by-id')
if results['status']['experimentStatus']['verdict'] == 'Pass':
break
time.sleep(RETRY_INTERVAL)
api = client.CoreV1Api()
chaosresults_pods = api.list_namespaced_pod(namespace='default', label_selector='name=ec2-terminate-by-id')
logs = api.read_namespaced_pod_log(name=chaosresults_pods.items[0].metadata.name, namespace='default')
logging.info(logs)
assert results['status']['experimentStatus']['verdict'] == 'Pass', \
f"expect verdict = Pass, but get results = {results}"
resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} running")
19 changes: 19 additions & 0 deletions e2e/libs/utility/utility.py
@@ -56,6 +56,25 @@ def wait_for_cluster_ready():
time.sleep(RETRY_INTERVAL)
assert ready, f"expect cluster's ready but it isn't {resp}"

def wait_for_all_instance_manager_running():
core_api = client.CoreV1Api()
longhorn_client = get_longhorn_client()
nodes = list_nodes()

for _ in range(RETRY_COUNTS):
instance_managers = longhorn_client.list_instance_manager()
instance_manager_map = {}
try:
for im in instance_managers:
if im.currentState == "running":
instance_manager_map[im.nodeID] = im
if len(instance_manager_map) == len(nodes):
break
time.sleep(RETRY_INTERVAL)
except Exception as e:
print(f"exception when get instance manager state: {e}")
assert len(instance_manager_map) == len(nodes), f"expect all instance managers running: {instance_managers}"

def get_node(index):
nodes = list_nodes()
return nodes[int(index)]
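A short usage sketch of the new utility helpers (names from the diff; RETRY_COUNTS and RETRY_INTERVAL are assumed to be defined earlier in utility.py):

    node_name = get_node(1)                  # name of the node at index 1 of list_nodes()
    wait_for_all_instance_manager_running()  # retries until every node has a running instance manager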
69 changes: 64 additions & 5 deletions e2e/libs/workload/workload.py
@@ -11,10 +11,48 @@
POD_WAIT_TIMEOUT = 240
WAIT_FOR_POD_STABLE_MAX_RETRY = 90

def create_deployment(filepath):
def get_name_suffix(*args):
suffix = ""
for arg in args:
if arg:
suffix += f"-{arg}"
return suffix

def create_storageclass(name):
if name == 'strict-local':
filepath = "./templates/workload/strict_local_storageclass.yaml"
else:
filepath = "./templates/workload/storageclass.yaml"

with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
api = client.StorageV1Api()
try:
api.create_storage_class(body=manifest_dict)
except Exception as e:
print(f"Exception when create storageclass: {e}")

def delete_storageclass(name):
api = client.StorageV1Api()
try:
api.delete_storage_class(name, grace_period_seconds=0)
except ApiException as e:
assert e.status == 404

def create_deployment(volume_type, option):
filepath = f"./templates/workload/deployment.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct workload name
manifest_dict['metadata']['name'] += suffix
manifest_dict['metadata']['labels']['app'] += suffix
manifest_dict['spec']['selector']['matchLabels']['app'] += suffix
manifest_dict['spec']['template']['metadata']['labels']['app'] += suffix
# correct claim name
manifest_dict['spec']['template']['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] += suffix
api = client.AppsV1Api()
try:
deployment = api.create_namespaced_deployment(
@@ -64,10 +102,23 @@ def delete_deployment(name, namespace='default'):
time.sleep(RETRY_INTERVAL)
assert deleted

def create_statefulset(filepath):
def create_statefulset(volume_type, option):
filepath = "./templates/workload/statefulset.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct workload name
manifest_dict['metadata']['name'] += suffix
manifest_dict['spec']['selector']['matchLabels']['app'] += suffix
manifest_dict['spec']['serviceName'] += suffix
manifest_dict['spec']['template']['metadata']['labels']['app'] += suffix
# correct storageclass name
if option:
manifest_dict['spec']['volumeClaimTemplates'][0]['spec']['storageClassName'] += f"-{option}"
# correct access mode
if volume_type == 'rwx':
manifest_dict['spec']['volumeClaimTemplates'][0]['spec']['accessModes'][0] = 'ReadWriteMany'
api = client.AppsV1Api()
try:
statefulset = api.create_namespaced_stateful_set(
@@ -116,10 +167,20 @@ def delete_statefulset(name, namespace='default'):
time.sleep(RETRY_INTERVAL)
assert deleted

def create_pvc(filepath):
def create_pvc(volume_type, option):
filepath = "./templates/workload/pvc.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct pvc name
manifest_dict['metadata']['name'] += suffix
# correct storageclass name
if option:
manifest_dict['spec']['storageClassName'] += f"-{option}"
# correct access mode
if volume_type == 'rwx':
manifest_dict['spec']['accessModes'][0] = 'ReadWriteMany'
api = client.CoreV1Api()
try:
pvc = api.create_namespaced_persistent_volume_claim(
@@ -129,10 +190,8 @@ def create_pvc(filepath):
print(f"Exception when create pvc: {e}")
return pvc.metadata.name


def delete_pvc(name, namespace='default'):
api = client.CoreV1Api()

try:
api.delete_namespaced_persistent_volume_claim(
name=name,
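To make the suffix scheme concrete, a small illustration based on get_name_suffix and the template patching above (the 'test-deployment' and 'test-pvc' base names come from the deployment template diff that follows; the PVC template's own metadata name is assumed to match that claimName):

    get_name_suffix("rwo", "")              # -> "-rwo"
    get_name_suffix("rwx", "strict-local")  # -> "-rwx-strict-local"
    # So create_pvc("rwx", "strict-local") plus create_deployment("rwx", "strict-local")
    # yield a Deployment named "test-deployment-rwx-strict-local" whose pod mounts the
    # PVC "test-pvc-rwx-strict-local" with accessModes ["ReadWriteMany"].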
@@ -1,19 +1,19 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-deployment-rwo
name: test-deployment
labels:
app: test-deployment-rwo
app: test-deployment
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: test-deployment-rwo
app: test-deployment
template:
metadata:
labels:
app: test-deployment-rwo
app: test-deployment
spec:
containers:
- image: busybox
Expand All @@ -26,4 +26,4 @@ spec:
volumes:
- name: pod-data
persistentVolumeClaim:
claimName: test-rwo-pvc
claimName: test-pvc