test: add different nodes reboot test cases
Signed-off-by: Yang Chiu <[email protected]>
yangchiu authored and David Ko committed Sep 21, 2023
1 parent 304a99d commit 7ae2289
Showing 18 changed files with 330 additions and 139 deletions.
2 changes: 2 additions & 0 deletions e2e/keywords/common.resource
@@ -14,6 +14,7 @@ Library ../libs/keywords/workload_keywords.py
Set test environment
init_k8s_api_client
init_node_exec ${TEST NAME}
init_storageclasses
@{volume_list} = Create List
Set Test Variable ${volume_list}
@{deployment_list} = Create List
@@ -27,3 +28,4 @@ Cleanup test resources
cleanup_volumes ${volume_list}
cleanup_deployments ${deployment_list}
cleanup_statefulsets ${statefulset_list}
cleanup_storageclasses
24 changes: 23 additions & 1 deletion e2e/keywords/node.resource
@@ -24,8 +24,30 @@ Reboot volume ${idx} replica node
wait for volume_attached ${item}
END

Reboot node ${idx}
power_off_node ${idx}

Reboot all worker nodes
power_off_all_worker_nodes

Power off node ${idx} for ${power_off_time_in_min} mins
power_off_node ${idx} ${power_off_time_in_min}

Power off all worker nodes for ${power_off_time_in_min} mins
power_off_all_worker_nodes ${power_off_time_in_min}

Wait for longhorn ready
wait_for_all_instance_manager_running
FOR ${deployment} IN @{deployment_list}
wait_for_workload_pod_stable ${deployment}
END
FOR ${statefulset} IN @{statefulset_list}
wait_for_workload_pod_stable ${statefulset}
END

Restart cluster
restart_all_nodes
reboot_all_nodes
wait_for_all_instance_manager_running
FOR ${deployment} IN @{deployment_list}
wait_for_workload_pod_stable ${deployment}
END
8 changes: 8 additions & 0 deletions e2e/keywords/workload.resource
@@ -13,6 +13,14 @@ Create statefulset ${idx} with ${volume_type} volume
${statefulset_name} = create_statefulset ${volume_type}
Insert Into List ${statefulset_list} ${idx} ${statefulset_name}

Create deployment ${idx} with ${volume_type} and ${option} volume
${deployment_name} = create_deployment ${volume_type} ${option}
Insert Into List ${deployment_list} ${idx} ${deployment_name}

Create statefulset ${idx} with ${volume_type} and ${option} volume
${statefulset_name} = create_statefulset ${volume_type} ${option}
Insert Into List ${statefulset_list} ${idx} ${statefulset_name}

Keep writing data to deployment ${idx}
${pod_name} = get_workload_pod_name ${deployment_list}[${idx}]
keep_writing_pod_data ${pod_name}
22 changes: 16 additions & 6 deletions e2e/libs/keywords/node_keywords.py
@@ -1,4 +1,6 @@
from utility.utility import get_test_pod_running_node
from utility.utility import get_node
from utility.utility import wait_for_all_instance_manager_running
from robot.libraries.BuiltIn import BuiltIn
from node import Node
import logging
@@ -10,16 +12,24 @@ def __init__(self):
self.node = Node()

def reboot_volume_node(self, volume_name):
test_pod_running_node = get_test_pod_running_node()
volume_keywords = BuiltIn().get_library_instance('volume_keywords')
volume_node = volume_keywords.get_volume_node(volume_name)
self.node.reboot_node(test_pod_running_node, volume_node)
self.node.reboot_node(volume_node)

def reboot_replica_node(self, volume_name):
test_pod_running_node = get_test_pod_running_node()
volume_keywords = BuiltIn().get_library_instance('volume_keywords')
replica_node = volume_keywords.get_replica_node(volume_name)
self.node.reboot_node(test_pod_running_node, replica_node)
self.node.reboot_node(replica_node)

def restart_all_nodes(self):
self.node.restart_all_nodes()
def power_off_node(self, idx, power_off_time_in_min=1):
node_name = get_node(idx)
self.node.reboot_node(node_name, int(power_off_time_in_min) * 60)

def power_off_all_worker_nodes(self, power_off_time_in_min=1):
self.node.reboot_all_worker_nodes(int(power_off_time_in_min) * 60)

def reboot_all_nodes(self):
self.node.reboot_all_nodes()

def wait_for_all_instance_manager_running(self):
wait_for_all_instance_manager_running()
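The keyword methods above back the new node.resource keywords. A minimal usage sketch, assuming the module is imported directly (in practice Robot Framework instantiates the class and dispatches keyword calls to these methods):

    from node_keywords import node_keywords

    keywords = node_keywords()
    keywords.power_off_node(0, power_off_time_in_min=2)   # stop worker node 0, wait ~2 minutes, start it again
    keywords.power_off_all_worker_nodes(1)                 # same stop/wait/start flow for every worker node
    keywords.reboot_all_nodes()                            # stop and restart the whole cluster, then wait for it to be ready
    keywords.wait_for_all_instance_manager_running()       # block until every node's instance manager is running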
21 changes: 13 additions & 8 deletions e2e/libs/keywords/workload_keywords.py
@@ -6,16 +6,21 @@ class workload_keywords:
def __init__(self):
logging.warn("initialize workload_keywords class")

def create_deployment(self, volume_type="rwo"):
pvc_filepath = f"./templates/workload/{volume_type}_pvc.yaml"
deployment_filepath = f"./templates/workload/deployment_with_{volume_type}_volume.yaml"
pvc_name = create_pvc(pvc_filepath)
deployment_name = create_deployment(deployment_filepath)
def init_storageclasses(self):
create_storageclass('longhorn-test')
create_storageclass('strict-local')

def cleanup_storageclasses(self):
delete_storageclass('longhorn-test')
delete_storageclass('strict-local')

def create_deployment(self, volume_type="rwo", option=""):
pvc_name = create_pvc(volume_type, option)
deployment_name = create_deployment(volume_type, option)
return deployment_name

def create_statefulset(self, volume_type="rwo"):
statefulset_filepath = f"./templates/workload/statefulset_with_{volume_type}_volume.yaml"
statefulset_name = create_statefulset(statefulset_filepath)
def create_statefulset(self, volume_type="rwo", option=""):
statefulset_name = create_statefulset(volume_type, option)
return statefulset_name

def get_workload_pod_name(self, workload_name):
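A rough sketch of how the reworked workload keywords fit together, using only names that appear in this diff (the Robot resource files drive these as keywords; the direct calls here are for illustration only):

    from workload_keywords import workload_keywords

    wk = workload_keywords()
    wk.init_storageclasses()                                     # create the 'longhorn-test' and 'strict-local' storage classes
    deployment = wk.create_deployment("rwx", "")                 # renders pvc.yaml and deployment.yaml with a '-rwx' name suffix
    statefulset = wk.create_statefulset("rwo", "strict-local")   # '-rwo-strict-local' suffix; '-strict-local' appended to the storageClassName
    wk.cleanup_storageclasses()                                  # delete both storage classes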
67 changes: 36 additions & 31 deletions e2e/libs/node/node.py
@@ -4,6 +4,7 @@
import logging
from utility.utility import apply_cr_from_yaml, get_cr
from utility.utility import wait_for_cluster_ready
from utility.utility import list_nodes
import boto3

RETRY_COUNT = 180
@@ -17,53 +18,57 @@ def __init__(self):
self.aws_client = boto3.client('ec2')
#logging.warn(f"describe_instances = {self.aws_client.describe_instances()}")

def restart_all_nodes(self):
def reboot_all_nodes(self, shut_down_time_in_sec=60):
instance_ids = [value for value in self.mapping.values()]
print(instance_ids)

resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"all instances stopped")
time.sleep(60)

time.sleep(shut_down_time_in_sec)

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
wait_for_cluster_ready()
print(f"all instances running")

def reboot_node(self, running_on_node_name, reboot_node_name, shut_down_time_in_sec=10):
with open('/tmp/instance_mapping', 'r') as f:
mapping = yaml.safe_load(f)
reboot_node_id = mapping[reboot_node_name]
def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60):
instance_ids = [self.mapping[reboot_node_name]]
print(instance_ids)

filepath = './templates/litmus/reboot-node.yaml'
with open(filepath, 'r') as f:
data = yaml.safe_load(f)
data['spec']['components']['runner']['nodeSelector']['kubernetes.io/hostname'] = running_on_node_name
data['spec']['experiments'][0]['spec']['components']['nodeSelector']['kubernetes.io/hostname'] = running_on_node_name
data['spec']['experiments'][0]['spec']['components']['env'][1]['value'] = str(shut_down_time_in_sec)
data['spec']['experiments'][0]['spec']['components']['env'][2]['value'] = reboot_node_id
resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} stopped")

with open(filepath, 'w') as file:
yaml.dump(data,file,sort_keys=False)
time.sleep(shut_down_time_in_sec)

resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} running")

def reboot_all_worker_nodes(self, shut_down_time_in_sec=60):
instance_ids = [self.mapping[value] for value in list_nodes()]
print(instance_ids)

resp = self.aws_client.stop_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_stopped')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} stopped")

apply_cr_from_yaml(filepath)
time.sleep(shut_down_time_in_sec)

for i in range(RETRY_COUNT):
results = get_cr('litmuschaos.io',
'v1alpha1',
'default',
'chaosresults',
'reboot-node-ec2-terminate-by-id')
if results['status']['experimentStatus']['verdict'] == 'Pass':
break
time.sleep(RETRY_INTERVAL)
api = client.CoreV1Api()
chaosresults_pods = api.list_namespaced_pod(namespace='default', label_selector='name=ec2-terminate-by-id')
logs = api.read_namespaced_pod_log(name=chaosresults_pods.items[0].metadata.name, namespace='default')
logging.info(logs)
assert results['status']['experimentStatus']['verdict'] == 'Pass', \
f"expect verdict = Pass, but get results = {results}"
resp = self.aws_client.start_instances(InstanceIds=instance_ids)
print(resp)
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
print(f"instances {instance_ids} running")
19 changes: 19 additions & 0 deletions e2e/libs/utility/utility.py
@@ -56,6 +56,25 @@ def wait_for_cluster_ready():
time.sleep(RETRY_INTERVAL)
assert ready, f"expect cluster's ready but it isn't {resp}"

def wait_for_all_instance_manager_running():
core_api = client.CoreV1Api()
longhorn_client = get_longhorn_client()
nodes = list_nodes()

for _ in range(RETRY_COUNTS):
instance_managers = longhorn_client.list_instance_manager()
instance_manager_map = {}
try:
for im in instance_managers:
if im.currentState == "running":
instance_manager_map[im.nodeID] = im
if len(instance_manager_map) == len(nodes):
break
time.sleep(RETRY_INTERVAL)
except Exception as e:
print(f"exception when get instance manager state: {e}")
assert len(instance_manager_map) == len(nodes), f"expect all instance managers running: {instance_managers}"

def get_node(index):
nodes = list_nodes()
return nodes[int(index)]
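A short usage sketch of the new utility helpers (names from the diff; RETRY_COUNTS and RETRY_INTERVAL are assumed to be defined earlier in utility.py):

    node_name = get_node(1)                  # name of the node at index 1 of list_nodes()
    wait_for_all_instance_manager_running()  # retries until every node has a running instance manager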
69 changes: 64 additions & 5 deletions e2e/libs/workload/workload.py
@@ -11,10 +11,48 @@
POD_WAIT_TIMEOUT = 240
WAIT_FOR_POD_STABLE_MAX_RETRY = 90

def create_deployment(filepath):
def get_name_suffix(*args):
suffix = ""
for arg in args:
if arg:
suffix += f"-{arg}"
return suffix

def create_storageclass(name):
if name == 'strict-local':
filepath = "./templates/workload/strict_local_storageclass.yaml"
else:
filepath = "./templates/workload/storageclass.yaml"

with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
api = client.StorageV1Api()
try:
api.create_storage_class(body=manifest_dict)
except Exception as e:
print(f"Exception when create storageclass: {e}")

def delete_storageclass(name):
api = client.StorageV1Api()
try:
api.delete_storage_class(name, grace_period_seconds=0)
except ApiException as e:
assert e.status == 404

def create_deployment(volume_type, option):
filepath = f"./templates/workload/deployment.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct workload name
manifest_dict['metadata']['name'] += suffix
manifest_dict['metadata']['labels']['app'] += suffix
manifest_dict['spec']['selector']['matchLabels']['app'] += suffix
manifest_dict['spec']['template']['metadata']['labels']['app'] += suffix
# correct claim name
manifest_dict['spec']['template']['spec']['volumes'][0]['persistentVolumeClaim']['claimName'] += suffix
api = client.AppsV1Api()
try:
deployment = api.create_namespaced_deployment(
@@ -64,10 +102,23 @@ def delete_deployment(name, namespace='default'):
time.sleep(RETRY_INTERVAL)
assert deleted

def create_statefulset(filepath):
def create_statefulset(volume_type, option):
filepath = "./templates/workload/statefulset.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct workload name
manifest_dict['metadata']['name'] += suffix
manifest_dict['spec']['selector']['matchLabels']['app'] += suffix
manifest_dict['spec']['serviceName'] += suffix
manifest_dict['spec']['template']['metadata']['labels']['app'] += suffix
# correct storageclass name
if option:
manifest_dict['spec']['volumeClaimTemplates'][0]['spec']['storageClassName'] += f"-{option}"
# correct access mode
if volume_type == 'rwx':
manifest_dict['spec']['volumeClaimTemplates'][0]['spec']['accessModes'][0] = 'ReadWriteMany'
api = client.AppsV1Api()
try:
statefulset = api.create_namespaced_stateful_set(
@@ -116,10 +167,20 @@ def delete_statefulset(name, namespace='default'):
time.sleep(RETRY_INTERVAL)
assert deleted

def create_pvc(filepath):
def create_pvc(volume_type, option):
filepath = "./templates/workload/pvc.yaml"
with open(filepath, 'r') as f:
namespace = 'default'
manifest_dict = yaml.safe_load(f)
suffix = get_name_suffix(volume_type, option)
# correct pvc name
manifest_dict['metadata']['name'] += suffix
# correct storageclass name
if option:
manifest_dict['spec']['storageClassName'] += f"-{option}"
# correct access mode
if volume_type == 'rwx':
manifest_dict['spec']['accessModes'][0] = 'ReadWriteMany'
api = client.CoreV1Api()
try:
pvc = api.create_namespaced_persistent_volume_claim(
@@ -129,10 +190,8 @@ def create_pvc(filepath):
print(f"Exception when create pvc: {e}")
return pvc.metadata.name


def delete_pvc(name, namespace='default'):
api = client.CoreV1Api()

try:
api.delete_namespaced_persistent_volume_claim(
name=name,
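To make the suffix scheme concrete, a small illustration based on get_name_suffix and the template patching above (the 'test-deployment' and 'test-pvc' base names come from the deployment template diff that follows; the PVC template's own metadata name is assumed to match that claimName):

    get_name_suffix("rwo", "")              # -> "-rwo"
    get_name_suffix("rwx", "strict-local")  # -> "-rwx-strict-local"
    # So create_pvc("rwx", "strict-local") plus create_deployment("rwx", "strict-local")
    # yield a Deployment named "test-deployment-rwx-strict-local" whose pod mounts the
    # PVC "test-pvc-rwx-strict-local" with accessModes ["ReadWriteMany"].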
@@ -1,19 +1,19 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: test-deployment-rwo
name: test-deployment
labels:
app: test-deployment-rwo
app: test-deployment
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: test-deployment-rwo
app: test-deployment
template:
metadata:
labels:
app: test-deployment-rwo
app: test-deployment
spec:
containers:
- image: busybox
Expand All @@ -26,4 +26,4 @@ spec:
volumes:
- name: pod-data
persistentVolumeClaim:
claimName: test-rwo-pvc
claimName: test-pvc