From 50c91e16b9d2099259efb22b7e3dd9219960e205 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 13 Nov 2023 10:35:48 +0800 Subject: [PATCH] test(negative): implement force drain node Signed-off-by: Yang Chiu --- e2e/keywords/k8s.resource | 23 +++++++++++++ e2e/keywords/workload.resource | 16 ++++++++- e2e/libs/k8s/k8s.py | 51 ++++++++++++++++++++++++++-- e2e/libs/keywords/k8s_keywords.py | 15 +++++++++ e2e/libs/node/node.py | 27 --------------- e2e/libs/utility/utility.py | 7 ++++ e2e/tests/node_delete.robot | 4 +-- e2e/tests/node_drain.robot | 53 ++++++++++++++++++++++++++++++ e2e/tests/replica_rebuilding.robot | 1 - 9 files changed, 163 insertions(+), 34 deletions(-) create mode 100644 e2e/tests/node_drain.robot diff --git a/e2e/keywords/k8s.resource b/e2e/keywords/k8s.resource index c1dccabe5e..75ff214d4e 100644 --- a/e2e/keywords/k8s.resource +++ b/e2e/keywords/k8s.resource @@ -30,3 +30,26 @@ Delete volume of ${workload_kind} ${workload_id} replica node Add deleted node back reboot_node_by_name ${deleted_node} + +Force drain volume of ${workload_kind} ${workload_id} volume node + ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} + ${volume_name} = get_workload_volume_name ${workload_name} + ${drained_node} = get_volume_node ${volume_name} + ${last_volume_node} = get_volume_node ${volume_name} + force_drain_node ${drained_node} + wait_for_all_pods_evicted ${drained_node} + Set Test Variable ${drained_node} + Set Test Variable ${last_volume_node} + +Force drain volume of ${workload_kind} ${workload_id} replica node + ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} + ${volume_name} = get_workload_volume_name ${workload_name} + ${drained_node} = get_replica_node ${volume_name} + ${last_volume_node} = get_volume_node ${volume_name} + force_drain_node ${drained_node} + wait_for_all_pods_evicted ${drained_node} + Set Test Variable ${drained_node} + Set Test Variable ${last_volume_node} + +Uncordon the drained node + uncordon_node ${drained_node} diff --git a/e2e/keywords/workload.resource b/e2e/keywords/workload.resource index 30ee968d2b..60615ebd48 100644 --- a/e2e/keywords/workload.resource +++ b/e2e/keywords/workload.resource @@ -46,7 +46,7 @@ Wait for volume of ${workload_kind} ${workload_id} healthy ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} wait_for_workload_volume_healthy ${workload_name} -Wait until volume of ${workload_kind} ${workload_id} replica rebuidling started on ${replica_locality} +Wait until volume of ${workload_kind} ${workload_id} replica rebuilding started on ${replica_locality} ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} ${volume_name} = get_workload_volume_name ${workload_name} wait_for_replica_rebuilding_to_start_on_node ${volume_name} ${replica_locality} @@ -66,6 +66,20 @@ Wait for volume of ${workload_kind} ${workload_id} attached and healthy ${volume_name} = get_workload_volume_name ${workload_name} wait_for_volume_healthy ${volume_name} +Wait for volume of ${workload_kind} ${workload_id} attached to the original node and degraded + ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} + ${volume_name} = get_workload_volume_name ${workload_name} + wait_for_volume_degraded ${volume_name} + ${volume_node} = get_volume_node ${volume_name} + Should Be Equal ${last_volume_node} ${volume_node} + +Wait for volume of ${workload_kind} ${workload_id} attached to another node and degraded + ${workload_name} = 
generate_name_with_suffix ${workload_kind} ${workload_id} + ${volume_name} = get_workload_volume_name ${workload_name} + wait_for_volume_degraded ${volume_name} + ${volume_node} = get_volume_node ${volume_name} + Should Not Be Equal ${last_volume_node} ${volume_node} + Delete replica of ${workload_kind} ${workload_id} volume on ${replica_locality} ${workload_name} = generate_name_with_suffix ${workload_kind} ${workload_id} ${volume_name} = get_workload_volume_name ${workload_name} diff --git a/e2e/libs/k8s/k8s.py b/e2e/libs/k8s/k8s.py index f8e3100029..ea854b8fa6 100644 --- a/e2e/libs/k8s/k8s.py +++ b/e2e/libs/k8s/k8s.py @@ -1,12 +1,14 @@ import time import subprocess import asyncio +from kubernetes import client from workload.pod import create_pod from workload.pod import delete_pod from workload.pod import new_pod_manifest from workload.constant import IMAGE_UBUNTU - +from utility.utility import subprocess_exec_cmd from utility.utility import logging +from utility.utility import get_retry_count_and_interval async def restart_kubelet(node_name, downtime_in_sec=10): manifest = new_pod_manifest( @@ -24,5 +26,48 @@ async def restart_kubelet(node_name, downtime_in_sec=10): def delete_node(node_name): exec_cmd = ["kubectl", "delete", "node", node_name] - res = subprocess.check_output(exec_cmd) - logging(f"Executed command {exec_cmd} with result {res}") + res = subprocess_exec_cmd(exec_cmd) + +def drain_node(node_name): + exec_cmd = ["kubectl", "drain", node_name, "--ignore-daemonsets", "--delete-emptydir-data"] + res = subprocess_exec_cmd(exec_cmd) + +def force_drain_node(node_name): + exec_cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets", "--delete-emptydir-data"] + res = subprocess_exec_cmd(exec_cmd) + +def cordon_node(node_name): + exec_cmd = ["kubectl", "cordon", node_name] + res = subprocess_exec_cmd(exec_cmd) + +def uncordon_node(node_name): + exec_cmd = ["kubectl", "uncordon", node_name] + res = subprocess_exec_cmd(exec_cmd) + +def get_all_pods_on_node(node_name): + api = client.CoreV1Api() + all_pods = api.list_namespaced_pod(namespace='longhorn-system', field_selector='spec.nodeName=' + node_name) + user_pods = [p for p in all_pods.items if (p.metadata.namespace != 'kube-system')] + return user_pods + +def wait_all_pods_evicted(node_name): + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + pods = get_all_pods_on_node(node_name) + logging(f"Waiting for pods evicted from {node_name} ... 
({i})") + evicted = True + for pod in pods: + # check non DaemonSet Pods are evicted or terminating (deletionTimestamp != None) + pod_type = pod.metadata.owner_references[0].kind + pod_delete_timestamp = pod.metadata.deletion_timestamp + + if (pod_type != 'DaemonSet' and pod_type != 'BackingImageManager') and pod_delete_timestamp == None: + evicted = False + break + + if evicted: + break + + time.sleep(retry_interval) + + assert evicted, 'failed to evict pods' diff --git a/e2e/libs/keywords/k8s_keywords.py b/e2e/libs/keywords/k8s_keywords.py index 38fb4db2ef..625696a8d3 100644 --- a/e2e/libs/keywords/k8s_keywords.py +++ b/e2e/libs/keywords/k8s_keywords.py @@ -2,6 +2,9 @@ from robot.libraries.BuiltIn import BuiltIn from k8s.k8s import restart_kubelet from k8s.k8s import delete_node +from k8s.k8s import drain_node, force_drain_node +from k8s.k8s import cordon_node, uncordon_node +from k8s.k8s import wait_all_pods_evicted from utility.utility import logging @@ -37,3 +40,15 @@ def delete_replica_node(self, volume_name): replica_node = volume_keywords.get_replica_node(volume_name) delete_node(replica_node) return replica_node + + def drain_node(self, node_name): + drain_node(node_name) + + def force_drain_node(self, node_name): + force_drain_node(node_name) + + def uncordon_node(self, node_name): + uncordon_node(node_name) + + def wait_for_all_pods_evicted(self, node_name): + wait_all_pods_evicted(node_name) diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py index fa50907644..22d2374669 100644 --- a/e2e/libs/node/node.py +++ b/e2e/libs/node/node.py @@ -49,33 +49,6 @@ def reset_disks(self, node_name): logging(f"Try to remove disk {disk_name} from node {node_name}") self.update_disks(node_name, disks) - def get_all_pods_on_node(self, node_name): - api = client.CoreV1Api() - all_pods = api.list_namespaced_pod(namespace='longhorn-system', field_selector='spec.nodeName=' + node_name) - user_pods = [p for p in all_pods.items if (p.metadata.namespace != 'kube-system')] - return user_pods - - def wait_all_pods_evicted(self, node_name): - retry_count, retry_interval = get_retry_count_and_interval() - for _ in range(retry_count): - pods = self.get_all_pods_on_node(node_name) - evicted = True - for pod in pods: - # check non DaemonSet Pods are evicted or terminating (deletionTimestamp != None) - pod_type = pod.metadata.owner_references[0].kind - pod_delete_timestamp = pod.metadata.deletion_timestamp - - if pod_type != 'DaemonSet' and pod_delete_timestamp == None: - evicted = False - break - - if evicted: - break - - time.sleep(retry_interval) - - assert evicted, 'failed to evict pods' - def is_accessing_node_by_index(self, node): p = re.compile('node (\d)') if m := p.match(node): diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 7cf76f1141..2201758fe9 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -7,6 +7,7 @@ import signal from robot.api import logger from robot.libraries.BuiltIn import BuiltIn +import subprocess from longhorn import from_env @@ -79,6 +80,12 @@ def get_backupstore(): return os.environ.get('LONGHORN_BACKUPSTORE', "") +def subprocess_exec_cmd(cmd): + res = subprocess.check_output(cmd) + logging(f"Executed command {cmd} with result {res}") + return res + + def wait_for_cluster_ready(): core_api = client.CoreV1Api() retry_count, retry_interval = get_retry_count_and_interval() diff --git a/e2e/tests/node_delete.robot b/e2e/tests/node_delete.robot index 73d795e9b4..e282e1090a 100644 --- a/e2e/tests/node_delete.robot +++ 
b/e2e/tests/node_delete.robot @@ -26,7 +26,7 @@ Delete Volume Node While Replica Rebuilding FOR ${i} IN RANGE ${LOOP_COUNT} When Delete replica of deployment 0 volume on volume node - And Wait until volume of deployment 0 replica rebuidling started on volume node + And Wait until volume of deployment 0 replica rebuilding started on volume node And Delete volume of deployment 0 volume node Then Wait for volume of deployment 0 attached and unknown @@ -44,7 +44,7 @@ Delete Replica Node While Replica Rebuilding FOR ${i} IN RANGE ${LOOP_COUNT} When Delete replica of deployment 0 volume on replica node - And Wait until volume of deployment 0 replica rebuidling started on replica node + And Wait until volume of deployment 0 replica rebuilding started on replica node And Delete volume of deployment 0 replica node Then Wait for volume of deployment 0 attached and degraded diff --git a/e2e/tests/node_drain.robot b/e2e/tests/node_drain.robot new file mode 100644 index 0000000000..16cd98622f --- /dev/null +++ b/e2e/tests/node_drain.robot @@ -0,0 +1,53 @@ +*** Settings *** +Documentation Negative Test Cases + +Resource ../keywords/common.resource +Resource ../keywords/persistentvolumeclaim.resource +Resource ../keywords/k8s.resource +Resource ../keywords/deployment.resource +Resource ../keywords/workload.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** +Force Drain Volume Node While Replica Rebuilding + Given Create persistentvolumeclaim 0 using RWO volume + And Create deployment 0 with persistentvolumeclaim 0 + And Wait for volume of deployment 0 healthy + And Write 2048 MB data to file data.txt in deployment 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica of deployment 0 volume on volume node + And Wait until volume of deployment 0 replica rebuilding started on volume node + And Force drain volume of deployment 0 volume node + + Then Wait for volume of deployment 0 attached to another node and degraded + And Uncordon the drained node + And Wait for volume of deployment 0 attached and healthy + And Wait for deployment 0 pods stable + And Check deployment 0 data in file data.txt is intact + END + +Force Drain Replica Node While Replica Rebuilding + Given Create persistentvolumeclaim 0 using RWO volume + And Create deployment 0 with persistentvolumeclaim 0 + And Wait for volume of deployment 0 healthy + And Write 2048 MB data to file data.txt in deployment 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica of deployment 0 volume on replica node + And Wait until volume of deployment 0 replica rebuilding started on replica node + And Force drain volume of deployment 0 replica node + + Then Wait for volume of deployment 0 attached to the original node and degraded + And Uncordon the drained node + And Wait for volume of deployment 0 attached and healthy + And Wait for deployment 0 pods stable + And Check deployment 0 data in file data.txt is intact + END diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index 5421928957..f5343a8b4d 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -3,7 +3,6 @@ Documentation Negative Test Cases Resource ../keywords/common.resource Resource ../keywords/host.resource -Resource ../keywords/persistentvolumeclaim.resource Resource ../keywords/volume.resource Test Setup Set test environment
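
For context, a minimal sketch of how the drain helpers added in e2e/libs/k8s/k8s.py combine into the force-drain/restore cycle that the new Robot keywords drive. This is not part of the patch; it assumes e2e/libs is on PYTHONPATH, kubectl is reachable, and the kubernetes client is already configured (as the suite's test setup normally does). The function name and node name are hypothetical.

# Sketch only: mirrors what force_drain_node + wait_all_pods_evicted +
# uncordon_node from this patch do when called outside Robot Framework.
from k8s.k8s import force_drain_node, wait_all_pods_evicted, uncordon_node

def force_drain_and_restore(node_name):
    # Runs: kubectl drain <node> --force --ignore-daemonsets --delete-emptydir-data
    force_drain_node(node_name)
    # Polls until every longhorn-system pod on the node that is not owned by a
    # DaemonSet or BackingImageManager is gone or has a deletionTimestamp set.
    wait_all_pods_evicted(node_name)
    # Makes the node schedulable again so the volume can re-attach there.
    uncordon_node(node_name)

force_drain_and_restore("kworker1")  # hypothetical node name

In the tests above, the uncordon step is deliberately deferred: the "Force drain volume of ... node" keywords stop after the eviction wait, and "Uncordon the drained node" only runs once the test has verified the volume is attached (to the original or another node) and degraded.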