from network.network import setup_control_plane_network_latency
from network.network import cleanup_control_plane_network_latency


class network_keywords:
    """Robot Framework keyword library exposing control-plane network
    fault-injection helpers.

    Each keyword delegates to the module-level implementation in
    ``network.network``; both are no-ops when the Robot variable
    ``${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}`` is 0.
    """

    def setup_control_plane_network_latency(self):
        """Inject the configured network latency on every control-plane node."""
        setup_control_plane_network_latency()

    def cleanup_control_plane_network_latency(self):
        """Remove any previously injected control-plane network latency."""
        cleanup_control_plane_network_latency()
from robot.libraries.BuiltIn import BuiltIn
from utility.utility import get_control_plane_nodes
from node_exec import NodeExec


def get_control_plane_node_network_latency_in_ms():
    """Return the configured control-plane latency in milliseconds.

    Reads the Robot variable ``${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}``.
    Defaults to 0 (disabled) when the variable is not set; the original
    code called int(None) and raised TypeError in that case.
    """
    value = BuiltIn().get_variable_value(
        "${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}", "0")
    return int(value)


def setup_control_plane_network_latency():
    """Add egress latency on eth0 of every control-plane node via tc/netem.

    No-op when the configured latency is 0. Raises AssertionError when the
    delay qdisc cannot be observed after installation.
    """
    latency_in_ms = get_control_plane_node_network_latency_in_ms()
    if latency_in_ms == 0:
        return
    for node in get_control_plane_nodes():
        cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms"
        NodeExec.get_instance().issue_cmd(node, cmd)
        # Verify the netem delay qdisc is actually installed on this node.
        res = NodeExec.get_instance().issue_cmd(
            node, "tc qdisc show dev eth0 | grep delay")
        assert res, f"setup control plane network latency failed on node {node}"


def cleanup_control_plane_network_latency():
    """Remove the latency qdisc installed by setup_control_plane_network_latency.

    No-op when the configured latency is 0. Raises AssertionError when the
    delay qdisc is still present after deletion.
    """
    latency_in_ms = get_control_plane_node_network_latency_in_ms()
    if latency_in_ms == 0:
        return
    for node in get_control_plane_nodes():
        NodeExec.get_instance().issue_cmd(node, "tc qdisc del dev eth0 root")
        # After deletion no qdisc line should mention "delay". The original
        # check (`grep -v delay` being non-empty) also passed while the delay
        # was still installed, so it verified nothing.
        res = NodeExec.get_instance().issue_cmd(
            node, "tc qdisc show dev eth0 | grep delay")
        assert not res, \
            f"cleanup control plane network latency failed on node {node}"
def get_control_plane_nodes():
    """Return the sorted names of all control-plane nodes in the cluster.

    A node qualifies when it carries either the modern
    'node-role.kubernetes.io/control-plane' label or the legacy
    'node-role.kubernetes.io/master' label.
    """
    core_api = client.CoreV1Api()
    nodes = []
    for item in core_api.list_node().items:
        # metadata.labels is None (not {}) for an unlabeled node in the
        # kubernetes client, so membership tests need this guard to avoid
        # "argument of type 'NoneType' is not iterable".
        labels = item.metadata.labels or {}
        if 'node-role.kubernetes.io/control-plane' in labels or \
                'node-role.kubernetes.io/master' in labels:
            nodes.append(item.metadata.name)
    return sorted(nodes)
*** Restart Cluster While Workload Heavy Writing diff --git a/e2e/tests/node_reboot.robot b/e2e/tests/node_reboot.robot index 780146415f..fbc87ef62e 100644 --- a/e2e/tests/node_reboot.robot +++ b/e2e/tests/node_reboot.robot @@ -13,6 +13,7 @@ ${LOOP_COUNT} 1 ${RETRY_COUNT} 300 ${RETRY_INTERVAL} 1 ${VOLUME_TYPE} rwo +${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0 *** Test Cases *** Reboot Node One By One While Workload Heavy Writing