Skip to content

Commit

Permalink
test: inject control plane node network latency
Browse files Browse the repository at this point in the history
Signed-off-by: Yang Chiu <[email protected]>
  • Loading branch information
yangchiu authored and c3y1huang committed Oct 27, 2023
1 parent d038a55 commit a43b84a
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 2 deletions.
3 changes: 3 additions & 0 deletions e2e/keywords/common.resource
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Library ../libs/keywords/node_keywords.py
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/recurring_job_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/network_keywords.py


*** Variables ***
Expand All @@ -22,8 +23,10 @@ Set test environment
Set Test Variable ${deployment_list}
@{statefulset_list} = Create List
Set Test Variable ${statefulset_list}
setup_control_plane_network_latency

Cleanup test resources
cleanup_control_plane_network_latency
cleanup_node_exec
cleanup_stress_helper
cleanup_recurring_jobs ${volume_list}
Expand Down
2 changes: 2 additions & 0 deletions e2e/keywords/node.resource
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Documentation Physical Node Keywords
Library ../libs/keywords/volume_keywords.py
Library ../libs/keywords/node_keywords.py
Library ../libs/keywords/workload_keywords.py
Library ../libs/keywords/network_keywords.py

*** Keywords ***
During replica rebuilding, reboot volume node
Expand Down Expand Up @@ -57,6 +58,7 @@ Wait for longhorn ready

Restart cluster
reboot_all_nodes
setup_control_plane_network_latency
wait_for_all_instance_manager_running
FOR ${deployment} IN @{deployment_list}
wait_for_workload_pod_stable ${deployment}
Expand Down
10 changes: 10 additions & 0 deletions e2e/libs/keywords/network_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from network.network import setup_control_plane_network_latency
from network.network import cleanup_control_plane_network_latency

class network_keywords:
    """Robot Framework keyword library exposing control-plane network
    fault-injection helpers as test keywords.

    Each keyword is a thin pass-through to the module-level helper of the
    same name in ``network.network``.
    """

    def setup_control_plane_network_latency(self):
        """Inject the configured egress latency on every control plane node."""
        return setup_control_plane_network_latency()

    def cleanup_control_plane_network_latency(self):
        """Remove any latency qdisc previously injected on control plane nodes."""
        return cleanup_control_plane_network_latency()
29 changes: 29 additions & 0 deletions e2e/libs/network/network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from robot.libraries.BuiltIn import BuiltIn
from utility.utility import get_control_plane_nodes
from node_exec import NodeExec

def get_control_plane_node_network_latency_in_ms():
    """Return the latency (ms) to inject on control plane nodes.

    Reads the Robot Framework variable
    ``${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}``. Falls back to 0
    (no latency injection) when the variable is not defined, instead of
    crashing with ``int(None)`` as the unguarded lookup would.

    Returns:
        int: latency in milliseconds; 0 disables injection.
    """
    # get_variable_value returns its second argument when the variable
    # is missing, so suites that never declare it behave as "no latency".
    value = BuiltIn().get_variable_value(
        "${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}", 0)
    return int(value)

def setup_control_plane_network_latency():
    """Add an egress latency qdisc on eth0 of every control plane node.

    No-op when the configured latency is 0. After installing the netem
    delay on each node, verifies it by grepping ``tc qdisc show`` output
    for "delay" and asserts the check produced output.
    """
    latency_in_ms = get_control_plane_node_network_latency_in_ms()
    if latency_in_ms == 0:
        # Latency injection disabled for this suite.
        return
    for node in get_control_plane_nodes():
        # `replace` is idempotent: it installs the qdisc or updates an
        # existing one, so repeated setup calls are safe.
        cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms"
        NodeExec.get_instance().issue_cmd(node, cmd)
        # Verify the delay qdisc is actually in place on this node.
        res = NodeExec.get_instance().issue_cmd(
            node, "tc qdisc show dev eth0 | grep delay")
        assert res, "setup control plane network latency failed"

def cleanup_control_plane_network_latency():
    """Remove the injected latency qdisc from every control plane node.

    No-op when the configured latency is 0 (nothing was injected).
    After deleting the root qdisc, runs ``tc qdisc show`` filtered with
    ``grep -v delay`` and asserts it produced output.

    NOTE(review): ``grep -v delay`` only proves some non-delay line is
    printed; it does not prove the delay line is gone — confirm this is
    the intended check.
    """
    latency_in_ms = get_control_plane_node_network_latency_in_ms()
    if latency_in_ms == 0:
        # Nothing was set up, so there is nothing to tear down.
        return
    for node in get_control_plane_nodes():
        NodeExec.get_instance().issue_cmd(node, "tc qdisc del dev eth0 root")
        # Sanity check that the remaining qdisc config no longer matches
        # on the filtered output (see NOTE above about its weakness).
        res = NodeExec.get_instance().issue_cmd(
            node, "tc qdisc show dev eth0 | grep -v delay")
        assert res, "cleanup control plane network failed"
36 changes: 34 additions & 2 deletions e2e/libs/node_exec/node_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,20 @@ def issue_cmd(self, node_name, cmd):
stdout=True,
tty=False
)
logging(f"Issued command: {cmd} with result {res}")
logging(f"Issued command: {cmd} on {node_name} with result {res}")
return res

def launch_pod(self, node_name):
if node_name in self.node_exec_pod:
return self.node_exec_pod[node_name]
for i in range(DEFAULT_POD_TIMEOUT):
pod = self.core_api.read_namespaced_pod(
name=node_name,
namespace=self.namespace
)
if pod is not None and pod.status.phase == 'Running':
break
time.sleep(DEFAULT_POD_INTERVAL)
return pod
else:
pod_manifest = {
'apiVersion': 'v1',
Expand All @@ -114,6 +122,30 @@ def launch_pod(self, node_name):
}
}
},
"tolerations": [{
"key": "node-role.kubernetes.io/master",
"operator": "Equal",
"value": "true",
"effect": "NoSchedule"
},
{
"key": "node-role.kubernetes.io/master",
"operator": "Equal",
"value": "true",
"effect": "NoExecute"
},
{
"key": "node-role.kubernetes.io/control-plane",
"operator": "Equal",
"value": "true",
"effect": "NoSchedule"
},
{
"key": "node-role.kubernetes.io/control-plane",
"operator": "Equal",
"value": "true",
"effect": "NoExecute"
}],
'containers': [{
'image': 'ubuntu:16.04',
'imagePullPolicy': 'IfNotPresent',
Expand Down
10 changes: 10 additions & 0 deletions e2e/libs/utility/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ def list_nodes():
nodes.append(item.metadata.name)
return sorted(nodes)

def get_control_plane_nodes():
    """Return the sorted names of all control plane nodes in the cluster.

    A node counts as control plane when it carries either the modern
    ``node-role.kubernetes.io/control-plane`` label or the legacy
    ``node-role.kubernetes.io/master`` label.

    Returns:
        list[str]: node names, sorted alphabetically.
    """
    core_api = client.CoreV1Api()
    nodes = []
    for item in core_api.list_node().items:
        # metadata.labels is None (not {}) on a node with no labels;
        # guard so the membership test cannot raise TypeError.
        labels = item.metadata.labels or {}
        if 'node-role.kubernetes.io/control-plane' in labels or \
                'node-role.kubernetes.io/master' in labels:
            nodes.append(item.metadata.name)
    return sorted(nodes)

def wait_for_cluster_ready():
core_api = client.CoreV1Api()
retry_count, retry_interval = get_retry_count_and_interval()
Expand Down
1 change: 1 addition & 0 deletions e2e/tests/cluster_restart.robot
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Test Teardown Cleanup test resources
${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0

*** Test Cases ***
Restart Cluster While Workload Heavy Writing
Expand Down
1 change: 1 addition & 0 deletions e2e/tests/node_reboot.robot
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ ${LOOP_COUNT} 1
${RETRY_COUNT} 300
${RETRY_INTERVAL} 1
${VOLUME_TYPE} rwo
${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0

*** Test Cases ***
Reboot Node One By One While Workload Heavy Writing
Expand Down

0 comments on commit a43b84a

Please sign in to comment.