From ce7352e877b85c059f738143f7867cbeeb98f053 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Fri, 20 Sep 2024 12:51:36 +0800 Subject: [PATCH] test(robot): fix node_exec rootfs mount point missing after node reboot Signed-off-by: Yang Chiu --- e2e/keywords/common.resource | 10 +- e2e/libs/backup/rest.py | 3 +- e2e/libs/keywords/common_keywords.py | 8 - e2e/libs/network/network.py | 9 +- e2e/libs/node_exec/node_exec.py | 264 ++++++++++---------------- e2e/libs/snapshot/rest.py | 2 +- e2e/libs/utility/utility.py | 35 ++++ e2e/libs/volume/crd.py | 39 ++-- e2e/libs/volume/rest.py | 10 +- e2e/libs/volume/volume.py | 7 +- e2e/tests/regression/test_basic.robot | 1 + 11 files changed, 173 insertions(+), 215 deletions(-) diff --git a/e2e/keywords/common.resource b/e2e/keywords/common.resource index 17e2902876..430476fd10 100644 --- a/e2e/keywords/common.resource +++ b/e2e/keywords/common.resource @@ -24,7 +24,6 @@ Library ../libs/keywords/k8s_keywords.py *** Keywords *** Set test environment init_k8s_api_client - init_node_exec ${SUITE NAME.rsplit('.')[1]} setup_control_plane_network_latency set_backupstore @@ -36,10 +35,13 @@ Set test environment END Cleanup test resources + ${variables}= Get Variables + IF "\${powered_off_node}" in "$variables" + power_on_node_by_name ${powered_off_node} + END uncordon_all_nodes cleanup_control_plane_network_latency reset_node_schedule - cleanup_node_exec cleanup_stress_helper cleanup_recurringjobs cleanup_deployments @@ -53,7 +55,3 @@ Cleanup test resources cleanup_backing_images cleanup_engine_images reset_backupstore - -Cleanup test resources include off nodes - Power on off node - Cleanup test resources diff --git a/e2e/libs/backup/rest.py b/e2e/libs/backup/rest.py index 00f10360bd..b82308093b 100644 --- a/e2e/libs/backup/rest.py +++ b/e2e/libs/backup/rest.py @@ -2,7 +2,6 @@ from utility.utility import logging from utility.utility import get_longhorn_client from utility.utility import get_retry_count_and_interval -from node_exec import NodeExec from volume import Rest as RestVolume from snapshot import Snapshot as RestSnapshot import time @@ -11,7 +10,7 @@ class Rest(Base): def __init__(self): - self.volume = RestVolume(NodeExec.get_instance()) + self.volume = RestVolume() self.snapshot = RestSnapshot() self.retry_count, self.retry_interval = get_retry_count_and_interval() diff --git a/e2e/libs/keywords/common_keywords.py b/e2e/libs/keywords/common_keywords.py index b90d99f1b8..0275b1af46 100644 --- a/e2e/libs/keywords/common_keywords.py +++ b/e2e/libs/keywords/common_keywords.py @@ -1,4 +1,3 @@ -from node_exec import NodeExec from node import Node from utility.utility import init_k8s_api_client from utility.utility import generate_name_with_suffix @@ -12,13 +11,6 @@ def __init__(self): def init_k8s_api_client(self): init_k8s_api_client() - def init_node_exec(self, test_name): - namespace = test_name.lower().replace(' ', '-')[:63] - NodeExec.get_instance().set_namespace(namespace) - - def cleanup_node_exec(self): - NodeExec.get_instance().cleanup() - def generate_name_with_suffix(self, kind, suffix): return generate_name_with_suffix(kind, suffix) diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py index e88b59169b..3397d25197 100644 --- a/e2e/libs/network/network.py +++ b/e2e/libs/network/network.py @@ -9,7 +9,6 @@ from utility.constant import LABEL_TEST from utility.constant import LABEL_TEST_VALUE -from utility.utility import logging from utility.utility import pod_exec from workload.pod import create_pod @@ -30,9 +29,9 @@ def 
setup_control_plane_network_latency(): control_plane_nodes = Node.list_node_names_by_role("control-plane") for control_plane_node in control_plane_nodes: cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms" - res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + res = NodeExec(control_plane_node).issue_cmd(cmd) cmd = f"tc qdisc show dev eth0 | grep delay" - res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + res = NodeExec(control_plane_node).issue_cmd(cmd) assert res, "setup control plane network latency failed" @@ -42,9 +41,9 @@ def cleanup_control_plane_network_latency(): control_plane_nodes = Node.list_node_names_by_role("control-plane") for control_plane_node in control_plane_nodes: cmd = "tc qdisc del dev eth0 root" - res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + res = NodeExec(control_plane_node).issue_cmd(cmd) cmd = f"tc qdisc show dev eth0 | grep -v delay" - res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + res = NodeExec(control_plane_node).issue_cmd(cmd) assert res, "cleanup control plane network failed" async def disconnect_node_network(node_name, disconnection_time_in_sec=10): diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index 83732c29e3..1f492b16f5 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -2,7 +2,6 @@ import time from kubernetes import client -from kubernetes.client.rest import ApiException from kubernetes.stream import stream from node_exec.constant import DEFAULT_POD_INTERVAL @@ -10,66 +9,25 @@ from node_exec.constant import HOST_ROOTFS from utility.utility import logging -from utility.utility import wait_delete_ns - -from workload.pod import wait_delete_pod +from utility.utility import delete_pod, get_pod class NodeExec: - _instance = None - - @staticmethod - def get_instance(): - if NodeExec._instance is None: - NodeExec() - return NodeExec._instance - - def __init__(self): - if NodeExec._instance is not None: - raise Exception('only one NodeExec instance can exist') - else: - self.node_exec_pod = {} - NodeExec._instance = self - - def set_namespace(self, namespace): + def __init__(self, node_name): + self.node_name = node_name self.core_api = client.CoreV1Api() - self.namespace = namespace - self.node_exec_pod = {} - namespace_manifest = { - 'apiVersion': 'v1', - 'kind': 'Namespace', - 'metadata': { - 'name': self.namespace - } - } - self.core_api.create_namespace( - body=namespace_manifest - ) - logging(f"Created namespace {namespace}") + self.cleanup() + self.pod = self.launch_pod() def cleanup(self): - for pod in self.node_exec_pod.values(): - logging(f"Cleaning up pod {pod.metadata.name} {pod.metadata.uid}") - try: - res = self.core_api.delete_namespaced_pod( - name=pod.metadata.name, - namespace=self.namespace, - body=client.V1DeleteOptions() - ) - wait_delete_pod(pod.metadata.uid) - except Exception as e: - assert e.status == 404 - self.core_api.delete_namespace( - name=self.namespace - ) - wait_delete_ns(self.namespace) - self.node_exec_pod.clear() + if get_pod(self.node_name): + logging(f"Cleaning up pod {self.node_name}") + delete_pod(self.node_name) + def issue_cmd(self, cmd): + logging(f"Issuing command on {self.node_name}: {cmd}") - def issue_cmd(self, node_name, cmd): - logging(f"Issuing command on {node_name}: {cmd}") - pod = self.launch_pod(node_name) if isinstance(cmd, list): exec_command = cmd else: @@ -83,129 +41,113 @@ def issue_cmd(self, node_name, cmd): ] res = stream( 
self.core_api.connect_get_namespaced_pod_exec, - pod.metadata.name, - self.namespace, + self.pod.metadata.name, + 'default', command=exec_command, stderr=True, stdin=False, stdout=True, tty=False ) - logging(f"Issued command: {cmd} on {node_name} with result {res}") + logging(f"Issued command: {cmd} on {self.node_name} with result {res}") return res - def launch_pod(self, node_name): - if node_name in self.node_exec_pod: - for _ in range(DEFAULT_POD_TIMEOUT): - try: - pod = self.core_api.read_namespaced_pod( - name=node_name, - namespace=self.namespace - ) - if pod is not None and pod.status.phase == 'Running': - break - except ApiException as e: - assert e.status == 404 - - time.sleep(DEFAULT_POD_INTERVAL) - return pod - else: - pod_manifest = { - 'apiVersion': 'v1', - 'kind': 'Pod', - 'metadata': { - 'name': node_name - }, - 'spec': { - 'affinity': { - 'nodeAffinity': { - 'requiredDuringSchedulingIgnoredDuringExecution': { - 'nodeSelectorTerms': [{ - 'matchExpressions': [{ - 'key': 'kubernetes.io/hostname', - 'operator': 'In', - 'values': [ - node_name - ] - }] + def launch_pod(self): + pod_manifest = { + 'apiVersion': 'v1', + 'kind': 'Pod', + 'metadata': { + 'name': self.node_name + }, + 'spec': { + 'affinity': { + 'nodeAffinity': { + 'requiredDuringSchedulingIgnoredDuringExecution': { + 'nodeSelectorTerms': [{ + 'matchExpressions': [{ + 'key': 'kubernetes.io/hostname', + 'operator': 'In', + 'values': [ + self.node_name + ] }] - } + }] } + } + }, + "tolerations": [{ + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }, + { + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true", + "effect": "NoExecute" + }, + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }, + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Equal", + "value": "true", + "effect": "NoExecute" + }], + 'containers': [{ + 'image': 'ubuntu:16.04', + 'imagePullPolicy': 'IfNotPresent', + 'securityContext': { + 'privileged': True }, - "tolerations": [{ - "key": "node-role.kubernetes.io/master", - "operator": "Equal", - "value": "true", - "effect": "NoSchedule" - }, - { - "key": "node-role.kubernetes.io/master", - "operator": "Equal", - "value": "true", - "effect": "NoExecute" - }, - { - "key": "node-role.kubernetes.io/control-plane", - "operator": "Equal", - "value": "true", - "effect": "NoSchedule" - }, - { - "key": "node-role.kubernetes.io/control-plane", - "operator": "Equal", - "value": "true", - "effect": "NoExecute" - }], - 'containers': [{ - 'image': 'ubuntu:16.04', - 'imagePullPolicy': 'IfNotPresent', - 'securityContext': { - 'privileged': True - }, - 'name': 'node-exec', - "args": [ - "tail", "-f", "/dev/null" - ], - "volumeMounts": [{ - 'name': 'rootfs', - 'mountPath': HOST_ROOTFS - }, { - 'name': 'bus', - 'mountPath': '/var/run' - }, { - 'name': 'rancher', - 'mountPath': '/var/lib/rancher' - }], - }], - 'volumes': [{ + 'name': 'node-exec', + "args": [ + "tail", "-f", "/dev/null" + ], + "volumeMounts": [{ 'name': 'rootfs', - 'hostPath': { - 'path': '/' - } + 'mountPath': HOST_ROOTFS }, { 'name': 'bus', - 'hostPath': { - 'path': '/var/run' - } + 'mountPath': '/var/run' }, { 'name': 'rancher', - 'hostPath': { - 'path': '/var/lib/rancher' - } - }] - } + 'mountPath': '/var/lib/rancher' + }], + }], + 'volumes': [{ + 'name': 'rootfs', + 'hostPath': { + 'path': '/' + } + }, { + 'name': 'bus', + 'hostPath': { + 'path': '/var/run' + } + }, { + 
'name': 'rancher', + 'hostPath': { + 'path': '/var/lib/rancher' + } + }] } - pod = self.core_api.create_namespaced_pod( - body=pod_manifest, - namespace=self.namespace - ) - for i in range(DEFAULT_POD_TIMEOUT): - pod = self.core_api.read_namespaced_pod( - name=node_name, - namespace=self.namespace - ) - if pod is not None and pod.status.phase == 'Running': - break - time.sleep(DEFAULT_POD_INTERVAL) - self.node_exec_pod[node_name] = pod - return pod + } + pod = self.core_api.create_namespaced_pod( + body=pod_manifest, + namespace='default' + ) + for i in range(DEFAULT_POD_TIMEOUT): + pod = self.core_api.read_namespaced_pod( + name=self.node_name, + namespace='default' + ) + if pod is not None and pod.status.phase == 'Running': + break + time.sleep(DEFAULT_POD_INTERVAL) + return pod diff --git a/e2e/libs/snapshot/rest.py b/e2e/libs/snapshot/rest.py index 172984fe9b..67fcad3d38 100644 --- a/e2e/libs/snapshot/rest.py +++ b/e2e/libs/snapshot/rest.py @@ -10,7 +10,7 @@ class Rest(Base): def __init__(self): - self.volume = RestVolume(NodeExec.get_instance()) + self.volume = RestVolume() self.retry_count, self.retry_interval = get_retry_count_and_interval() def create(self, volume_name, snapshot_id, waiting): diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 70c6ab3a4f..3b0ac1eb79 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -236,6 +236,41 @@ def wait_delete_ns(name): assert not found +def delete_pod(name, namespace='default'): + core_api = client.CoreV1Api() + try: + core_api.delete_namespaced_pod(name=name, namespace=namespace) + wait_delete_pod(name) + except ApiException as e: + assert e.status == 404 + + +def wait_delete_pod(name, namespace='default'): + api = client.CoreV1Api() + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + ret = api.list_namespaced_pod(namespace=namespace) + found = False + for item in ret.items: + if item.metadata.name == name: + found = True + break + if not found: + break + time.sleep(retry_interval) + assert not found + + +def get_pod(name, namespace='default'): + try: + core_api = client.CoreV1Api() + return core_api.read_namespaced_pod(name=name, namespace=namespace) + except Exception as e: + if e.reason == 'Not Found': + return None + raise e + + def get_mgr_ips(): ret = client.CoreV1Api().list_pod_for_all_namespaces( label_selector="app=longhorn-manager", diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index ec89be8f77..ab7c2e01f8 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -6,7 +6,7 @@ from engine import Engine -from node_exec.constant import HOST_ROOTFS +from node_exec import NodeExec from utility.constant import LABEL_TEST from utility.constant import LABEL_TEST_VALUE @@ -21,10 +21,9 @@ class CRD(Base): - def __init__(self, node_exec): + def __init__(self): self.core_api = client.CoreV1Api() self.obj_api = client.CustomObjectsApi() - self.node_exec = node_exec self.retry_count, self.retry_interval = get_retry_count_and_interval() self.engine = Engine() @@ -246,10 +245,10 @@ def wait_for_volume_state(self, volume_name, desired_state): assert volume["status"]["state"] == desired_state def is_replica_running(self, volume_name, node_name, is_running): - return Rest(self.node_exec).is_replica_running(volume_name, node_name, is_running) + return Rest().is_replica_running(volume_name, node_name, is_running) def get_replica_name_on_node(self, volume_name, node_name): - return 
Rest(self.node_exec).get_replica_name_on_node(volume_name, node_name) + return Rest().get_replica_name_on_node(volume_name, node_name) def wait_for_volume_keep_in_state(self, volume_name, desired_state): self.wait_for_volume_state(volume_name, desired_state) @@ -363,7 +362,7 @@ def wait_for_volume_expand_to_size(self, volume_name, expected_size): assert engine_current_size == engine_expected_size def get_endpoint(self, volume_name): - return Rest(self.node_exec).get_endpoint(volume_name) + return Rest().get_endpoint(volume_name) def write_random_data(self, volume_name, size, data_id): node_name = self.get(volume_name)["spec"]["nodeID"] @@ -375,7 +374,7 @@ def write_random_data(self, volume_name, size, data_id): "sync; " f"md5sum {endpoint} | awk \'{{print $1}}\'" ] - checksum = self.node_exec.issue_cmd(node_name, cmd) + checksum = NodeExec(node_name).issue_cmd(cmd) logging(f"Storing volume {volume_name} data {data_id} checksum = {checksum}") self.set_data_checksum(volume_name, data_id, checksum) @@ -385,8 +384,7 @@ def keep_writing_data(self, volume_name, size): node_name = self.get(volume_name)["spec"]["nodeID"] endpoint = self.get_endpoint(volume_name) logging(f"Keeping writing data to volume {volume_name}") - res = self.node_exec.issue_cmd( - node_name, + res = NodeExec(node_name).issue_cmd( f"while true; do dd if=/dev/urandom of={endpoint} bs=1M count={size} status=none; done > /dev/null 2> /dev/null &") logging(f"Created process to keep writing data to volume {volume_name}") @@ -409,19 +407,19 @@ def delete_replica(self, volume_name, node_name): ) def wait_for_replica_rebuilding_start(self, volume_name, node_name): - return Rest(self.node_exec).wait_for_replica_rebuilding_start(volume_name, node_name) + return Rest().wait_for_replica_rebuilding_start(volume_name, node_name) def is_replica_rebuilding_in_progress(self, volume_name, node_name): - return Rest(self.node_exec).is_replica_rebuilding_in_progress(volume_name, node_name) + return Rest().is_replica_rebuilding_in_progress(volume_name, node_name) def crash_replica_processes(self, volume_name): - return Rest(self.node_exec).crash_replica_processes(volume_name) + return Rest().crash_replica_processes(volume_name) def crash_node_replica_process(self, volume_name, node_name): - return Rest(self.node_exec).crash_node_replica_process(volume_name, node_name) + return Rest().crash_node_replica_process(volume_name, node_name) def wait_for_replica_rebuilding_complete(self, volume_name, node_name): - return Rest(self.node_exec).wait_for_replica_rebuilding_complete(volume_name, node_name) + return Rest().wait_for_replica_rebuilding_complete(volume_name, node_name) def check_data_checksum(self, volume_name, data_id): expected_checksum = self.get_data_checksum(volume_name, data_id) @@ -436,8 +434,7 @@ def check_data_checksum(self, volume_name, data_id): def get_checksum(self, volume_name): node_name = self.get(volume_name)["spec"]["nodeID"] endpoint = self.get_endpoint(volume_name) - checksum = self.node_exec.issue_cmd( - node_name, + checksum = NodeExec(node_name).issue_cmd( ["sh", "-c", f"md5sum {endpoint} | awk \'{{print $1}}\'"]) logging(f"Calculated volume {volume_name} checksum {checksum}") return checksum @@ -482,16 +479,16 @@ def update_volume_spec(self, volume_name, key, value): time.sleep(self.retry_interval) def activate(self, volume_name): - return Rest(self.node_exec).activate(volume_name) + return Rest().activate(volume_name) def create_persistentvolume(self, volume_name, retry): - return 
Rest(self.node_exec).create_persistentvolume(volume_name, retry) + return Rest().create_persistentvolume(volume_name, retry) def create_persistentvolumeclaim(self, volume_name, retry): - return Rest(self.node_exec).create_persistentvolumeclaim(volume_name, retry) + return Rest().create_persistentvolumeclaim(volume_name, retry) def upgrade_engine_image(self, volume_name, engine_image_name): - return Rest(self.node_exec).upgrade_engine_image(volume_name, engine_image_name) + return Rest().upgrade_engine_image(volume_name, engine_image_name) def wait_for_engine_image_upgrade_completed(self, volume_name, engine_image_name): - return Rest(self.node_exec).wait_for_engine_image_upgrade_completed(volume_name, engine_image_name) + return Rest().wait_for_engine_image_upgrade_completed(volume_name, engine_image_name) diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py index 4ec867df15..23184b77a5 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -2,7 +2,7 @@ import time import asyncio -from node_exec.constant import HOST_ROOTFS +from node_exec import NodeExec from persistentvolumeclaim.persistentvolumeclaim import PersistentVolumeClaim from persistentvolume.persistentvolume import PersistentVolume @@ -21,8 +21,7 @@ class Rest(Base): - def __init__(self, node_exec): - self.node_exec = node_exec + def __init__(self): self.retry_count, self.retry_interval = get_retry_count_and_interval() self.pv = PersistentVolume() self.pvc = PersistentVolumeClaim() @@ -246,8 +245,7 @@ def check_data_checksum(self, volume_name, data_id): def get_checksum(self, volume_name): node_name = self.get(volume_name).controllers[0].hostId endpoint = self.get_endpoint(volume_name) - checksum = self.node_exec.issue_cmd( - node_name, + checksum = NodeExec(node_name).issue_cmd( ["sh", "-c", f"md5sum {endpoint} | awk \'{{print $1}}\'"]) logging(f"Calculated volume {volume_name} checksum {checksum}") return checksum @@ -274,7 +272,7 @@ def activate(self, volume_name): break except Exception as e: assert "hasn't finished incremental restored" in str(e.error.message) - time.sleep(RETRY_INTERVAL) + time.sleep(self.retry_interval) if activated: break volume = self.get(volume_name) diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py index 7d8fcec34b..cd3540fb1d 100644 --- a/e2e/libs/volume/volume.py +++ b/e2e/libs/volume/volume.py @@ -1,5 +1,3 @@ -from node_exec import NodeExec - from strategy import LonghornOperationStrategy from volume.base import Base @@ -12,11 +10,10 @@ class Volume(Base): _strategy = LonghornOperationStrategy.CRD def __init__(self): - node_exec = NodeExec.get_instance() if self._strategy == LonghornOperationStrategy.CRD: - self.volume = CRD(node_exec) + self.volume = CRD() else: - self.volume = Rest(node_exec) + self.volume = Rest() def create(self, volume_name, size, numberOfReplicas, frontend, migratable, accessMode, dataEngine, backingImage, Standby, fromBackup): return self.volume.create(volume_name, size, numberOfReplicas, frontend, migratable, accessMode, dataEngine, backingImage, Standby, fromBackup) diff --git a/e2e/tests/regression/test_basic.robot b/e2e/tests/regression/test_basic.robot index a381497149..9ac29395f4 100644 --- a/e2e/tests/regression/test_basic.robot +++ b/e2e/tests/regression/test_basic.robot @@ -10,6 +10,7 @@ Resource ../keywords/recurringjob.resource Resource ../keywords/statefulset.resource Resource ../keywords/volume.resource Resource ../keywords/snapshot.resource +Resource ../keywords/node.resource Test Setup Set test environment Test 
Teardown Cleanup test resources
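
Usage note (editorial, not part of the patch): with this change NodeExec is no longer a namespace-scoped singleton; each instance targets a single node and recreates its privileged exec pod (host rootfs mounted) in the default namespace, so a stale mount left over from a node reboot is not reused. A minimal sketch of how callers now issue a host command, mirroring the get_checksum path in volume/crd.py; the node and device names below are placeholders:

    from node_exec import NodeExec

    # Constructing NodeExec deletes any leftover exec pod for this node and
    # launches a fresh one; issue_cmd then runs the command inside that pod.
    checksum = NodeExec("worker-1").issue_cmd(
        ["sh", "-c", "md5sum /dev/longhorn/vol-1 | awk '{print $1}'"])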