diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile
index 8541f5edd9..09ccfc9c2f 100644
--- a/manager/integration/Dockerfile
+++ b/manager/integration/Dockerfile
@@ -1,6 +1,6 @@
 FROM registry.suse.com/bci/python:3.9
 
-ARG KUBECTL_VERSION=v1.17.0
+ARG KUBECTL_VERSION=v1.28.4
 ARG YQ_VERSION=v4.24.2
 ARG TERRAFORM_VERSION=1.3.5
 ARG ARCH=amd64
diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py
index cdef00cd44..38e2c78743 100644
--- a/manager/integration/tests/common.py
+++ b/manager/integration/tests/common.py
@@ -6125,8 +6125,14 @@ def wait_for_instance_manager_count(client, number, retry_counts=120):
     return len(ims)
 
 
-def create_deployment_and_write_data(client, core_api, make_deployment_with_pvc, volume_name, size, replica_count, data_size, attach_node_id=None):  # NOQA
-    print(volume_name)
+def create_deployment_and_write_data(client,  # NOQA
+                                     core_api,  # NOQA
+                                     make_deployment_with_pvc,  # NOQA
+                                     volume_name,  # NOQA
+                                     size,  # NOQA
+                                     replica_count,  # NOQA
+                                     data_size,  # NOQA
+                                     attach_node_id=None):  # NOQA
     apps_api = get_apps_api_client()
     volume = client.create_volume(name=volume_name,
                                   size=size,
diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py
index b43dd20bbc..abd8033e64 100644
--- a/manager/integration/tests/test_node.py
+++ b/manager/integration/tests/test_node.py
@@ -50,9 +50,6 @@ from common import update_setting
 from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3
 from common import make_deployment_with_pvc  # NOQA
 
-from common import create_pv_for_volume
-from common import create_pvc_for_volume, create_and_wait_deployment
-from common import get_apps_api_client, write_pod_volume_random_data
 from common import prepare_host_disk, wait_for_volume_degraded
 from common import create_deployment_and_write_data
 
@@ -2693,7 +2690,15 @@ def finalizer():
 def drain_node(core_api, node):  # NOQA
     set_node_cordon(core_api, node.id, True)
 
-    command = ["kubectl", "drain", node.id, "--ignore-daemonsets"]
+    command = [
+        "kubectl",
+        "drain",
+        node.id,
+        "--ignore-daemonsets",
+        "--delete-emptydir-data",
+        "--grace-period=-1"
+    ]
+
     subprocess.run(command, check=True)
 
 
@@ -2713,8 +2718,84 @@ def get_replica_detail(replica_name):
     return replica_info
 
 
+def check_node_auto_evict_state(client, target_node, expect_state):  # NOQA
+    def get_specific_node(client, target_node):
+        nodes = client.list_node()
+        for node in nodes:
+            if node.id == target_node.id:
+                return node
+
+    for i in range(RETRY_COUNTS):
+        node = get_specific_node(client, target_node)
+        if node.autoEvicting is expect_state:
+            break
+        time.sleep(RETRY_INTERVAL)
+    assert node.autoEvicting is expect_state
+
+
+def check_replica_evict_state(client, volume_name, node, expect_state):  # NOQA
+    volume = client.by_id_volume(volume_name)
+    for replica in volume.replicas:
+        if replica.hostId == node.id:
+            replica_name = replica.name
+            break
+
+    replica_info = get_replica_detail(replica_name)
+    eviction_requested = replica_info["spec"]["evictionRequested"]
+    assert eviction_requested is expect_state
+
+
+def wait_drain_complete(future, timeout):
+    """
+    Wait for a concurrent.futures object to complete within the timeout.
+    """
+    thread_timeout = timeout
+    try:
+        future.result(timeout=thread_timeout)
+        drain_complete = True
+    except TimeoutError:
+        print("drain node thread exceeded timeout {}s".format(thread_timeout))
+        drain_complete = False
+        future.cancel()
+    finally:
+        assert drain_complete is True
+
+
+def make_replica_on_specific_node(client, volume_name, node):  # NOQA
+    volume = client.by_id_volume(volume_name)
+    volume.updateReplicaCount(replicaCount=1)
+    for replica in volume.replicas:
+        if replica.hostId != node.id:
+            volume.replicaRemove(name=replica.name)
+    wait_for_volume_replica_count(client, volume_name, 1)
+
+
+def get_all_replica_name(client, volume_name):  # NOQA
+    volume_replicas = []
+    volume = client.by_id_volume(volume_name)
+    for replica in volume.replicas:
+        volume_replicas.append(replica.name)
+
+    return volume_replicas
+
+
+def check_all_replicas_evict_state(client, volume_name, expect_state):  # NOQA
+    volume = client.by_id_volume(volume_name)
+    for replica in volume.replicas:
+        replica_info = get_replica_detail(replica.name)
+        eviction_requested = replica_info["spec"]["evictionRequested"]
+        assert eviction_requested is expect_state
+
+
+@pytest.mark.skip(reason="Cannot run when an in-cluster backup store pod exists")  # NOQA
 def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc):  # NOQA
     """
+    This test case may drain the node where backup store pods are
+    located.
+    In that case, the test will fail because backup store pods can only
+    be forcibly drained.
+    ---
+
     Test drain completes after evicting replica with node-drain-policy
     block-for-eviction
 
@@ -2746,33 +2827,13 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma
     client.update(setting, value="block-for-eviction")
 
     # Step 2, 3, 4
-    volume = client.create_volume(name=volume_name,
-                                  size=str(1 * Gi),
-                                  numberOfReplicas=3)
-    volume = common.wait_for_volume_detached(client, volume_name)
-
-    pvc_name = volume_name + "-pvc"
-    create_pv_for_volume(client, core_api, volume, volume_name)
-    create_pvc_for_volume(client, core_api, volume, pvc_name)
-    deployment_name = volume_name + "-dep"
-    deployment = make_deployment_with_pvc(deployment_name, pvc_name)
-    deployment["spec"]["template"]["spec"]["nodeSelector"] \
-        = {"kubernetes.io/hostname": host_id}
-
-    apps_api = get_apps_api_client()
-    create_and_wait_deployment(apps_api, deployment)
-
-    pod_names = common.get_deployment_pod_names(core_api, deployment)
-    data_path = '/data/test'
-    write_pod_volume_random_data(core_api,
-                                 pod_names[0],
-                                 data_path,
-                                 DATA_SIZE_IN_MB_3)
-    expected_test_data_checksum = get_pod_data_md5sum(core_api,
-                                                      pod_names[0],
-                                                      data_path)
-
-    volume = wait_for_volume_healthy(client, volume_name)
+    volume, pod, checksum = create_deployment_and_write_data(client,
+                                                             core_api,
+                                                             make_deployment_with_pvc,  # NOQA
+                                                             volume_name,
+                                                             str(1 * Gi),
+                                                             3,
+                                                             DATA_SIZE_IN_MB_3, host_id)  # NOQA
 
     # Make replica not locate on eviction target node
     volume.updateReplicaCount(replicaCount=2)
@@ -2789,33 +2850,11 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma
     future = executor.submit(drain_node, core_api, evict_source_node)
 
     # Step 6
-    volume = client.by_id_volume(volume_name)
-    for replica in volume.replicas:
-        if replica.hostId == evict_source_node.id:
-            replica_name = replica.name
-            break
-
-    replica_info = get_replica_detail(replica_name)
-    eviction_requested = replica_info["spec"]["evictionRequested"]
-    assert eviction_requested is True
-
-    nodes = client.list_node()
-    for node in nodes:
-        if node.id == evict_source_node.id:
-            assert node.autoEvicting is True
+    check_replica_evict_state(client, volume_name, evict_source_node, True)
+    check_node_auto_evict_state(client, evict_source_node, True)
 
     # Step 7
-    thread_timeout = 60
-    try:
-        future.result(timeout=thread_timeout)
-        drain_complete = True
-    except TimeoutError:
-        print("drain node thread exceed timeout ({})s".format(thread_timeout))
-        drain_complete = False
-        future.cancel()
-    finally:
-        assert drain_complete is True
-
+    wait_drain_complete(future, 60)
     wait_for_volume_replica_count(client, volume_name, 2)
 
     # Step 8
@@ -2828,33 +2867,29 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma
         assert replica.hostId != evict_source_node.id
 
     # Stpe 10
-    nodes = client.list_node()
-    for node in nodes:
-        assert node.autoEvicting is False
+    check_node_auto_evict_state(client, evict_source_node, False)
 
     # Step 11
-    volume = client.by_id_volume(volume_name)
-    for replica in volume.replicas:
-        if replica.hostId == evict_target_node.id:
-            replica_name = replica.name
-            break
-
-    replica_info = get_replica_detail(replica_name)
-    eviction_requested = replica_info["spec"]["evictionRequested"]
-    assert eviction_requested is False
+    check_replica_evict_state(client, volume_name, evict_target_node, False)
 
     # Step 12
+    data_path = '/data/test'
     test_data_checksum = get_pod_data_md5sum(core_api,
-                                             pod_names[0],
+                                             pod,
                                              data_path)
-
-    assert expected_test_data_checksum == test_data_checksum
+    assert checksum == test_data_checksum
 
 
+@pytest.mark.skip(reason="Cannot run when an in-cluster backup store pod exists")  # NOQA
 def test_drain_with_block_for_eviction_if_contains_last_replica_success(client,  # NOQA
                                                                         core_api,  # NOQA
                                                                         make_deployment_with_pvc):  # NOQA
     """
+    This test case may drain the node where backup store pods are
+    located.
+    In that case, the test will fail because backup store pods can only
+    be forcibly drained.
+    ---
     Test drain completes after evicting replicas with node-drain-policy
     block-for-eviction-if-contains-last-replica
 
@@ -2882,52 +2917,10 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client,
     12. Verify that `replica.spec.evictionRequested == false` on all replicas.
     13. Verify the the data in both volumes.
     """
-    def make_replica_on_specific_node(client, volume_name, node):  # NOQA
-        volume = client.by_id_volume(volume_name)
-        volume.updateReplicaCount(replicaCount=1)
-        for replica in volume.replicas:
-            if replica.hostId != node.id:
-                volume.replicaRemove(name=replica.name)
-
-    def get_all_replica_name(client, volume_name):  # NOQA
-        volume_replicas = []
-        volume = client.by_id_volume(volume_name)
-        for replica in volume.replicas:
-            volume_replicas.append(replica.name)
-
-        return volume_replicas
-
-    def check_replica_evict_state(client, volume_name, node, expect_state):  # NOQA
-        volume = client.by_id_volume(volume_name)
-        for replica in volume.replicas:
-            if replica.hostId == node.id:
-                replica_name = replica.name
-                break
-
-        replica_info = get_replica_detail(replica_name)
-        eviction_requested = replica_info["spec"]["evictionRequested"]
-        assert eviction_requested is expect_state
-
-    def check_all_replicas_evict_state(client, volume_name, expect_state):  # NOQA
-        volume = client.by_id_volume(volume_name)
-        for replica in volume.replicas:
-            replica_info = get_replica_detail(replica.name)
-            eviction_requested = replica_info["spec"]["evictionRequested"]
-            assert eviction_requested is expect_state
-
-    def check_node_auto_evict_state(client, target_node, expect_state):
-        nodes = client.list_node()
-        for node in nodes:
-            if node.id == target_node.id:
-                assert node.autoEvicting is expect_state
-
     host_id = get_self_host_id()
     nodes = client.list_node()
     evict_nodes = [node for node in nodes if node.id != host_id][:2]
     evict_source_node = evict_nodes[0]
-    print()
-    print(host_id)
-    print(evict_source_node.id)
 
     # Create extra disk on current node
     node = client.by_id_node(host_id)
@@ -2961,8 +2954,22 @@ check_node_auto_evict_state(client, target_node, expect_state):
     # Step 2, 3
     volume1_name = "vol-1"
     volume2_name = "vol-2"
-    volume1, pod1, checksum1 = create_deployment_and_write_data(client, core_api, make_deployment_with_pvc, volume1_name, str(1 * Gi), 3, DATA_SIZE_IN_MB_3, host_id)  # NOQA
-    volume2, pod2, checksum2 = create_deployment_and_write_data(client, core_api, make_deployment_with_pvc, volume2_name, str(1 * Gi), 3, DATA_SIZE_IN_MB_3, host_id)  # NOQA
+    volume1, pod1, checksum1 = create_deployment_and_write_data(client,
+                                                                core_api,
+                                                                make_deployment_with_pvc,  # NOQA
+                                                                volume1_name,
+                                                                str(1 * Gi),
+                                                                3,
+                                                                DATA_SIZE_IN_MB_3,  # NOQA
+                                                                host_id)  # NOQA
+    volume2, pod2, checksum2 = create_deployment_and_write_data(client,
+                                                                core_api,
+                                                                make_deployment_with_pvc,  # NOQA
+                                                                volume2_name,
+                                                                str(1 * Gi),
+                                                                3,
+                                                                DATA_SIZE_IN_MB_3,  # NOQA
+                                                                host_id)  # NOQA
     # Make volume 1 replica only located on evict_source_node
     make_replica_on_specific_node(client, volume1_name, evict_source_node)
     volume2_replicas = get_all_replica_name(client, volume2_name)
@@ -2979,16 +2986,7 @@ check_node_auto_evict_state(client, target_node, expect_state):
     check_all_replicas_evict_state(client, volume2_name, False)
 
     # Step 7
-    thread_timeout = 60
-    try:
-        future.result(timeout=thread_timeout)
-        drain_complete = True
-    except TimeoutError:
-        print("drain node thread exceed timeout ({})s".format(thread_timeout))
-        drain_complete = False
-        future.cancel()
-    finally:
-        assert drain_complete is True
+    wait_drain_complete(future, 60)
 
     # Step 8
     set_node_cordon(core_api, evict_source_node.id, False)
@@ -3001,7 +2999,7 @@ check_node_auto_evict_state(client, target_node, expect_state):
 
     # Step 10
     # Verify volume2 replicas not moved by check replica name
-    # stored before node drain
+    # stored before the node drain
     volume2 = wait_for_volume_healthy(client, volume2_name)
     for replica in volume2.replicas:
         assert replica.name in volume2_replicas
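
Reviewer note: the drain hunks above all follow the same pattern, namely running kubectl drain in a worker thread and asserting that it finishes within a timeout. The following is a minimal standalone sketch of that pattern for reference only, not part of the patch; it assumes only the Python standard library, that kubectl is on the PATH, and it uses "worker-1" as a placeholder node name.

import subprocess
from concurrent.futures import ThreadPoolExecutor, TimeoutError


def drain_node(node_name):
    # Same kubectl invocation as the updated drain_node() helper above.
    command = [
        "kubectl",
        "drain",
        node_name,
        "--ignore-daemonsets",
        "--delete-emptydir-data",
        "--grace-period=-1"
    ]
    subprocess.run(command, check=True)


def wait_drain_complete(future, timeout):
    # Same shape as the new wait_drain_complete() helper: fail the caller
    # if the background drain does not finish within `timeout` seconds.
    try:
        future.result(timeout=timeout)
        drain_complete = True
    except TimeoutError:
        drain_complete = False
        future.cancel()
    assert drain_complete is True


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=1) as executor:
        # "worker-1" is a placeholder node name.
        future = executor.submit(drain_node, "worker-1")
        wait_drain_complete(future, 60)

Importing TimeoutError from concurrent.futures (rather than relying on the builtin) keeps the except clause correct on the Python 3.9 base image used by the test Dockerfile.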