From 030d2f0b180e1518ee4644a12ba72484bfaff161 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 11 Mar 2024 12:56:15 +0800 Subject: [PATCH] Add test case test_engine_crash_during_live_upgrade ref: 7859 Signed-off-by: Chris --- manager/integration/tests/common.py | 3 +- .../integration/tests/test_engine_upgrade.py | 65 +++++++++++++++++-- manager/integration/tests/test_node.py | 34 +++++----- 3 files changed, 80 insertions(+), 22 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 47dd4daee9..3de48d35b0 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -6150,4 +6150,5 @@ def create_deployment_and_write_data(client, # NOQA deployment_pod_names[0], data_path) - return client.by_id_volume(volume_name), deployment_pod_names[0], checksum + volume = client.by_id_volume(volume_name) + return volume, deployment_pod_names[0], checksum, deployment diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py index 31fc22859a..0aad9b249e 100644 --- a/manager/integration/tests/test_engine_upgrade.py +++ b/manager/integration/tests/test_engine_upgrade.py @@ -28,6 +28,12 @@ from common import wait_for_rebuild_start from common import create_backup, wait_for_backup_restore_completed from common import SETTING_CONCURRENT_AUTO_ENGINE_UPGRADE_NODE_LIMIT +from common import create_deployment_and_write_data +from common import crash_engine_process_with_sigkill +from common import get_deployment_pod_names, create_snapshot +from common import wait_pod +from common import make_deployment_with_pvc # NOQA +from common import DATA_SIZE_IN_MB_2 from test_settings import delete_replica_on_test_node from backupstore import set_random_backupstore # NOQA @@ -1197,18 +1203,67 @@ def test_engine_live_upgrade_while_replica_concurrent_rebuild(client, # NOQA assert replica.image == engine_upgrade_image assert replica.currentImage == engine_upgrade_image 
-@pytest.mark.skip(reason="TODO") # NOQA -def test_engine_crash_during_live_upgrade(): + +def test_engine_crash_during_live_upgrade(client, core_api, # NOQA + make_deployment_with_pvc, # NOQA + volume_name): # NOQA """ - 1. Create and attach a volume to a workload, then write data into the + 1. Deploy an extra engine image. + 2. Create and attach a volume to a workload, then write data into the volume. - 2. Deploy an extra engine image. 3. Send live upgrade request then immediately delete the related engine manager pod/engine process (The new replicas are not in active in this case). 4. Verify the workload will be restarted and the volume will be reattached automatically. - 5. Verify the upgrade is done during the reattachment. + 5. Verify the upgrade is done. (It actually becomes an offline upgrade.) 6. Verify volume healthy and the data is correct. """ + # Step 1 + _, default_img_name, engine_upgrade_image, compatible_img, _ = \ + prepare_auto_upgrade_engine_to_default_version(client) + + # Step 2 + host_id = get_self_host_id() + volume, pod_name, checksum, deployment = \ + create_deployment_and_write_data(client, core_api, + make_deployment_with_pvc, # NOQA + volume_name, str(1 * Gi), + 3, + DATA_SIZE_IN_MB_2, + host_id) # NOQA + + create_snapshot(client, volume_name) + + # Step 3 + volume.engineUpgrade(image=engine_upgrade_image) + crash_engine_process_with_sigkill(client, core_api, volume_name) + + # Step 4, 5 + volume = wait_for_volume_detached(client, volume_name) + volume = wait_for_volume_current_image(client, volume_name, + engine_upgrade_image) + wait_for_engine_image_ref_count(client, default_img_name, 0) + # Total ei.refCount of one volume is equal to + # 1 volume + 1 engine + all replicas(3) + wait_for_engine_image_ref_count(client, compatible_img.name, 5) + volume = wait_for_volume_healthy(client, volume_name) + + # make sure pod restarted + for i in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + deployment_pod_names = 
get_deployment_pod_names(core_api, + deployment) + if deployment_pod_names[0] != pod_name: + new_pod_name = deployment_pod_names[0] + break + + wait_pod(new_pod_name) + + # Step 6 + data_path = '/data/test' + test_data_checksum = get_pod_data_md5sum(core_api, + new_pod_name, + data_path) + assert test_data_checksum == checksum diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 7e199e4f5b..8793c7b17a 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -2832,13 +2832,13 @@ def test_drain_with_block_for_eviction_success(client, # NOQA client.update(setting, value="block-for-eviction") # Step 2, 3, 4 - volume, pod, checksum = create_deployment_and_write_data(client, - core_api, - make_deployment_with_pvc, # NOQA - volume_name, - str(1 * Gi), - 3, - DATA_SIZE_IN_MB_3, host_id) # NOQA + volume, pod, checksum, _ = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA # Make replica not locate on eviction target node volume.updateReplicaCount(replicaCount=2) @@ -2952,7 +2952,8 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # Step 2, 3 volume1_name = "vol-1" volume2_name = "vol-2" - volume1, pod1, checksum1 = create_deployment_and_write_data(client, + volume1, pod1, checksum1, _ = create_deployment_and_write_data( + client, core_api, make_deployment_with_pvc, # NOQA volume1_name, @@ -2960,7 +2961,8 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, 3, DATA_SIZE_IN_MB_3, # NOQA host_id) # NOQA - volume2, pod2, checksum2 = create_deployment_and_write_data(client, + volume2, pod2, checksum2, _ = create_deployment_and_write_data( + client, core_api, make_deployment_with_pvc, # NOQA volume2_name, @@ -3053,13 +3055,13 @@ def test_drain_with_block_for_eviction_failure(client, # NOQA client.update(setting, 
value="block-for-eviction") # Step 2, 3, 4 - volume, pod, checksum = create_deployment_and_write_data(client, - core_api, - make_deployment_with_pvc, # NOQA - volume_name, - str(1 * Gi), - 3, - DATA_SIZE_IN_MB_3, host_id) # NOQA + volume, pod, checksum, _ = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA # Step 5 executor = ThreadPoolExecutor(max_workers=5)