From 030d2f0b180e1518ee4644a12ba72484bfaff161 Mon Sep 17 00:00:00 2001
From: Chris <chris.chien@suse.com>
Date: Mon, 11 Mar 2024 12:56:15 +0800
Subject: [PATCH] Add test case test_engine_crash_during_live_upgrade

ref: 7859

Signed-off-by: Chris <chris.chien@suse.com>
---
 manager/integration/tests/common.py           |  3 +-
 .../integration/tests/test_engine_upgrade.py  | 65 +++++++++++++++++--
 manager/integration/tests/test_node.py        | 34 +++++-----
 3 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py
index 47dd4daee9..3de48d35b0 100644
--- a/manager/integration/tests/common.py
+++ b/manager/integration/tests/common.py
@@ -6150,4 +6150,5 @@ def create_deployment_and_write_data(client, # NOQA
                                    deployment_pod_names[0],
                                    data_path)
 
-    return client.by_id_volume(volume_name), deployment_pod_names[0], checksum
+    volume = client.by_id_volume(volume_name)
+    return volume, deployment_pod_names[0], checksum, deployment
diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py
index 31fc22859a..0aad9b249e 100644
--- a/manager/integration/tests/test_engine_upgrade.py
+++ b/manager/integration/tests/test_engine_upgrade.py
@@ -28,6 +28,12 @@
 from common import wait_for_rebuild_start
 from common import create_backup, wait_for_backup_restore_completed
 from common import SETTING_CONCURRENT_AUTO_ENGINE_UPGRADE_NODE_LIMIT
+from common import create_deployment_and_write_data
+from common import crash_engine_process_with_sigkill
+from common import get_deployment_pod_names, create_snapshot
+from common import wait_pod
+from common import make_deployment_with_pvc # NOQA
+from common import DATA_SIZE_IN_MB_2
 from test_settings import delete_replica_on_test_node
 from backupstore import set_random_backupstore # NOQA
 
@@ -1197,18 +1203,67 @@ def test_engine_live_upgrade_while_replica_concurrent_rebuild(client, # NOQA
         assert replica.image == engine_upgrade_image
         assert replica.currentImage == engine_upgrade_image
 
-@pytest.mark.skip(reason="TODO")  # NOQA
-def test_engine_crash_during_live_upgrade():
+
+def test_engine_crash_during_live_upgrade(client, core_api, # NOQA
+                                          make_deployment_with_pvc, # NOQA
+                                          volume_name): # NOQA
     """
-    1. Create and attach a volume to a workload, then write data into the
+    1. Deploy an extra engine image.
+    2. Create and attach a volume to a workload, then write data into the
        volume.
-    2. Deploy an extra engine image.
     3. Send live upgrade request then immediately delete the related engine
        manager pod/engine process (The new replicas are not in active in this
        case).
     4. Verify the workload will be restarted and the volume will be reattached
        automatically.
-    5. Verify the upgrade is done during the reattachment.
+    5. Verify the upgrade is done.
        (It actually becomes offline upgrade.)
     6. Verify volume healthy and the data is correct.
     """
+    # Step 1: deploy the extra engine image used as the upgrade target
+    _, default_img_name, engine_upgrade_image, compatible_img, _ = \
+        prepare_auto_upgrade_engine_to_default_version(client)
+
+    # Step 2: create the workload and write data into its volume
+    host_id = get_self_host_id()
+    volume, pod_name, checksum, deployment = \
+        create_deployment_and_write_data(client, core_api,
+                                         make_deployment_with_pvc, # NOQA
+                                         volume_name, str(1 * Gi),
+                                         3,
+                                         DATA_SIZE_IN_MB_2,
+                                         host_id) # NOQA
+
+    create_snapshot(client, volume_name)
+
+    # Step 3: request the live upgrade, then crash the engine right away
+    volume.engineUpgrade(image=engine_upgrade_image)
+    crash_engine_process_with_sigkill(client, core_api, volume_name)
+
+    # Step 4, 5: wait for detach, the new image, then a healthy volume
+    volume = wait_for_volume_detached(client, volume_name)
+    volume = wait_for_volume_current_image(client, volume_name,
+                                           engine_upgrade_image)
+    wait_for_engine_image_ref_count(client, default_img_name, 0)
+    # The total ei.refCount for one volume is equal to
+    # 1 volume + 1 engine + all replicas(3)
+    wait_for_engine_image_ref_count(client, compatible_img.name, 5)
+    volume = wait_for_volume_healthy(client, volume_name)
+
+    # Make sure the workload pod was replaced; fail clearly if it never is
+    new_pod_name = None
+    for _ in range(RETRY_COUNTS):
+        time.sleep(RETRY_INTERVAL)
+        pod_names = get_deployment_pod_names(core_api, deployment)
+        if pod_names and pod_names[0] != pod_name:
+            new_pod_name = pod_names[0]
+            break
+    assert new_pod_name is not None, "workload pod was not restarted"
+    wait_pod(new_pod_name)
+
+    # Step 6: the data written before the crash must be unchanged
+    data_path = '/data/test'
+    test_data_checksum = get_pod_data_md5sum(core_api,
+                                             new_pod_name,
+                                             data_path)
+    assert test_data_checksum == checksum
diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py
index 7e199e4f5b..8793c7b17a 100644
--- a/manager/integration/tests/test_node.py
+++ b/manager/integration/tests/test_node.py
@@ -2832,13 +2832,13 @@ def test_drain_with_block_for_eviction_success(client, # NOQA
     client.update(setting, value="block-for-eviction")
 
     # Step 2, 3, 4
-    volume, pod, checksum = create_deployment_and_write_data(client,
-                                                             core_api,
-                                                             make_deployment_with_pvc, # NOQA
-                                                             volume_name,
-                                                             str(1 * Gi),
-                                                             3,
-                                                             DATA_SIZE_IN_MB_3, host_id) # NOQA
+    volume, pod, checksum, _ = create_deployment_and_write_data(client,
+                                                                core_api,
+                                                                make_deployment_with_pvc, # NOQA
+                                                                volume_name,
+                                                                str(1 * Gi),
+                                                                3,
+                                                                DATA_SIZE_IN_MB_3, host_id) # NOQA
 
     # Make replica not locate on eviction target node
     volume.updateReplicaCount(replicaCount=2)
@@ -2952,7 +2952,8 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client,
     # Step 2, 3
     volume1_name = "vol-1"
     volume2_name = "vol-2"
-    volume1, pod1, checksum1 = create_deployment_and_write_data(client,
+    volume1, pod1, checksum1, _ = create_deployment_and_write_data(
+                                                                client,
                                                                 core_api,
                                                                 make_deployment_with_pvc, # NOQA
                                                                 volume1_name,
@@ -2960,7 +2961,8 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client,
                                                                 3,
                                                                 DATA_SIZE_IN_MB_3, # NOQA
                                                                 host_id) # NOQA
-    volume2, pod2, checksum2 = create_deployment_and_write_data(client,
+    volume2, pod2, checksum2, _ = create_deployment_and_write_data(
+                                                                client,
                                                                 core_api,
                                                                 make_deployment_with_pvc,  # NOQA
                                                                 volume2_name,
@@ -3053,13 +3055,13 @@ def test_drain_with_block_for_eviction_failure(client, # NOQA
     client.update(setting, value="block-for-eviction")
 
     # Step 2, 3, 4
-    volume, pod, checksum = create_deployment_and_write_data(client,
-                                                             core_api,
-                                                             make_deployment_with_pvc, # NOQA
-                                                             volume_name,
-                                                             str(1 * Gi),
-                                                             3,
-                                                             DATA_SIZE_IN_MB_3, host_id) # NOQA
+    volume, pod, checksum, _ = create_deployment_and_write_data(client,
+                                                                core_api,
+                                                                make_deployment_with_pvc, # NOQA
+                                                                volume_name,
+                                                                str(1 * Gi),
+                                                                3,
+                                                                DATA_SIZE_IN_MB_3, host_id) # NOQA
 
     # Step 5
     executor = ThreadPoolExecutor(max_workers=5)