Skip to content

Commit

Permalink
Add test case test_engine_crash_during_live_upgrade
Browse files Browse the repository at this point in the history
ref: 7859

Signed-off-by: Chris <[email protected]>
  • Loading branch information
chriscchien authored and yangchiu committed Mar 20, 2024
1 parent 8ff10e6 commit 6c1c54e
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 23 deletions.
7 changes: 5 additions & 2 deletions manager/integration/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,7 +868,8 @@ def write_pod_volume_random_data(api, pod_name, path, size_in_mb):
'/bin/sh',
'-c',
'dd if=/dev/urandom of=' + path +
' bs=1M' + ' count=' + str(size_in_mb)
' bs=1M' + ' count=' + str(size_in_mb) +
'; sync'
]
return stream(
api.connect_get_namespaced_pod_exec, pod_name, 'default',
Expand Down Expand Up @@ -6146,8 +6147,10 @@ def create_deployment_and_write_data(client, # NOQA
deployment_pod_names[0],
data_path,
data_size)

checksum = get_pod_data_md5sum(core_api,
deployment_pod_names[0],
data_path)

return client.by_id_volume(volume_name), deployment_pod_names[0], checksum
volume = client.by_id_volume(volume_name)
return volume, deployment_pod_names[0], checksum, deployment
63 changes: 58 additions & 5 deletions manager/integration/tests/test_engine_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
from common import wait_for_rebuild_start
from common import create_backup, wait_for_backup_restore_completed
from common import SETTING_CONCURRENT_AUTO_ENGINE_UPGRADE_NODE_LIMIT
from common import create_deployment_and_write_data
from common import crash_engine_process_with_sigkill
from common import get_deployment_pod_names
from common import wait_pod
from common import make_deployment_with_pvc # NOQA
from common import DATA_SIZE_IN_MB_2
from test_settings import delete_replica_on_test_node
from backupstore import set_random_backupstore # NOQA

Expand Down Expand Up @@ -1197,18 +1203,65 @@ def test_engine_live_upgrade_while_replica_concurrent_rebuild(client, # NOQA
assert replica.image == engine_upgrade_image
assert replica.currentImage == engine_upgrade_image

@pytest.mark.skip(reason="TODO") # NOQA
def test_engine_crash_during_live_upgrade():

def test_engine_crash_during_live_upgrade(client, core_api, # NOQA
make_deployment_with_pvc, # NOQA
volume_name): # NOQA
"""
1. Create and attach a volume to a workload, then write data into the
1. Deploy an extra engine image.
2. Create and attach a volume to a workload, then write data into the
volume.
2. Deploy an extra engine image.
3. Send a live upgrade request, then immediately delete the related engine
manager pod/engine process (the new replicas are not yet active in this
case).
4. Verify the workload will be restarted and the volume will be reattached
automatically.
5. Verify the upgrade is done during the reattachment.
5. Verify the upgrade is done.
(It actually becomes offline upgrade.)
6. Verify the volume is healthy and the data is correct.
"""
# Step 1
_, default_img_name, engine_upgrade_image, compatible_img, _ = \
prepare_auto_upgrade_engine_to_default_version(client)

# Step 2
host_id = get_self_host_id()
volume, pod_name, checksum, deployment = \
create_deployment_and_write_data(client, core_api,
make_deployment_with_pvc, # NOQA
volume_name, str(1 * Gi),
3,
DATA_SIZE_IN_MB_2,
host_id) # NOQA

# Step 3
volume.engineUpgrade(image=engine_upgrade_image)
crash_engine_process_with_sigkill(client, core_api, volume_name)

# Step 4, 5
volume = wait_for_volume_detached(client, volume_name)
volume = wait_for_volume_current_image(client, volume_name,
engine_upgrade_image)
wait_for_engine_image_ref_count(client, default_img_name, 0)
# Total ei.refCount for one volume is equal to
# 1 volume + 1 engine + all replicas (3)
wait_for_engine_image_ref_count(client, compatible_img.name, 5)
volume = wait_for_volume_healthy(client, volume_name)

# make sure pod restarted
for i in range(RETRY_COUNTS):
time.sleep(RETRY_INTERVAL)
deployment_pod_names = get_deployment_pod_names(core_api,
deployment)
if deployment_pod_names[0] != pod_name:
new_pod_name = deployment_pod_names[0]
break

wait_pod(new_pod_name)

# Step 6
data_path = '/data/test'
test_data_checksum = get_pod_data_md5sum(core_api,
new_pod_name,
data_path)
assert test_data_checksum == checksum
34 changes: 18 additions & 16 deletions manager/integration/tests/test_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2832,13 +2832,13 @@ def test_drain_with_block_for_eviction_success(client, # NOQA
client.update(setting, value="block-for-eviction")

# Step 2, 3, 4
volume, pod, checksum = create_deployment_and_write_data(client,
core_api,
make_deployment_with_pvc, # NOQA
volume_name,
str(1 * Gi),
3,
DATA_SIZE_IN_MB_3, host_id) # NOQA
volume, pod, checksum, _ = create_deployment_and_write_data(client,
core_api,
make_deployment_with_pvc, # NOQA
volume_name,
str(1 * Gi),
3,
DATA_SIZE_IN_MB_3, host_id) # NOQA

# Make sure no replica is located on the eviction target node
volume.updateReplicaCount(replicaCount=2)
Expand Down Expand Up @@ -2952,15 +2952,17 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client,
# Step 2, 3
volume1_name = "vol-1"
volume2_name = "vol-2"
volume1, pod1, checksum1 = create_deployment_and_write_data(client,
volume1, pod1, checksum1, _ = create_deployment_and_write_data(
client,
core_api,
make_deployment_with_pvc, # NOQA
volume1_name,
str(1 * Gi),
3,
DATA_SIZE_IN_MB_3, # NOQA
host_id) # NOQA
volume2, pod2, checksum2 = create_deployment_and_write_data(client,
volume2, pod2, checksum2, _ = create_deployment_and_write_data(
client,
core_api,
make_deployment_with_pvc, # NOQA
volume2_name,
Expand Down Expand Up @@ -3053,13 +3055,13 @@ def test_drain_with_block_for_eviction_failure(client, # NOQA
client.update(setting, value="block-for-eviction")

# Step 2, 3, 4
volume, pod, checksum = create_deployment_and_write_data(client,
core_api,
make_deployment_with_pvc, # NOQA
volume_name,
str(1 * Gi),
3,
DATA_SIZE_IN_MB_3, host_id) # NOQA
volume, pod, checksum, _ = create_deployment_and_write_data(client,
core_api,
make_deployment_with_pvc, # NOQA
volume_name,
str(1 * Gi),
3,
DATA_SIZE_IN_MB_3, host_id) # NOQA

# Step 5
executor = ThreadPoolExecutor(max_workers=5)
Expand Down

0 comments on commit 6c1c54e

Please sign in to comment.