diff --git a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md b/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md
deleted file mode 100644
index 2702e1fa72..0000000000
--- a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md
+++ /dev/null
@@ -1,24 +0,0 @@
----
-title: "[#1366](https://github.com/longhorn/longhorn/issues/1366) && [#1328](https://github.com/longhorn/longhorn/issues/1328) The node the DR volume attached to is rebooted"
----
-#### Scenario 1
-1. Create a pod with Longhorn volume.
-2. Write data to the volume and get the md5sum.
-3. Create the 1st backup for the volume.
-4. Create a DR volume from the backup.
-5. Wait for the DR volume starting the initial restore. Then reboot the DR volume attached node immediately.
-6. Wait for the DR volume detached then reattached.
-7. Wait for the DR volume restore complete after the reattachment.
-8. Activate the DR volume and check the data md5sum.
-#### Scenario 2
-1. Create a pod with Longhorn volume.
-2. Write data to the volume and get the md5sum.
-3. Create the 1st backup for the volume.
-4. Create a DR volume from the backup.
-5. Wait for the DR volume to complete the initial restore.
-6. Write more data to the original volume and get the md5sum.
-7. Create the 2nd backup for the volume.
-8. Wait for the DR volume incremental restore getting triggered. Then reboot the DR volume attached node immediately.
-9. Wait for the DR volume detached then reattached.
-10. Wait for the DR volume restore complete after the reattachment.
-11. Activate the DR volume and check the data md5sum.
diff --git a/e2e/keywords/backup.resource b/e2e/keywords/backup.resource
index e0a14133e6..309a94c821 100644
--- a/e2e/keywords/backup.resource
+++ b/e2e/keywords/backup.resource
@@ -47,3 +47,8 @@ Check volume ${volume_id} data is backup ${backup_id}
 
 Check backup synced from backupstore
     assert_all_backups_before_uninstall_exist    ${backups_before_uninstall}
+
+Volume ${volume_id} backup ${backup_id} should be able to create
+    Create backup ${backup_id} for volume ${volume_id}
+    Verify backup list contains no error for volume ${volume_id}
+    Verify backup list contains backup ${backup_id} of volume ${volume_id}
diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource
index b322b369e0..d17034a009 100644
--- a/e2e/keywords/volume.resource
+++ b/e2e/keywords/volume.resource
@@ -170,6 +170,11 @@ Wait for volume ${volume_id} restoration from backup ${backup_id} completed
     ${backup_name} =    get_backup_name    ${backup_id}
     wait_for_volume_restoration_completed    ${volume_name}    ${backup_name}
 
+Wait for volume ${volume_id} restoration from backup ${backup_id} start
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${backup_name} =    get_backup_name    ${backup_id}
+    wait_for_volume_restoration_start    ${volume_name}    ${backup_name}
+
 Wait until volume ${volume_id} replica rebuilding started on ${replica_locality}
     ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
     wait_for_replica_rebuilding_to_start_on_node    ${volume_name}    ${replica_locality}
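The two new keywords above resolve the test's short IDs to generated resource names and delegate to the Python keyword library. Every such `Wait for ...` keyword bottoms out in the same poll-and-retry shape. The following is a minimal standalone sketch of that pattern (a hypothetical `wait_until` helper, not part of the e2e library), using the suite's usual retry knobs of 400 attempts at 1-second intervals:

```python
import time

# Hypothetical helper (illustration only, not the e2e library's code):
# poll a condition up to retry_count times, sleep retry_interval seconds
# between attempts, and fail loudly on timeout. This is the same shape as
# wait_for_volume_restoration_start/_completed below.
def wait_until(condition, retry_count=400, retry_interval=1, desc="condition"):
    for _ in range(retry_count):
        try:
            if condition():
                return
        except Exception as e:
            print(f"Error while checking {desc}: {e}")
        time.sleep(retry_interval)
    raise AssertionError(f"Timed out waiting for {desc}")

# Usage sketch: wait_until(lambda: restore_started_somehow("volume-1"), desc="restore start")
```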
diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py
index 2ef9c77ac0..572e29e402 100644
--- a/e2e/libs/keywords/volume_keywords.py
+++ b/e2e/libs/keywords/volume_keywords.py
@@ -280,6 +280,10 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         logging(f'Waiting for volume {volume_name} restoration from {backup_name} completed')
         self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        logging(f'Waiting for volume {volume_name} restoration from {backup_name} to start')
+        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)
+
     def validate_volume_replicas_anti_affinity(self, volume_name):
         self.volume.validate_volume_replicas_anti_affinity(volume_name)
 
diff --git a/e2e/libs/volume/base.py b/e2e/libs/volume/base.py
index ef94d19b30..c58489e866 100644
--- a/e2e/libs/volume/base.py
+++ b/e2e/libs/volume/base.py
@@ -92,6 +92,10 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         return NotImplemented
 
+    @abstractmethod
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        return NotImplemented
+
     @abstractmethod
     def get_endpoint(self, volume_name):
         return NotImplemented
diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py
index 9598a99b8d..90b0286136 100644
--- a/e2e/libs/volume/crd.py
+++ b/e2e/libs/volume/crd.py
@@ -357,6 +357,30 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
             time.sleep(self.retry_interval)
         assert updated
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name,
+                                          progress=0):
+        started = False
+        for i in range(self.retry_count):
+            try:
+                engines = self.engine.get_engines(volume_name)
+                for engine in engines:
+                    for status in engine['status']['restoreStatus'].values():
+                        if status['state'] == "in_progress" and status['progress'] > progress:
+                            started = True
+                            break
+                    # Sometimes the restore is so quick that the test cannot
+                    # catch the intermediate in_progress status.
+                    if engine['status']['lastRestoredBackup'] == backup_name:
+                        started = True
+                    if started:
+                        break
+                if started:
+                    break
+            except Exception as e:
+                logging(f"Getting volume {volume_name} engines error: {e}")
+            time.sleep(self.retry_interval)
+        assert started
+
     def wait_for_volume_expand_to_size(self, volume_name, expected_size):
         engine = None
         engine_current_size = 0
diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py
index 3d6a4225a3..ab170070a7 100644
--- a/e2e/libs/volume/rest.py
+++ b/e2e/libs/volume/rest.py
@@ -80,6 +80,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name):
         return NotImplemented
 
+    def wait_for_volume_restoration_start(self, volume_name):
+        return NotImplemented
+
     def get_endpoint(self, volume_name):
         endpoint = ""
         v = self.get(volume_name)
diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py
index b039545a45..f9bb1ac7eb 100644
--- a/e2e/libs/volume/volume.py
+++ b/e2e/libs/volume/volume.py
@@ -78,6 +78,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)
+
     def wait_for_volume_expand_to_size(self, volume_name, size):
         return self.volume.wait_for_volume_expand_to_size(volume_name, size)
 
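The notable detail in the new CRD-side wait is its fallback: a small restore can finish entirely between two polls, so seeing `lastRestoredBackup` equal to the expected backup also counts as the restore having started. Extracted as a standalone predicate for clarity (a sketch that assumes the engine dict shape used in `crd.py` above; it is not library code):

```python
# Sketch of the per-engine check from wait_for_volume_restoration_start.
# 'engine' is assumed to be a dict shaped like a Longhorn engine CR, where
# status.restoreStatus maps replica addresses to {'state', 'progress'} and
# status.lastRestoredBackup names the last fully restored backup.
def restore_started(engine, backup_name, progress=0):
    for status in engine['status']['restoreStatus'].values():
        if status['state'] == "in_progress" and status['progress'] > progress:
            return True
    # A short restore may complete before any poll observes "in_progress",
    # so a finished restore of the expected backup also counts as started.
    return engine['status']['lastRestoredBackup'] == backup_name
```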
diff --git a/e2e/tests/negative/test_backup_listing.robot b/e2e/tests/negative/test_backup_listing.robot
index 0d292c7141..6e0c921b25 100644
--- a/e2e/tests/negative/test_backup_listing.robot
+++ b/e2e/tests/negative/test_backup_listing.robot
@@ -101,11 +101,6 @@ Create pod ${pod_id} mount ${size} GB volume ${volume_id}
     Create pod ${pod_id} using volume ${volume_id}
     Wait for pod ${pod_id} running
 
-Volume ${volume_id} backup ${backup_id} should be able to create
-    Create backup ${backup_id} for volume ${volume_id}
-    Verify backup list contains no error for volume ${volume_id}
-    Verify backup list contains backup ${backup_id} of volume ${volume_id}
-
 Write data to file in deployment 0
     Write 100 MB data to file data in deployment 0
 
diff --git a/e2e/tests/negative/test_dr_volume_node_reboot.robot b/e2e/tests/negative/test_dr_volume_node_reboot.robot
new file mode 100644
index 0000000000..2a8d2c7f30
--- /dev/null
+++ b/e2e/tests/negative/test_dr_volume_node_reboot.robot
@@ -0,0 +1,91 @@
+*** Settings ***
+Documentation    Test DR volume node reboot
+...              https://github.com/longhorn/longhorn/issues/8425
+
+Test Tags    manual    longhorn-8425
+
+Resource    ../keywords/common.resource
+Resource    ../keywords/deployment.resource
+Resource    ../keywords/workload.resource
+Resource    ../keywords/longhorn.resource
+Resource    ../keywords/host.resource
+Resource    ../keywords/storageclass.resource
+Resource    ../keywords/persistentvolumeclaim.resource
+Resource    ../keywords/recurringjob.resource
+Resource    ../keywords/statefulset.resource
+Resource    ../keywords/volume.resource
+Resource    ../keywords/snapshot.resource
+Resource    ../keywords/backup.resource
+
+
+Test Setup    Set test environment
+Test Teardown    Cleanup test resources
+
+*** Variables ***
+${RETRY_COUNT}    400
+${LOOP_COUNT}    5
+${RETRY_INTERVAL}    1
+${DATA_ENGINE}    v1
+
+*** Test Cases ***
+DR Volume Node Reboot During Initial Restoration
+    [Tags]    manual    longhorn-8425
+    [Documentation]    Test DR volume node reboot during initial restoration
+    ...    Create a pod with Longhorn volume.
+    ...    Write data to the volume and get the md5sum.
+    ...    Create the 1st backup for the volume.
+    ...    Create a DR volume from the backup.
+    ...    Wait for the DR volume to start the initial restore.
+    ...    Then immediately reboot the node the DR volume is attached to.
+    ...    Wait for the DR volume to be detached and then reattached.
+    ...    Wait for the DR volume restore to complete after the reattachment.
+    ...    Activate the DR volume and check the data md5sum.
+    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
+    And Attach volume 0
+    And Wait for volume 0 healthy
+    And Write data 0 to volume 0
+    Then Volume 0 backup 0 should be able to create
+    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+        Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
+        And Wait for volume 1 restoration from backup 0 start
+        Then Reboot volume 1 volume node
+        And Wait for volume 1 restoration from backup 0 completed
+        When Activate DR volume 1
+        And Attach volume 1
+        And Wait for volume 1 healthy
+        Then Check volume 1 data is backup 0
+        Then Detach volume 1
+        And Delete volume 1
+    END
+
+DR Volume Node Reboot During Incremental Restoration
+    [Tags]    manual    longhorn-8425
+    [Documentation]    Test DR volume node reboot during incremental restoration
+    ...    Create a pod with Longhorn volume.
+    ...    Write data to the volume and get the md5sum.
+    ...    Create the 1st backup for the volume.
+    ...    Create a DR volume from the backup.
+    ...    Wait for the DR volume to complete the initial restore.
+    ...    Write more data to the original volume and get the md5sum.
+    ...    Create the 2nd backup for the volume.
+    ...    Wait for the DR volume incremental restore to be triggered.
+    ...    Then immediately reboot the node the DR volume is attached to.
+    ...    Wait for the DR volume to be detached and then reattached.
+    ...    Wait for the DR volume restore to complete after the reattachment.
+    ...    Activate the DR volume and check the data md5sum.
+    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
+    And Attach volume 0
+    And Wait for volume 0 healthy
+    And Write data 0 to volume 0
+    Then Volume 0 backup 0 should be able to create
+    Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
+    And Wait for volume 1 restoration from backup 0 completed
+    Then Write data 1 to volume 0
+    And Volume 0 backup 1 should be able to create
+    And Wait for volume 1 restoration from backup 1 start
+    Then Reboot volume 1 volume node
+    Then Wait for volume 1 restoration from backup 1 completed
+    And Activate DR volume 1
+    And Attach volume 1
+    And Wait for volume 1 healthy
+    And Check volume 1 data is backup 1
\ No newline at end of file
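Both test cases finish by activating the DR volume and running `Check volume 1 data is backup X`, the automated form of the manual "check the data md5sum" step: the checksum recorded at write time must match the checksum of the restored data. A minimal sketch of such a checksum using only the standard library (illustrative, not the suite's actual data checker):

```python
import hashlib

# Minimal md5sum sketch: hash the file written on the source volume, hash
# it again on the activated DR volume, and require the digests to match.
def md5sum(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
```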