test(robot): Add case DR Volume Node Reboot During Restoration #2165

Open · wants to merge 2 commits into master


5 changes: 5 additions & 0 deletions e2e/keywords/backup.resource
@@ -47,3 +47,8 @@ Check volume ${volume_id} data is backup ${backup_id}

Check backup synced from backupstore
    assert_all_backups_before_uninstall_exist    ${backups_before_uninstall}

Volume ${volume_id} backup ${backup_id} should be able to create
    Create backup ${backup_id} for volume ${volume_id}
    Verify backup list contains no error for volume ${volume_id}
    Verify backup list contains backup ${backup_id} of volume ${volume_id}
5 changes: 5 additions & 0 deletions e2e/keywords/volume.resource
@@ -170,6 +170,11 @@ Wait for volume ${volume_id} restoration from backup ${backup_id} completed
    ${backup_name} =    get_backup_name    ${backup_id}
    wait_for_volume_restoration_completed    ${volume_name}    ${backup_name}

Wait for volume ${volume_id} restoration from backup ${backup_id} start
    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
    ${backup_name} =    get_backup_name    ${backup_id}
    wait_for_volume_restoration_start    ${volume_name}    ${backup_name}

Wait until volume ${volume_id} replica rebuilding started on ${replica_locality}
    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
    wait_for_replica_rebuilding_to_start_on_node    ${volume_name}    ${replica_locality}
4 changes: 4 additions & 0 deletions e2e/libs/keywords/volume_keywords.py
@@ -280,6 +280,10 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
        logging(f'Waiting for volume {volume_name} restoration from {backup_name} completed')
        self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)

    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        logging(f'Waiting for volume {volume_name} restoration from {backup_name} start')
        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)

    def validate_volume_replicas_anti_affinity(self, volume_name):
        self.volume.validate_volume_replicas_anti_affinity(volume_name)
4 changes: 4 additions & 0 deletions e2e/libs/volume/base.py
@@ -92,6 +92,10 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
    def wait_for_volume_restoration_completed(self, volume_name, backup_name):
        return NotImplemented

    @abstractmethod
    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        return NotImplemented

    @abstractmethod
    def get_endpoint(self, volume_name):
        return NotImplemented
24 changes: 24 additions & 0 deletions e2e/libs/volume/crd.py
@@ -357,6 +357,30 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
            time.sleep(self.retry_interval)
        assert updated

    def wait_for_volume_restoration_start(self, volume_name, backup_name,
                                          progress=0):
        started = False
        for i in range(self.retry_count):
            try:
                engines = self.engine.get_engines(volume_name)
                for engine in engines:
                    for status in engine['status']['restoreStatus'].values():
                        if status['state'] == "in_progress" and status['progress'] > progress:
                            started = True
                            break
                    # Sometimes the restore finishes so quickly that the test
                    # cannot catch the intermediate in_progress status, so an
                    # already-restored backup also counts as started.
                    if engine['status']['lastRestoredBackup'] == backup_name:
                        started = True
                    if started:
                        break
                if started:
                    break
            except Exception as e:
                logging(f"Getting volume {volume_name} engines error: {e}")
            time.sleep(self.retry_interval)
        assert started

    def wait_for_volume_expand_to_size(self, volume_name, expected_size):
        engine = None
        engine_current_size = 0
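
The polling above keys off two engine status fields. As a hedged illustration of how that check behaves, here is a self-contained snippet that evaluates a hand-written engine status dict shaped like the fields read by wait_for_volume_restoration_start; the replica addresses and progress values are made up, and a real Longhorn engine CR carries many more fields than shown here.

# Illustration only: a fabricated engine status with just the fields the new
# helper reads; real Longhorn engine CRs contain much more than this.
sample_engine = {
    "status": {
        "restoreStatus": {
            "tcp://10.42.0.12:10000": {"state": "in_progress", "progress": 35},
            "tcp://10.42.0.13:10000": {"state": "in_progress", "progress": 0},
        },
        "lastRestoredBackup": "",
    },
}

def restoration_started(engine, backup_name, progress=0):
    # Started if any replica reports an in-progress restore beyond the threshold...
    for status in engine["status"]["restoreStatus"].values():
        if status["state"] == "in_progress" and status["progress"] > progress:
            return True
    # ...or if the restore already finished before the intermediate state was seen.
    return engine["status"]["lastRestoredBackup"] == backup_name

assert restoration_started(sample_engine, "backup-0")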
3 changes: 3 additions & 0 deletions e2e/libs/volume/rest.py
@@ -80,6 +80,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
    def wait_for_volume_restoration_completed(self, volume_name):
        return NotImplemented

    def wait_for_volume_restoration_start(self, volume_name):
        return NotImplemented

    def get_endpoint(self, volume_name):
        endpoint = ""
        v = self.get(volume_name)
3 changes: 3 additions & 0 deletions e2e/libs/volume/volume.py
@@ -78,6 +78,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
    def wait_for_volume_restoration_completed(self, volume_name, backup_name):
        self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)

    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)

    def wait_for_volume_expand_to_size(self, volume_name, size):
        return self.volume.wait_for_volume_expand_to_size(volume_name, size)
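
Taken together, the Python files above add one method per layer. As a quick orientation for reviewers, the following is a minimal, self-contained sketch of how the new Robot keyword is expected to dispatch from the keyword library through the Volume facade down to the CRD implementation; the stub classes and their wiring are assumptions made only for this illustration, not the real e2e libraries.

# Hypothetical stand-ins for the real classes in e2e/libs; each stub keeps
# only the method added by this PR, to show the dispatch chain end to end.

class CRDStub:
    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        # The real CRD implementation polls the engine CRs (see crd.py above).
        print(f"polling engines of {volume_name} for restore of {backup_name}")

class VolumeStub:
    def __init__(self):
        self.volume = CRDStub()  # the real facade chooses the CRD or REST strategy

    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)

class VolumeKeywordsStub:
    def __init__(self):
        self.volume = VolumeStub()

    def wait_for_volume_restoration_start(self, volume_name, backup_name):
        # Invoked by the resource keyword
        # "Wait for volume ${volume_id} restoration from backup ${backup_id} start".
        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)

VolumeKeywordsStub().wait_for_volume_restoration_start("volume-1", "backup-0")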
5 changes: 0 additions & 5 deletions e2e/tests/negative/test_backup_listing.robot
@@ -101,11 +101,6 @@ Create pod ${pod_id} mount ${size} GB volume ${volume_id}
    Create pod ${pod_id} using volume ${volume_id}
    Wait for pod ${pod_id} running

Volume ${volume_id} backup ${backup_id} should be able to create
    Create backup ${backup_id} for volume ${volume_id}
    Verify backup list contains no error for volume ${volume_id}
    Verify backup list contains backup ${backup_id} of volume ${volume_id}

Write data to file in deployment 0
    Write 100 MB data to file data in deployment 0
91 changes: 91 additions & 0 deletions e2e/tests/negative/test_dr_volume_node_reboot.robot
@@ -0,0 +1,91 @@
*** Settings ***
Documentation    Test DR volume node reboot
...              https://github.com/longhorn/longhorn/issues/8425

Test Tags    manual    longhorn-8425

Resource    ../keywords/common.resource
Resource    ../keywords/deployment.resource
Resource    ../keywords/workload.resource
Resource    ../keywords/longhorn.resource
Resource    ../keywords/host.resource
Resource    ../keywords/storageclass.resource
Resource    ../keywords/persistentvolumeclaim.resource
Resource    ../keywords/recurringjob.resource
Resource    ../keywords/statefulset.resource
Resource    ../keywords/volume.resource
Resource    ../keywords/snapshot.resource
Resource    ../keywords/backup.resource


Test Setup    Set test environment
Test Teardown    Cleanup test resources

*** Variables ***
${RETRY_COUNT}    400
${LOOP_COUNT}    5
${RETRY_INTERVAL}    1
${DATA_ENGINE}    v1

*** Test Cases ***
DR Volume Node Reboot During Initial Restoration
    [Tags]    manual    longhorn-8425
    [Documentation]    Test DR volume node reboot during initial restoration.
    ...    Create a pod with a Longhorn volume.
    ...    Write data to the volume and get the md5sum.
    ...    Create the 1st backup for the volume.
    ...    Create a DR volume from the backup.
    ...    Wait for the DR volume to start the initial restore,
    ...    then immediately reboot the node that the DR volume is attached to.
    ...    Wait for the DR volume to be detached and then reattached.
    ...    Wait for the DR volume restore to complete after the reattachment.
    ...    Activate the DR volume and check the data md5sum.
    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
    And Attach volume 0
    And Wait for volume 0 healthy
    And Write data 0 to volume 0
    Then Volume 0 backup 0 should be able to create
    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
        Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
        And Wait for volume 1 restoration from backup 0 start
        Then Reboot volume 1 volume node
        And Wait for volume 1 restoration from backup 0 completed
        When Activate DR volume 1
        And Attach volume 1
        And Wait for volume 1 healthy
        Then Check volume 1 data is backup 0
        Then Detach volume 1
        And Delete volume 1
    END

DR Volume Node Reboot During Incremental Restoration
    [Tags]    manual    longhorn-8425
    [Documentation]    Test DR volume node reboot during incremental restoration.
    ...    Create a pod with a Longhorn volume.
    ...    Write data to the volume and get the md5sum.
    ...    Create the 1st backup for the volume.
    ...    Create a DR volume from the backup.
    ...    Wait for the DR volume to complete the initial restore.
    ...    Write more data to the original volume and get the md5sum.
    ...    Create the 2nd backup for the volume.
    ...    Wait for the DR volume incremental restore to be triggered,
    ...    then immediately reboot the node that the DR volume is attached to.
    ...    Wait for the DR volume to be detached and then reattached.
    ...    Wait for the DR volume restore to complete after the reattachment.
    ...    Activate the DR volume and check the data md5sum.
    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
    And Attach volume 0
    And Wait for volume 0 healthy
    And Write data 0 to volume 0
    Then Volume 0 backup 0 should be able to create
    Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
    And Wait for volume 1 restoration from backup 0 completed
    Then Write data 1 to volume 0
    And Volume 0 backup 1 should be able to create
    And Wait for volume 1 restoration from backup 1 start
    Then Reboot volume 1 volume node
    Then Wait for volume 1 restoration from backup 1 completed
    And Activate DR volume 1
    And Attach volume 1
    And Wait for volume 1 healthy
    And Check volume 1 data is backup 1