From 4759be0c50087f585eee50b416ee817a63fe5766 Mon Sep 17 00:00:00 2001
From: Roger Yao
Date: Mon, 11 Nov 2024 00:56:15 +0000
Subject: [PATCH 1/2] test(robot): Add case DR Volume Node Reboot During Restoration

longhorn/longhorn#8425

Signed-off-by: Roger Yao
---
 e2e/keywords/backup.resource                  |  5 ++
 e2e/keywords/volume.resource                  |  5 ++
 e2e/libs/keywords/volume_keywords.py          |  4 ++
 e2e/libs/volume/base.py                       |  4 ++
 e2e/libs/volume/crd.py                        | 24 +++++++
 e2e/libs/volume/rest.py                       |  3 +
 e2e/libs/volume/volume.py                     |  3 +
 e2e/tests/negative/test_backup_listing.robot  |  5 --
 .../negative/test_dr_volume_node_reboot.robot | 72 +++++++++++++++++++
 9 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 e2e/tests/negative/test_dr_volume_node_reboot.robot

diff --git a/e2e/keywords/backup.resource b/e2e/keywords/backup.resource
index e0a14133e6..309a94c821 100644
--- a/e2e/keywords/backup.resource
+++ b/e2e/keywords/backup.resource
@@ -47,3 +47,8 @@ Check volume ${volume_id} data is backup ${backup_id}
 
 Check backup synced from backupstore
     assert_all_backups_before_uninstall_exist    ${backups_before_uninstall}
+
+Volume ${volume_id} backup ${backup_id} should be able to create
+    Create backup ${backup_id} for volume ${volume_id}
+    Verify backup list contains no error for volume ${volume_id}
+    Verify backup list contains backup ${backup_id} of volume ${volume_id}
diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource
index b322b369e0..d17034a009 100644
--- a/e2e/keywords/volume.resource
+++ b/e2e/keywords/volume.resource
@@ -170,6 +170,11 @@ Wait for volume ${volume_id} restoration from backup ${backup_id} completed
     ${backup_name} =    get_backup_name    ${backup_id}
     wait_for_volume_restoration_completed    ${volume_name}    ${backup_name}
 
+Wait for volume ${volume_id} restoration from backup ${backup_id} start
+    ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
+    ${backup_name} =    get_backup_name    ${backup_id}
+    wait_for_volume_restoration_start    ${volume_name}    ${backup_name}
+
 Wait until volume ${volume_id} replica rebuilding started on ${replica_locality}
     ${volume_name} =    generate_name_with_suffix    volume    ${volume_id}
     wait_for_replica_rebuilding_to_start_on_node    ${volume_name}    ${replica_locality}
diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py
index 2ef9c77ac0..572e29e402 100644
--- a/e2e/libs/keywords/volume_keywords.py
+++ b/e2e/libs/keywords/volume_keywords.py
@@ -280,6 +280,10 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         logging(f'Waiting for volume {volume_name} restoration from {backup_name} completed')
         self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        logging(f'Waiting for volume {volume_name} restoration from {backup_name} start')
+        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)
+
     def validate_volume_replicas_anti_affinity(self, volume_name):
         self.volume.validate_volume_replicas_anti_affinity(volume_name)
 
diff --git a/e2e/libs/volume/base.py b/e2e/libs/volume/base.py
index ef94d19b30..c58489e866 100644
--- a/e2e/libs/volume/base.py
+++ b/e2e/libs/volume/base.py
@@ -92,6 +92,10 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         return NotImplemented
 
+    @abstractmethod
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        return NotImplemented
+
     @abstractmethod
     def get_endpoint(self, volume_name):
         return NotImplemented
diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py
index 9598a99b8d..90b0286136 100644
--- a/e2e/libs/volume/crd.py
+++ b/e2e/libs/volume/crd.py
@@ -357,6 +357,30 @@ def wait_for_volume_restoration_completed(self, volume_name, backup_name):
             time.sleep(self.retry_interval)
         assert updated
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name,
+                                          progress=0):
+        started = False
+        for i in range(self.retry_count):
+            try:
+                engines = self.engine.get_engines(volume_name)
+                for engine in engines:
+                    for status in engine['status']['restoreStatus'].values():
+                        if status['state'] == "in_progress" and status['progress'] > progress:
+                            started = True
+                            break
+                    # Sometimes the restore time is pretty short
+                    # and the test may not be able to catch the intermediate status.
+                    if engine['status']['lastRestoredBackup'] == backup_name:
+                        started = True
+                    if started:
+                        break
+                if started:
+                    break
+            except Exception as e:
+                logging(f"Getting volume {volume_name} engines error: {e}")
+            time.sleep(self.retry_interval)
+        assert started
+
     def wait_for_volume_expand_to_size(self, volume_name, expected_size):
         engine = None
         engine_current_size = 0
diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py
index 3d6a4225a3..ab170070a7 100644
--- a/e2e/libs/volume/rest.py
+++ b/e2e/libs/volume/rest.py
@@ -80,6 +80,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name):
         return NotImplemented
 
+    def wait_for_volume_restoration_start(self, volume_name):
+        return NotImplemented
+
     def get_endpoint(self, volume_name):
         endpoint = ""
         v = self.get(volume_name)
diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py
index b039545a45..f9bb1ac7eb 100644
--- a/e2e/libs/volume/volume.py
+++ b/e2e/libs/volume/volume.py
@@ -78,6 +78,9 @@ def wait_for_volume_migration_completed(self, volume_name, node_name):
     def wait_for_volume_restoration_completed(self, volume_name, backup_name):
         self.volume.wait_for_volume_restoration_completed(volume_name, backup_name)
 
+    def wait_for_volume_restoration_start(self, volume_name, backup_name):
+        self.volume.wait_for_volume_restoration_start(volume_name, backup_name)
+
     def wait_for_volume_expand_to_size(self, volume_name, size):
         return self.volume.wait_for_volume_expand_to_size(volume_name, size)
 
diff --git a/e2e/tests/negative/test_backup_listing.robot b/e2e/tests/negative/test_backup_listing.robot
index 0d292c7141..6e0c921b25 100644
--- a/e2e/tests/negative/test_backup_listing.robot
+++ b/e2e/tests/negative/test_backup_listing.robot
@@ -101,11 +101,6 @@ Create pod ${pod_id} mount ${size} GB volume ${volume_id}
     Create pod ${pod_id} using volume ${volume_id}
     Wait for pod ${pod_id} running
 
-Volume ${volume_id} backup ${backup_id} should be able to create
-    Create backup ${backup_id} for volume ${volume_id}
-    Verify backup list contains no error for volume ${volume_id}
-    Verify backup list contains backup ${backup_id} of volume ${volume_id}
-
 Write data to file in deployment 0
     Write 100 MB data to file data in deployment 0
 
diff --git a/e2e/tests/negative/test_dr_volume_node_reboot.robot b/e2e/tests/negative/test_dr_volume_node_reboot.robot
new file mode 100644
index 0000000000..26d164e6f6
--- /dev/null
+++ b/e2e/tests/negative/test_dr_volume_node_reboot.robot
@@ -0,0 +1,72 @@
+*** Settings ***
+Documentation    Test DR volume node reboot
+...    https://github.com/longhorn/longhorn/issues/8425
+
+Test Tags    manual    longhorn-8425
+
+Resource    ../keywords/common.resource
+Resource    ../keywords/deployment.resource
+Resource    ../keywords/workload.resource
+Resource    ../keywords/longhorn.resource
+Resource    ../keywords/host.resource
+Resource    ../keywords/storageclass.resource
+Resource    ../keywords/persistentvolumeclaim.resource
+Resource    ../keywords/recurringjob.resource
+Resource    ../keywords/statefulset.resource
+Resource    ../keywords/volume.resource
+Resource    ../keywords/snapshot.resource
+Resource    ../keywords/backup.resource
+
+
+Test Setup    Set test environment
+Test Teardown    Cleanup test resources
+
+*** Variables ***
+${RETRY_COUNT}    400
+${LOOP_COUNT}    5
+${RETRY_INTERVAL}    1
+${DATA_ENGINE}    v1
+
+*** Test Cases ***
+DR Volume Node Reboot During Initial Restoration
+    [Tags]    manual    longhorn-8425
+    [Documentation]    Test DR volume node reboot
+    ...    during initial restoration
+    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
+    And Attach volume 0
+    And Wait for volume 0 healthy
+    And Write data 0 to volume 0
+    Then Volume 0 backup 0 should be able to create
+    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
+        Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
+        And Wait for volume 1 restoration from backup 0 start
+        Then Reboot volume 1 volume node
+        And Wait for volume 1 restoration from backup 0 completed
+        When Activate DR volume 1
+        And Attach volume 1
+        And Wait for volume 1 healthy
+        Then Check volume 1 data is backup 0
+        Then Detach volume 1
+        And Delete volume 1
+    END
+
+DR Volume Node Reboot During Incremental Restoration
+    [Tags]    manual    longhorn-8425
+    [Documentation]    Test DR volume node reboot
+    ...    during incremental restoration
+    Given Create volume 0 with    dataEngine=${DATA_ENGINE}
+    And Attach volume 0
+    And Wait for volume 0 healthy
+    And Write data 0 to volume 0
+    Then Volume 0 backup 0 should be able to create
+    Then Create DR volume 1 from backup 0    dataEngine=${DATA_ENGINE}
+    And Wait for volume 1 restoration from backup 0 completed
+    Then Write data 1 to volume 0
+    And Volume 0 backup 1 should be able to create
+    And Wait for volume 1 restoration from backup 1 start
+    Then Reboot volume 1 volume node
+    Then Wait for volume 1 restoration from backup 1 completed
+    And Activate DR volume 1
+    And Attach volume 1
+    And Wait for volume 1 healthy
+    And Check volume 1 data is backup 1
\ No newline at end of file

From 548668dd57e5eef2139b7a57b9c5502dc6558eeb Mon Sep 17 00:00:00 2001
From: Roger Yao
Date: Fri, 15 Nov 2024 16:15:59 +0800
Subject: [PATCH 2/2] Remove manual test case dr volume node reboot

longhorn/longhorn#8425

Signed-off-by: Roger Yao
---
 .../node-restart/dr-volume-node-rebooted.md   | 24 -----------------------
 .../negative/test_dr_volume_node_reboot.robot | 27 ++++++++++++++++---
 2 files changed, 23 insertions(+), 28 deletions(-)
 delete mode 100644 docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md

diff --git a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md b/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md
deleted file mode 100644
index 2702e1fa72..0000000000
--- a/docs/content/manual/pre-release/node-not-ready/node-restart/dr-volume-node-rebooted.md
+++ /dev/null
@@ -1,24 +0,0 @@
----
-title: "[#1366](https://github.com/longhorn/longhorn/issues/1366) && [#1328](https://github.com/longhorn/longhorn/issues/1328) The node the DR volume attached to is rebooted"
----
-#### Scenario 1
-1. Create a pod with Longhorn volume.
-2. Write data to the volume and get the md5sum.
-3. Create the 1st backup for the volume.
-4. Create a DR volume from the backup.
-5. Wait for the DR volume starting the initial restore. Then reboot the DR volume attached node immediately.
-6. Wait for the DR volume detached then reattached.
-7. Wait for the DR volume restore complete after the reattachment.
-8. Activate the DR volume and check the data md5sum.
-#### Scenario 2
-1. Create a pod with Longhorn volume.
-2. Write data to the volume and get the md5sum.
-3. Create the 1st backup for the volume.
-4. Create a DR volume from the backup.
-5. Wait for the DR volume to complete the initial restore.
-6. Write more data to the original volume and get the md5sum.
-7. Create the 2nd backup for the volume.
-8. Wait for the DR volume incremental restore getting triggered. Then reboot the DR volume attached node immediately.
-9. Wait for the DR volume detached then reattached.
-10. Wait for the DR volume restore complete after the reattachment.
-11. Activate the DR volume and check the data md5sum.
diff --git a/e2e/tests/negative/test_dr_volume_node_reboot.robot b/e2e/tests/negative/test_dr_volume_node_reboot.robot
index 26d164e6f6..2a8d2c7f30 100644
--- a/e2e/tests/negative/test_dr_volume_node_reboot.robot
+++ b/e2e/tests/negative/test_dr_volume_node_reboot.robot
@@ -30,8 +30,16 @@ ${DATA_ENGINE}    v1
 *** Test Cases ***
 DR Volume Node Reboot During Initial Restoration
     [Tags]    manual    longhorn-8425
-    [Documentation]    Test DR volume node reboot
-    ...    during initial restoration
+    [Documentation]    Test DR volume node reboot during initial restoration
+    ...    Create a pod with Longhorn volume.
+    ...    Write data to the volume and get the md5sum.
+    ...    Create the 1st backup for the volume.
+    ...    Create a DR volume from the backup.
+    ...    Wait for the DR volume starting the initial restore.
+    ...    Then reboot the DR volume attached node immediately.
+    ...    Wait for the DR volume detached then reattached.
+    ...    Wait for the DR volume restore complete after the reattachment.
+    ...    Activate the DR volume and check the data md5sum.
     Given Create volume 0 with    dataEngine=${DATA_ENGINE}
     And Attach volume 0
     And Wait for volume 0 healthy
@@ -52,8 +60,19 @@ DR Volume Node Reboot During Initial Restoration
 
 DR Volume Node Reboot During Incremental Restoration
     [Tags]    manual    longhorn-8425
-    [Documentation]    Test DR volume node reboot
-    ...    during incremental restoration
+    [Documentation]    Test DR volume node reboot during incremental restoration
+    ...    Create a pod with Longhorn volume.
+    ...    Write data to the volume and get the md5sum.
+    ...    Create the 1st backup for the volume.
+    ...    Create a DR volume from the backup.
+    ...    Wait for the DR volume to complete the initial restore.
+    ...    Write more data to the original volume and get the md5sum.
+    ...    Create the 2nd backup for the volume.
+    ...    Wait for the DR volume incremental restore getting triggered.
+    ...    Then reboot the DR volume attached node immediately.
+    ...    Wait for the DR volume detached then reattached.
+    ...    Wait for the DR volume restore complete after the reattachment.
+    ...    Activate the DR volume and check the data md5sum.
     Given Create volume 0 with    dataEngine=${DATA_ENGINE}
     And Attach volume 0
     And Wait for volume 0 healthy