diff --git a/.drone.yml b/.drone.yml index 281a568cbe..ceb6c24470 100644 --- a/.drone.yml +++ b/.drone.yml @@ -23,7 +23,6 @@ steps: image: rancher/dapper:v0.5.3 commands: - dapper - privileged: true volumes: - name: socket path: /var/run/docker.sock @@ -92,7 +91,6 @@ steps: image: rancher/dapper:v0.5.3 commands: - dapper - privileged: true volumes: - name: socket path: /var/run/docker.sock diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..af61c5c1c6 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,11 @@ +#### Which issue(s) this PR fixes: + +Issue # + +#### What this PR does / why we need it: + +#### Special notes for your reviewer: + +#### Additional documentation or context diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7cc4476071..416ddc1047 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -7,7 +7,7 @@ on: jobs: publish: - runs-on: [self-hosted, python3.8] + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v2 diff --git a/build_engine_test_images/Dockerfile.setup b/build_engine_test_images/Dockerfile.setup index feaf3d3811..5a4bacccf5 100644 --- a/build_engine_test_images/Dockerfile.setup +++ b/build_engine_test_images/Dockerfile.setup @@ -15,7 +15,7 @@ RUN wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraf wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssh-client ca-certificates git rsync bash curl jq docker && \ + apk add openssh-client ca-certificates git rsync bash curl jq && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa COPY [".", "$WORKSPACE"] \ No newline at end of file diff --git a/build_engine_test_images/Jenkinsfile b/build_engine_test_images/Jenkinsfile index 82623329b3..d940435469 100644 --- a/build_engine_test_images/Jenkinsfile +++ b/build_engine_test_images/Jenkinsfile @@ -15,12 +15,11 @@ node { usernamePassword(credentialsId: 'DOCKER_CREDS', passwordVariable: 'DOCKER_PASSWORD', usernameVariable: 'DOCKER_USERNAME'), usernamePassword(credentialsId: 'AWS_CREDS', passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY') ]) { - stage('build') { + stage('build') { sh "build_engine_test_images/scripts/build.sh" - sh """ docker run -itd --privileged -v /var/run/docker.sock:/var/run/docker.sock \ - --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ + sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ --env TF_VAR_build_engine_aws_access_key=${AWS_ACCESS_KEY} \ --env TF_VAR_build_engine_aws_secret_key=${AWS_SECRET_KEY} \ --env TF_VAR_docker_id=${DOCKER_USERNAME} \ diff --git a/build_engine_test_images/run.sh b/build_engine_test_images/run.sh index 9377685a9d..fc831c228e 100755 --- a/build_engine_test_images/run.sh +++ b/build_engine_test_images/run.sh @@ -26,30 +26,6 @@ if [[ -z "$TF_VAR_docker_repo" ]]; then exit 1 fi -# if commit_id is empty, we can directly check longhorn-engine:master-head's api version -if [[ -z "${TF_VAR_commit_id}" ]]; then - - docker login -u="${TF_VAR_docker_id}" -p="${TF_VAR_docker_password}" - docker pull longhornio/longhorn-engine:master-head - version=`docker run longhornio/longhorn-engine:master-head longhorn version --client-only` - CLIAPIVersion=`echo $version | jq -r ".clientVersion.cliAPIVersion"` - CLIAPIMinVersion=`echo $version | jq -r ".clientVersion.cliAPIMinVersion"` - 
ControllerAPIVersion=`echo $version | jq -r ".clientVersion.controllerAPIVersion"` - ControllerAPIMinVersion=`echo $version | jq -r ".clientVersion.controllerAPIMinVersion"` - DataFormatVersion=`echo $version | jq -r ".clientVersion.dataFormatVersion"` - DataFormatMinVersion=`echo $version | jq -r ".clientVersion.dataFormatMinVersion"` - echo "latest engine version: ${version}" - - upgrade_image="${TF_VAR_docker_repo}:upgrade-test.$CLIAPIVersion-$CLIAPIMinVersion"\ -".$ControllerAPIVersion-$ControllerAPIMinVersion"\ -".$DataFormatVersion-$DataFormatMinVersion" - - if [[ $(docker manifest inspect "${upgrade_image}") != "" ]]; then - echo "latest engine test images have already published" - exit 0 - fi -fi - trap ./scripts/cleanup.sh EXIT # Build amd64 images diff --git a/docs/content/manual/pre-release/basic-operations/storage-network.md b/docs/content/manual/pre-release/basic-operations/storage-network.md index f1f59642c5..93b29145d1 100644 --- a/docs/content/manual/pre-release/basic-operations/storage-network.md +++ b/docs/content/manual/pre-release/basic-operations/storage-network.md @@ -6,13 +6,17 @@ https://github.com/longhorn/longhorn/issues/2285 ## Test Multus version below v4.0.0 **Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.3.0/test-storage-network/) + **When** Run Longhorn core tests on the environment. -**Then** All the tests should pass. + +**Then** All the tests should pass. ## Related issue: https://github.com/longhorn/longhorn/issues/6953 ## Test Multus version above v4.0.0 **Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.6.0/test-storage-network/) + **When** Run Longhorn core tests on the environment. + **Then** All the tests should pass. diff --git a/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md b/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md index d27f7ec62b..c8426cfee9 100644 --- a/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md +++ b/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md @@ -2,8 +2,10 @@ title: Cluster using customize kubelet root directory --- -1. Set up a cluster using a customized kubelet root directory. - e.g., launching k3s `k3s server --kubelet-arg "root-dir=/var/lib/longhorn-test" &` +1. Set up a cluster using a customized kubelet root directory. + For example, launching k3s: + - Controller: `k3s server --kubelet-arg "root-dir=/var/lib/longhorn-test"` + - Worker: `k3s agent --kubelet-arg "root-dir=/var/lib/longhorn-test"` 2. Install `Longhorn` with env `KUBELET_ROOT_DIR` in `longhorn-driver-deployer` being set to the corresponding value. 3. Launch a pod using Longhorn volumes via StorageClass. Everything should work fine. 4. Delete the pod and the PVC. Everything should be cleaned up. 
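For step 2 above, a minimal install sketch (assuming the Longhorn Helm chart's `csi.kubeletRootDir` value is what sets the `KUBELET_ROOT_DIR` env on `longhorn-driver-deployer`; the path matches the k3s example in step 1):

```bash
# Sketch: install Longhorn with CSI pointed at the customized kubelet root directory.
# Assumption: csi.kubeletRootDir maps to the KUBELET_ROOT_DIR env of longhorn-driver-deployer.
helm repo add longhorn https://charts.longhorn.io && helm repo update
helm install longhorn longhorn/longhorn \
  --namespace longhorn-system --create-namespace \
  --set csi.kubeletRootDir=/var/lib/longhorn-test

# Verify the env propagated before running steps 3 and 4.
kubectl -n longhorn-system get deploy longhorn-driver-deployer \
  -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="KUBELET_ROOT_DIR")].value}'
```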
diff --git a/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md b/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md index e1a8079826..cd7f88356b 100644 --- a/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md +++ b/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md @@ -5,17 +5,22 @@ title: "PVC provisioning with insufficient storage" #### Related Issue: - https://github.com/longhorn/longhorn/issues/4654 - https://github.com/longhorn/longhorn/issues/3529 +- https://github.com/longhorn/longhorn/issues/6461 #### Root Cause Analysis - https://github.com/longhorn/longhorn/issues/4654#issuecomment-1264870672 This case need to be tested on both RWO/RWX volumes -1. Create a PVC with size larger than 8589934591 GiB. +1. Create a PVC with size larger than `8589934591` GiB. - Deployment keep in pending status, RWO/RWX volume will keep in a create -> delete loop. -2. Create a PVC with size <= 8589934591 GiB, but greater than the actual available space size. - - RWO/RWX volume will be created, and volume will have annotation "longhorn.io/volume-scheduling-error": "insufficient storage volume scheduling failure" in it. -3. Create a PVC with size < the actual available space size,Resize the PVC to a not schedulable size +1. Create a PVC with size <= `8589934591` GiB, but greater than the actual available space size. + - RWO/RWX volume will be created, and the associated PV for this volume will have annotation "**longhorn.io/volume-scheduling-error**": "**insufficient storage**" in it. + - We can observe "**Scheduling Failure**" and "**Replica Scheduling Failure**" error messages on the Longhorn UI with the following details + - **Scheduling Failure** + - Replica Scheduling Failure + - Error Message: insufficient storage +1. Create a PVC with size < the actual available space size,Resize the PVC to a not schedulable size - After resize PVC to a not schedulable size, both RWO/RWX were still in scheduling status. We can modify/use https://raw.githubusercontent.com/longhorn/longhorn/master/examples/rwx/rwx-nginx-deployment.yaml to deploy RWO/RWX PVC for this test \ No newline at end of file diff --git a/docs/content/manual/pre-release/ui/_index.md b/docs/content/manual/pre-release/ui/_index.md new file mode 100644 index 0000000000..9ec12c4ba1 --- /dev/null +++ b/docs/content/manual/pre-release/ui/_index.md @@ -0,0 +1,3 @@ +--- +title: UI +--- diff --git a/docs/content/manual/pre-release/ui/ui-sanity-check.md b/docs/content/manual/pre-release/ui/ui-sanity-check.md new file mode 100644 index 0000000000..0c19f20015 --- /dev/null +++ b/docs/content/manual/pre-release/ui/ui-sanity-check.md @@ -0,0 +1,14 @@ +--- +title: ui sanity check +--- + +1. Access Longhorn UI on `Chrome`, `Firefox` and `Safari` latest/stable version. +1. Check the pages. All the text, form, tables should be proper. +1. Verify all the links at the bottom, they shouldn't be broken and redirects to right pages. +1. Check the setting page, all the settings's text, values should be proper. +1. Create `Backing Image`, `volume`, `pv`, `pvc` and `recurring jobs` using UI. +1. Take `volume snapshot`, create `volume backup`, and `system backup` using UI. +1. Restore `Backup` and `system backup` using UI. +1. Check the `events` on dashboard, they should be normal. +1. Check the logs on the volume detail page, there shouldn't be any error. +1. Check the browser's console, there shouldn't be any error. 
\ No newline at end of file diff --git a/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md b/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md index a206f8dd66..e3b5c82451 100644 --- a/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md +++ b/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md @@ -4,7 +4,7 @@ title: Test System Upgrade with New Instance Manager 1. Prepare 3 sets of longhorn-manager and longhorn-instance-manager images. 2. Deploy Longhorn with the 1st set of images. -3. Set `Guaranteed Engine Manager CPU` and `Guaranteed Replica Manager CPU` to 15 and 24, respectively. +3. Set `Guaranteed Instance Manager CPU` to 40, respectively. Then wait for the instance manager recreation. 4. Create and attach a volume to a node (node1). 5. Upgrade the Longhorn system with the 2nd set of images. @@ -13,4 +13,4 @@ title: Test System Upgrade with New Instance Manager 7. Upgrade the Longhorn system with the 3rd set of images. 8. Verify the pods of the 3rd instance manager cannot be launched on node1 since there is no available CPU for the allocation. 9. Detach the volume in the 1st instance manager pod. - Verify the related instance manager pods will be cleaned up and the new instance manager pod can be launched on node1. + Verify the related instance manager pods will be cleaned up and the new instance manager pod can be launched on node1. \ No newline at end of file diff --git a/docs/content/manual/pre-release/v2-volume/_index.md b/docs/content/manual/pre-release/v2-volume/_index.md new file mode 100644 index 0000000000..73a4d68854 --- /dev/null +++ b/docs/content/manual/pre-release/v2-volume/_index.md @@ -0,0 +1,3 @@ +--- +title: v2 volume +--- diff --git a/docs/content/manual/pre-release/v2-volume/sanity-check.md b/docs/content/manual/pre-release/v2-volume/sanity-check.md new file mode 100644 index 0000000000..55041ed418 --- /dev/null +++ b/docs/content/manual/pre-release/v2-volume/sanity-check.md @@ -0,0 +1,14 @@ +--- +title: v2 volume sanity check +--- +## Related doc: +https://longhorn.io/docs/1.6.0/v2-data-engine/features/ + +- Support both amd64 and arm64 +- Volume creation, attachment, detachment and deletion +- Automatic offline replica rebuilding +- [Orphaned replica management](https://github.com/longhorn/longhorn/issues/5827) +- Snapshot creation, deletion and reversion +- Volume backup and restoration +- [Selective v2 Data Engine activation](https://github.com/longhorn/longhorn/issues/7015) +- Upgrade Longhorn from previous version with v2 volume \ No newline at end of file diff --git a/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md new file mode 100644 index 0000000000..ba4f32d956 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md @@ -0,0 +1,43 @@ +--- +title: Test engine version enforcement +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/5842 +https://github.com/longhorn/longhorn/issues/7539 + +## Test step + +**Given** Longhorn v1.4.x cluster running +And create and attach a volume (volume-1) +And upgraded Longhorn to v1.5.x +And create and attach a volume (volume-2) + +**When** upgraded Longhorn to v1.6.0 +**Then** v1.6.0 longhorn-manager Pods should be in crashloop +``` +longhorn-manager-zrf8r 0/1 CrashLoopBackOff 2 (10s ago) 52s +longhorn-manager-zsph2 0/1 CrashLoopBackOff 2 (8s ago) 52s 
+longhorn-manager-grhsf 0/1 CrashLoopBackOff 2 (8s ago) 51s +``` +And should see incompatible version error in longhorn-manager Pod logs +``` +time="2023-08-17T03:03:20Z" level=fatal msg="Error starting manager: failed checking Engine upgarde path: incompatible Engine ei-7fa7c208 client API version: found version 7 is below required minimal version 8" +``` + +**When** downgraded Longhorn to v1.5.x +**Then** Longhorn components should be running + +**When** upgraded v1.4.1 volume (volume-1) engine +And upgraded Longhorn to v1.6.0 +**Then** Longhorn components should be running +And v1.4.x EngineImage state should be deployed and incompatible should be true. +``` +NAME INCOMPATIBLE STATE IMAGE REFCOUNT BUILDDATE AGE +ei-74783864 false deployed longhornio/longhorn-engine:v1.5.1 10 28d 12m +ei-7fa7c208 true deployed longhornio/longhorn-engine:v1.4.1 0 157d 13m +ei-ad420081 false deployed c3y1huang/research:2017-lh-ei 0 44h 24s +``` + +**When** update existing volume/engine/replica custom resourcs `spec.image` with `longhornio/longhorn-engine:v1.4.x` +**Then** should be blocked diff --git a/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md b/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md new file mode 100644 index 0000000000..a5a40de4d8 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md @@ -0,0 +1,18 @@ +--- +title: Test list backup when cluster has node cordoned before Longhorn installation +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/7619 + +## Test step + +**Given** a cluster has 3 worker nodes. +**And** 2 worker nodes are cordoned. +**And** Longhorn is installed. + +**When** Setting up a backup target. + +**Then** no error is observed on the UI Backup page. +**And** Backup custom resources are created if the backup target has existing backups. 
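A minimal sketch of the Given/When steps above, assuming three worker nodes and an S3 backup target (node names, bucket, and secret name are placeholders):

```bash
# Cordon two of the three worker nodes before installing Longhorn.
kubectl cordon <worker-node-2>
kubectl cordon <worker-node-3>

# After installing Longhorn, set up a backup target that already holds backups.
kubectl -n longhorn-system patch settings.longhorn.io backup-target \
  --type merge -p '{"value":"s3://backupbucket@us-east-1/"}'
kubectl -n longhorn-system patch settings.longhorn.io backup-target-credential-secret \
  --type merge -p '{"value":"aws-secret"}'

# Backup custom resources should appear and the UI Backup page should show no error.
kubectl -n longhorn-system get backups.longhorn.io
```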
+ diff --git a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md new file mode 100644 index 0000000000..850c8ea8db --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md @@ -0,0 +1,47 @@ +--- +title: Test PVC Name and Namespace included in the volume metrics +--- + +## Related issues + +- https://github.com/longhorn/longhorn/issues/5297 +- https://github.com/longhorn/longhorn-manager/pull/2284 + +## Test step + +**Given** created 2 volumes (volume-1, volume-2) + +**When** PVC created for volume (volume-1) +And attached volumes (volume-1, volume-2) + +**Then** metrics with `longhorn_volume_` prefix should include `pvc="volume-1"` + +```bash +curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-1 +longhorn_volume_actual_size_bytes{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_capacity_bytes{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1.073741824e+09 +longhorn_volume_read_iops{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_read_latency{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_read_throughput{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_robustness{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1 +longhorn_volume_state{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 2 +longhorn_volume_write_iops{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_write_latency{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_write_throughput{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +``` + +And metrics with `longhorn_volume_` prefix should include `pvc=""` for (volume-2) + +```bash +> curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-2 +longhorn_volume_actual_size_bytes{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_capacity_bytes{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 1.073741824e+09 +longhorn_volume_read_iops{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_read_latency{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_read_throughput{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_robustness{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 1 +longhorn_volume_state{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 2 +longhorn_volume_write_iops{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_write_latency{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_write_throughput{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +``` diff --git a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md deleted file mode 100644 index 340474634b..0000000000 --- 
a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Test PVC Name included in the volume metrics ---- - -## Related issue -https://github.com/longhorn/longhorn/issues/5297 - -## Test step - -**Given** created 2 volumes (volume-1, volume-2) - -**When** PVC created for volume (volume-1) -And attached volumes (volume-1, volume-2) - -**Then** metrics with `longhorn_volume_` prefix should include `pvc="volume-1"` -```bash -curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-1 -longhorn_volume_actual_size_bytes{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_capacity_bytes{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1.073741824e+09 -longhorn_volume_read_iops{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_read_latency{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_read_throughput{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_robustness{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1 -longhorn_volume_state{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 2 -longhorn_volume_write_iops{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_write_latency{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_write_throughput{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -``` -And metrics with `longhorn_volume_` prefix should include `pvc=""` for (volume-2) -```bash -> curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-2 -longhorn_volume_actual_size_bytes{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_capacity_bytes{node="ip-10-0-2-151",pvc="",volume="volume-2"} 1.073741824e+09 -longhorn_volume_read_iops{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_read_latency{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_read_throughput{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_robustness{node="ip-10-0-2-151",pvc="",volume="volume-2"} 1 -longhorn_volume_state{node="ip-10-0-2-151",pvc="",volume="volume-2"} 2 -longhorn_volume_write_iops{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_write_latency{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_write_throughput{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -``` \ No newline at end of file diff --git a/docs/content/manual/release-specific/v1.6.0/test-storage-network.md b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md index 546d0c8410..1d10cf5654 100644 --- a/docs/content/manual/release-specific/v1.6.0/test-storage-network.md +++ b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md @@ -38,6 +38,8 @@ https://github.com/longhorn/longhorn/issues/6953 ### Setup instances +#### Thin Plugin + **Given** K3s K8s cluster installed on EC2 instances. *And* Deploy Multus DaemonSet on the control-plane node. @@ -177,6 +179,140 @@ kubectl apply -f nad-192-168-0-0.yaml ``` +#### Thick Plugin + +**Given** K3s K8s cluster installed on EC2 instances. + +*And* (For K3s) Establish symbolic links on all cluster nodes. + ```bash + mkdir /etc/cni + mkdir /opt/cni + ln -s /var/lib/rancher/k3s/agent/etc/cni/net.d /etc/cni/ + ln -s /var/lib/rancher/k3s/data/current/bin /opt/cni/ + ``` + +*And* Deploy Multus DaemonSet on the control-plane node. +- Download YAML. 
+ ``` + curl -O https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/v4.0.2/deployments/multus-daemonset-thick.yml + ``` +- Edit YAML. + ``` + diff --git a/deployments/multus-daemonset-thick.yml b/deployments/multus-daemonset-thick.yml + index eaa92ece..c895651b 100644 + --- a/deployments/multus-daemonset-thick.yml + +++ b/deployments/multus-daemonset-thick.yml + @@ -152,7 +152,7 @@ spec: + serviceAccountName: multus + containers: + - name: kube-multus + - image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick + + image: ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2-thick + command: [ "/usr/src/multus-cni/bin/multus-daemon" ] + resources: + requests: + @@ -183,9 +183,11 @@ spec: + - name: hostroot + mountPath: /hostroot + mountPropagation: HostToContainer + + - name: cnibin + + mountPath: /opt/cni/bin + initContainers: + - name: install-multus-binary + - image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick + + image: ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2-thick + command: + - "cp" + - "/usr/src/multus-cni/bin/multus-shim" + ``` +- Apply YAML to K8s cluster. + ``` + kubectl apply -f multus-daemonset-thick.yml + ``` + +*And* Download `ipvlan` and put to K3s binaries path to all cluster nodes. +``` +curl -OL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz +tar -zxvf cni-plugins-linux-amd64-v1.3.0.tgz +cp ipvlan /var/lib/rancher/k3s/data/current/bin/ +``` + +*And* Setup flannels on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" +NODES=(${N1} ${N2} ${N3}) + +STORAGE_NETWORK_PREFIX="192.168" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +count=1 +for n in "${NODES[@]}"; do + [[ ${ETH1_IP} != $n ]] && ((count=count+1)) && continue + + NET=$count + break +done + +cat << EOF > /run/flannel/multus-subnet-${STORAGE_NETWORK_PREFIX}.0.0.env +FLANNEL_NETWORK=${STORAGE_NETWORK_PREFIX}.0.0/16 +FLANNEL_SUBNET=${STORAGE_NETWORK_PREFIX}.${NET}.0/24 +FLANNEL_MTU=1472 +FLANNEL_IPMASQ=true +EOF +``` +*And* Setup routes on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" + +STORAGE_NETWORK_PREFIX="192.168" +ACTION="add" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +[[ ${ETH1_IP} != ${N1} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.1.0/24 via ${N1} dev eth1 +[[ ${ETH1_IP} != ${N2} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.2.0/24 via ${N2} dev eth1 +[[ ${ETH1_IP} != ${N3} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.3.0/24 via ${N3} dev eth1 +``` + +*And* Deploy `NetworkAttachmentDefinition`. +``` +cat << EOF > nad-192-168-0-0.yaml +apiVersion: "k8s.cni.cncf.io/v1" +kind: NetworkAttachmentDefinition +metadata: + name: demo-192-168-0-0 + namespace: kube-system + #namespace: longhorn-system +spec: + config: '{ + "cniVersion": "0.3.1", + "type": "flannel", + "subnetFile": "/run/flannel/multus-subnet-192.168.0.0.env", + "dataDir": "/var/lib/cni/multus-subnet-192.168.0.0", + "delegate": { + "type": "ipvlan", + "master": "eth1", + "mode": "l3", + "capabilities": { + "ips": true + } + }, + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" + } + }' +EOF +kubectl apply -f nad-192-168-0-0.yaml +``` + + ### Test storage network **Given** Longhorn deployed. 
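A minimal sketch of applying the storage network, assuming Longhorn's `storage-network` setting takes `<namespace>/<NetworkAttachmentDefinition name>` for the NAD deployed above and that all volumes are detached when it is changed:

```bash
# Point Longhorn at the NetworkAttachmentDefinition created above.
kubectl -n longhorn-system patch settings.longhorn.io storage-network \
  --type merge -p '{"value":"kube-system/demo-192-168-0-0"}'

# Instance manager pods are recreated; each should report a second interface on the
# 192.168.<n>.0/24 storage subnets via the Multus network-status annotation.
kubectl -n longhorn-system get pods -l longhorn.io/component=instance-manager \
  -o yaml | grep -A 3 "k8s.v1.cni.cncf.io/network-status"
```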
diff --git a/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md new file mode 100644 index 0000000000..7e6ce16a11 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md @@ -0,0 +1,13 @@ +--- +title: Test Support Bundle Should Include Kubelet Log When On K3s Cluster +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/7121 + +## Test + +**Given** Longhorn installed on K3s cluster +**When** generated support-bundle +**Then** should have worker node kubelet logs in `k3s-agent-service.log` +**And** should have control-plan node kubelet log in `k3s-service.log` (if Longhorn is deployed on control-plan node) diff --git a/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md new file mode 100644 index 0000000000..14c23cf5ec --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md @@ -0,0 +1,13 @@ +--- +title: Test Support Bundle Metadata File +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/6997 + +## Test + +**Given** Longhorn installed on SUSE Linux +**When** generated support-bundle with description and issue URL +**Then** `issuedescription` has the description in the metadata.yaml +**And** `issueurl` has the issue URL in the metadata.yaml diff --git a/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md new file mode 100644 index 0000000000..0ae906383d --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md @@ -0,0 +1,105 @@ +--- +title: Test upgrade responder should collect SPDK related info +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/6033 + +## Test step + +### Prerequisite + +**Given** Patch build and deploy Longhorn. +``` +diff --git a/controller/setting_controller.go b/controller/setting_controller.go +index de77b7246..ac6263ac5 100644 +--- a/controller/setting_controller.go ++++ b/controller/setting_controller.go +@@ -49,7 +49,7 @@ const ( + var ( + upgradeCheckInterval = time.Hour + settingControllerResyncPeriod = time.Hour +- checkUpgradeURL = "https://longhorn-upgrade-responder.rancher.io/v1/checkupgrade" ++ checkUpgradeURL = "http://longhorn-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade" + ) + + type SettingController struct { +``` +> Match the checkUpgradeURL with the application name: `http://-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade` + +**And** Set setting `v2-data-engine` to `true`. +**And** [Add two block-type Disks in Longhorn Nodes](https://longhorn.io/docs/1.5.3/spdk/quick-start/#add-block-type-disks-in-longhorn-nodes). + +#### Test Collecting Longhorn Disk Type + +**Given** [Prerequisite](#prerequisite). +**And** [Deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** `longhorn_disk_block_Count` should exist the influxDB database. + `longhorn_disk_filesystem_Count` should exist the influxDB database. 
+```bash +> app_name="longhorn" +> influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SHOW FIELD KEYS FROM upgrade_request' -database="${app_name}_upgrade_responder" | grep longhorn_disk +longhorn_disk_block_count float +longhorn_disk_filesystem_count float +``` + +**And** the value in `longhorn_disk_filesystem_Count` should equal to the number of volume using the V1 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_disk_filesystem_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_disk_filesystem_count +---- ------------------------------ +1702351841122419036 1 +1702351841563938125 1 +1702351842436864452 1 +``` +**And** the value in `longhorn_disk_block_Count` should equal to the number of volume using the V2 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_disk_block_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_disk_block_count +---- ------------------------- +1702351841122419036 2 +1702351841563938125 2 +1702351842436864452 2 +``` + +#### Test Collecting Volume Backend Store Driver + +**Given** [Prerequisite](#prerequisite). +**And** Create one volume using V1 engine. + Create two volume using V2 engine. +**And** [Deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** `longhorn_volume_backend_store_driver_v1_count` should exist the influxDB database. + `longhorn_volume_backend_store_driver_v2_count` should exist the influxDB database. +```bash +> app_name="longhorn" +> influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SHOW FIELD KEYS FROM upgrade_request' -database="${app_name}_upgrade_responder" | grep longhorn_volume_backend_store_driver +longhorn_volume_backend_store_driver_v1_count float +longhorn_volume_backend_store_driver_v2_count float +``` + +**And** the value in `longhorn_volume_backend_store_driver_v1_count` should equal to the number of volume using the V1 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_backend_store_driver_v1_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_volume_backend_store_driver_v1_count +---- --------------------------------------------- +1702351841122419036 3 +``` +**And** the value in `longhorn_volume_backend_store_driver_v2_count` should equal to the number of volume using the V2 engine. 
+```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_backend_store_driver_v2_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_volume_backend_store_driver_v2_count +---- --------------------------------------------- +1702351841122419036 2 +``` diff --git a/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md new file mode 100644 index 0000000000..34de7afbad --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md @@ -0,0 +1,81 @@ +--- +Test upgrade-responder: Collecting Average Sizes for V1 Volumes Only +--- + +## Related issues + +- https://github.com/longhorn/longhorn/issues/7380 + +## Test step + +**Given** Patch build and deploy Longhorn. +``` +diff --git a/controller/setting_controller.go b/controller/setting_controller.go +index de77b7246..ac6263ac5 100644 +--- a/controller/setting_controller.go ++++ b/controller/setting_controller.go +@@ -49,7 +49,7 @@ const ( + var ( + upgradeCheckInterval = time.Hour + settingControllerResyncPeriod = time.Hour +- checkUpgradeURL = "https://longhorn-upgrade-responder.rancher.io/v1/checkupgrade" ++ checkUpgradeURL = "http://longhorn-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade" + ) + + type SettingController struct { +``` +> Match the checkUpgradeURL with the application name: `http://-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade` + +**And** setting `v2-data-engine` value is `true`. +**And** add a block disk to cluster nodes. +**And** [deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** create 50 mi volume `lhv-v1` using v1 data engine. +**And** create 50 mi volume `lhv-v2` using v2 data engine. +**And** attach volume `lhv-v1` and write some data. +**And** attach volume `lhv-v2` and write some data. +**And** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** the value of field `longhorn_volume_average_size_bytes` in the influxdb should equal to the average size of all v1 volumes (excluding v2 volumes). +**And** the value of field `longhorn_volume_average_actual_size_bytes` in the influxdb should be equal or simular to the average actual size of all v1 volumes (excluding v2 volumes). +> It's OK for the actual size to be slightly off due to ongoing workload activities, such as data writing by the upgrade-responder. +```bash +# Get the sizes in the influxdb. +# +# Sample: +# > name: upgrade_request +# time longhorn_volume_average_actual_size_bytes longhorn_volume_average_size_bytes +# ---- ----------------------------------------- ---------------------------------- +# 1703045996398941914 73269248 1449132032 +# 1703046063248379696 73284266 1449132032 +app_name="longhorn" +influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_average_actual_size_bytes", "longhorn_volume_average_size_bytes" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +``` + +```bash +# Get the sizes from Longhorn volumes. 
+ +v1_volume_count=$(kubectl get volumes -n longhorn-system -o=jsonpath='{range .items[*]}{.spec.backendStoreDriver}{"\n"}{end}' | grep -c 'v1') +echo "Number of V1 volumes: $v1_volume_count" + +# Get the expected average size. +# > Total size: 4347396096 +# > Average size: 1449132032 +total_size=$(kubectl get volumes -n longhorn-system -o=json | jq -r '[.items[] | select(.spec.backendStoreDriver != "v2") | .spec.size | tonumber] | add') +echo "Total size: $total_size" + +average_size=$(echo "scale=0; $total_size / $v1_volume_count" | bc) +echo "Average size: $average_size" + +# Get the expected average actual size. +# +# Sample: +# > Total actualSize: 220368896 +# > Average actual size: 73456298 +total_actual_size=$(kubectl get volumes -n longhorn-system -o=json | jq -r '[.items[] | select(.spec.backendStoreDriver != "v2") | .status.actualSize | tonumber] | add') +echo "Total actualSize: $total_actual_size" + +average_total_actual_size=$(echo "scale=0; $total_actual_size / $v1_volume_count" | bc) +echo "Average actual size: $average_total_actual_size" +``` diff --git a/e2e/keywords/common.resource b/e2e/keywords/common.resource index 6ccb7ec41a..8a3795bea1 100644 --- a/e2e/keywords/common.resource +++ b/e2e/keywords/common.resource @@ -2,7 +2,7 @@ Documentation Common keywords Library ../libs/keywords/common_keywords.py -Library ../libs/keywords/node_keywords.py +Library ../libs/keywords/stress_keywords.py Library ../libs/keywords/volume_keywords.py Library ../libs/keywords/recurring_job_keywords.py Library ../libs/keywords/workload_keywords.py @@ -24,6 +24,8 @@ Set test environment Set Test Variable ${deployment_list} @{statefulset_list} = Create List Set Test Variable ${statefulset_list} + @{persistentvolumeclaim_list} = Create List + Set Test Variable ${persistentvolumeclaim_list} setup_control_plane_network_latency set_backupstore diff --git a/e2e/keywords/node.resource b/e2e/keywords/node.resource index 59bb4cb61f..0dc25d63b8 100644 --- a/e2e/keywords/node.resource +++ b/e2e/keywords/node.resource @@ -66,9 +66,3 @@ Restart cluster FOR ${statefulset} IN @{statefulset_list} wait_for_workload_pod_stable ${statefulset} END - -During replica rebuilding, stress volume node cpu - stress_node_cpu_by_volume ${volume_name} - -During replica rebuilding, stress volume node memory - stress_node_memory_by_volume ${volume_name} diff --git a/e2e/keywords/stress.resource b/e2e/keywords/stress.resource new file mode 100644 index 0000000000..d4a99230fc --- /dev/null +++ b/e2e/keywords/stress.resource @@ -0,0 +1,17 @@ +*** Settings *** +Documentation Stress Node Keywords + +Library ../libs/keywords/stress_keywords.py + +*** Keywords *** +Stress the CPU of all ${role} nodes + stress_node_cpu_by_role ${role} + +Stress the CPU of all volume nodes + stress_node_cpu_by_volumes ${volume_list} + +Stress the memory of all ${role} nodes + stress_node_memory_by_role ${role} + +Stress the memory of all volume nodes + stress_node_memory_by_volumes ${volume_list} diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource index 47815ced3e..723132f019 100644 --- a/e2e/keywords/volume.resource +++ b/e2e/keywords/volume.resource @@ -18,6 +18,12 @@ Create volume ${idx} with ${size} GB and ${replica_count} replicas attach_volume ${volume_name} Insert Into List ${volume_list} ${idx} ${volume_name} +Attach volume to node + attach_volume ${volume_name} + +Detach volume from node + detach_volume ${volume_name} + Write data to the volume ${volume_data_checksum} = write_volume_random_data ${volume_name} 2048 Set 
Test Variable ${volume_data_checksum} @@ -48,7 +54,7 @@ Wait until replica ${replica_0} rebuilt, delete replica ${replica_2} delete_replica ${volume_name} ${replica_2} Check data is intact - check_data ${volume_name} ${volume_data_checksum} + check_data_checksum ${volume_name} ${volume_data_checksum} Check volume ${idx} works ${volume_data_checksum} = write_volume_random_data ${volume_list}[${idx}] 1024 @@ -73,4 +79,4 @@ Wait until replica on replica node rebuilt Wait for volume of statefulset ${idx} healthy ${volume_name} = get_workload_volume_name ${statefulset_list}[${idx}] - wait_for_volume_healthy ${volume_name} \ No newline at end of file + wait_for_volume_healthy ${volume_name} diff --git a/e2e/keywords/workload.resource b/e2e/keywords/workload.resource index 6d8b91d6d0..8bbba38b96 100644 --- a/e2e/keywords/workload.resource +++ b/e2e/keywords/workload.resource @@ -3,6 +3,7 @@ Documentation Workload Keywords Library Collections Library ../libs/keywords/workload_keywords.py +Library ../libs/keywords/persistent_volume_claim_keywords.py *** Keywords *** Create deployment ${idx} with ${volume_type} volume @@ -13,6 +14,25 @@ Create statefulset ${idx} with ${volume_type} volume ${statefulset_name} = create_statefulset ${volume_type} Insert Into List ${statefulset_list} ${idx} ${statefulset_name} + ${volume_name} = get_workload_volume_name ${statefulset_name} + Insert Into List ${volume_list} ${idx} ${volume_name} + + ${pvc_name} = get_workload_pvc_name ${statefulset_name} + Insert Into List ${persistentvolumeclaim_list} ${idx} ${pvc_name} + +Scale down statefulset ${idx} to detach volume + ${statefulset} = get_statefulset ${statefulset_list}[${idx}] + ${scale_up_replica_count} = Set Variable ${statefulset.spec.replicas} + Set Test Variable ${scale_up_replica_count} + + scale_statefulset ${statefulset_list}[${idx}] 0 + wait_for_volume_detached ${volume_list}[${idx}] + +Scale up statefulset ${idx} to attach volume + scale_statefulset ${statefulset_list}[${idx}] ${scale_up_replica_count} + wait_for_volume_healthy ${volume_list}[${idx}] + wait_for_statefulset_replicas_ready ${statefulset_list}[${idx}] ${scale_up_replica_count} + Create deployment ${idx} with ${volume_type} and ${option} volume ${deployment_name} = create_deployment ${volume_type} ${option} Insert Into List ${deployment_list} ${idx} ${deployment_name} @@ -21,6 +41,15 @@ Create statefulset ${idx} with ${volume_type} and ${option} volume ${statefulset_name} = create_statefulset ${volume_type} ${option} Insert Into List ${statefulset_list} ${idx} ${statefulset_name} +Expand statefulset ${idx} volume by ${size} MiB + ${expected_size} = expand_pvc_size_by_mib ${persistentvolumeclaim_list}[${idx}] ${size} + Set Test Variable ${expected_size} + +Write ${size} MB data to statefulset ${idx} + ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] + ${pod_data_checksum} = write_pod_random_data ${pod_name} ${size} + Insert Into List ${data_checksum_list} ${idx} ${pod_data_checksum} + Keep writing data to deployment ${idx} ${pod_name} = get_workload_pod_name ${deployment_list}[${idx}] keep_writing_pod_data ${pod_name} @@ -32,12 +61,24 @@ Keep writing data to statefulset ${idx} Check deployment ${idx} works ${pod_name} = get_workload_pod_name ${deployment_list}[${idx}] ${pod_data_checksum} = write_pod_random_data ${pod_name} 1024 - check_pod_data ${pod_name} ${pod_data_checksum} + check_pod_data_checksum ${pod_name} ${pod_data_checksum} Check statefulset ${idx} works ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] 
${pod_data_checksum} = write_pod_random_data ${pod_name} 1024 - check_pod_data ${pod_name} ${pod_data_checksum} + check_pod_data_checksum ${pod_name} ${pod_data_checksum} + +Check statefulset ${idx} data is intact + ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] + ${expected_data_checksum} = Get From List ${data_checksum_list} ${idx} + check_pod_data_checksum ${pod_name} ${expected_data_checksum} + +Wait for statefulset ${idx} volume size expanded + wait_for_volume_expand_to_size ${volume_list}[${idx}] ${expected_size} + +Wait for statefulset ${idx} volume detached + wait_for_volume_detached ${volume_list}[${idx}] Wait for statefulset ${idx} stable wait_for_workload_pod_stable ${statefulset_list}[${idx}] + diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py index 5b4ddef80b..4af52eea84 100644 --- a/e2e/libs/engine/crd.py +++ b/e2e/libs/engine/crd.py @@ -1,13 +1,13 @@ import logging -from engine.base import Base +from kubernetes import client -from utils.common_utils import k8s_cr_api +from engine.base import Base class CRD(Base): def __init__(self): - self.cr_api = k8s_cr_api() + self.obj_api = client.CustomObjectsApi() def get_engine(self, volume_name, node_name): if volume_name == "" or node_name == "": @@ -22,14 +22,22 @@ def get_engine(self, volume_name, node_name): if node_name != "": label_selector.append(f"longhornnode={node_name}") - api_response = self.cr_api.list_namespaced_custom_object( + api_response = self.obj_api.list_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", plural="engines", label_selector=",".join(label_selector) ) - return api_response + + if api_response == "" or api_response is None: + raise Exception(f"failed to get the volume {volume_name} engine") + + engines = api_response["items"] + if len(engines) == 0: + logging.warning(f"cannot get the volume {volume_name} engines") + + return engines def delete_engine(self, volume_name, node_name): if volume_name == "" or node_name == "": @@ -38,17 +46,9 @@ def delete_engine(self, volume_name, node_name): logging.info( f"delete the volume {volume_name} on node {node_name} engine") - resp = self.get_engine(volume_name, node_name) - assert resp != "", "failed to get engines" - - engines = resp['items'] - if len(engines) == 0: - logging.warning("cannot find engines") - return - - for engine in engines: + for engine in self.get_engine(volume_name, node_name): engine_name = engine['metadata']['name'] - self.cr_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", diff --git a/e2e/libs/engine/engine.py b/e2e/libs/engine/engine.py index 2003712917..e0b8e9654e 100644 --- a/e2e/libs/engine/engine.py +++ b/e2e/libs/engine/engine.py @@ -3,6 +3,8 @@ from strategy import LonghornOperationStrategy +from utility.utility import logging + class Engine(Base): @@ -15,6 +17,13 @@ def __init__(self): def get_engine(self, volume_name, node_name): return self.engine.get_engine(volume_name, node_name) + def get_engine_by_volume(self, volume): + engines = self.engine.get_engine(volume["metadata"]["name"], "") + assert len(engines) == 1, \ + f"Expected exactly one engine but found {len(engines)}" + + return engines[0] + # delete engines, if input parameters are empty then will delete all def delete_engine(self, volume_name="", node_name=""): return self.engine.delete_engine(volume_name, node_name) @@ -22,17 +31,8 @@ def delete_engine(self, volume_name="", node_name=""): def 
get_engine_state(self, volume_name, node_name): logging(f"Getting the volume {volume_name} engine on the node {node_name} state") - resp = self.get_engine(volume_name, node_name) - if resp == "" or resp is None: - raise Exception(f"failed to get the volume {volume_name} engine") - - engines = resp["items"] - if len(engines) == 0: - logging.warning(f"cannot get the volume {volume_name} engines") - return - engines_states = {} - for engine in engines: + for engine in self.engine.get_engine(volume_name, node_name): engine_name = engine["metadata"]["name"] engine_state = engine['status']['currentState'] engines_states[engine_name] = engine_state diff --git a/e2e/libs/keywords/kubelet_keywords.py b/e2e/libs/keywords/kubelet_keywords.py index 55c8e6cef6..58f33dec59 100644 --- a/e2e/libs/keywords/kubelet_keywords.py +++ b/e2e/libs/keywords/kubelet_keywords.py @@ -1,5 +1,6 @@ from kubelet.kubelet import restart_kubelet + class kubelet_keywords: def restart_kubelet(self, node_name, stop_time_in_sec): diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index c41977bbeb..fc23bb45c5 100644 --- a/e2e/libs/keywords/node_keywords.py +++ b/e2e/libs/keywords/node_keywords.py @@ -1,9 +1,8 @@ from robot.libraries.BuiltIn import BuiltIn from node import Node -from node import Stress +from node.utility import get_node_by_index -from utility.utility import get_node from utility.utility import wait_for_all_instance_manager_running @@ -11,7 +10,6 @@ class node_keywords: def __init__(self): self.node = Node() - self.stress = Stress() def reboot_volume_node(self, volume_name): volume_keywords = BuiltIn().get_library_instance('volume_keywords') @@ -24,7 +22,7 @@ def reboot_replica_node(self, volume_name): self.node.reboot_node(replica_node) def reboot_node_by_index(self, idx, power_off_time_in_min=1): - node_name = get_node(idx) + node_name = get_node_by_index(idx) self.node.reboot_node(node_name, int(power_off_time_in_min) * 60) def reboot_all_worker_nodes(self, power_off_time_in_min=1): @@ -38,16 +36,3 @@ def reboot_node_by_name(self, node_name, power_off_time_in_min=1): def wait_for_all_instance_manager_running(self): wait_for_all_instance_manager_running() - - def cleanup_stress_helper(self): - self.stress.cleanup() - - def stress_node_cpu_by_volume(self, volume_name): - volume_keywords = BuiltIn().get_library_instance('volume_keywords') - volume_node = volume_keywords.get_volume_node(volume_name) - self.stress.cpu([volume_node]) - - def stress_node_memory_by_volume(self, volume_name): - volume_keywords = BuiltIn().get_library_instance('volume_keywords') - volume_node = volume_keywords.get_volume_node(volume_name) - self.stress.memory([volume_node]) diff --git a/e2e/libs/keywords/persistent_volume_claim_keywords.py b/e2e/libs/keywords/persistent_volume_claim_keywords.py new file mode 100644 index 0000000000..5bef238a64 --- /dev/null +++ b/e2e/libs/keywords/persistent_volume_claim_keywords.py @@ -0,0 +1,16 @@ +from persistent_volume_claim import PersistentVolumeClaim + +from utility.utility import logging + +from volume.constant import MEBIBYTE + + +class persistent_volume_claim_keywords: + + def __init__(self): + self.pvc = PersistentVolumeClaim() + + def expand_pvc_size_by_mib(self, claim_name, size_in_mib): + logging(f'Expanding PVC {claim_name} by {size_in_mib} MiB') + size_in_byte = int(size_in_mib) * MEBIBYTE + return self.pvc.expand(claim_name, size_in_byte) diff --git a/e2e/libs/keywords/recurring_job_keywords.py b/e2e/libs/keywords/recurring_job_keywords.py index 
999ee40794..148f92dd43 100644 --- a/e2e/libs/keywords/recurring_job_keywords.py +++ b/e2e/libs/keywords/recurring_job_keywords.py @@ -8,7 +8,6 @@ class recurring_job_keywords: def __init__(self): self.recurring_job = RecurringJob() - def create_snapshot_recurring_job_for_volume(self, volume_name): job_name = volume_name + '-snap' self.recurring_job.create(job_name, task="snapshot") @@ -16,7 +15,6 @@ def create_snapshot_recurring_job_for_volume(self, volume_name): self.recurring_job.get(job_name) logging(f'Created recurring job {job_name} for volume {volume_name}') - def create_backup_recurring_job_for_volume(self, volume_name): job_name = volume_name + '-bak' self.recurring_job.create(job_name, task="backup") @@ -24,10 +22,8 @@ def create_backup_recurring_job_for_volume(self, volume_name): self.recurring_job.get(job_name) logging(f'Created recurring job {job_name} for volume {volume_name}') - def check_recurring_jobs_work(self, volume_name): self.recurring_job.check_jobs_work(volume_name) - def cleanup_recurring_jobs(self, volume_names): self.recurring_job.cleanup(volume_names) diff --git a/e2e/libs/keywords/stress_keywords.py b/e2e/libs/keywords/stress_keywords.py new file mode 100644 index 0000000000..f9b9928d44 --- /dev/null +++ b/e2e/libs/keywords/stress_keywords.py @@ -0,0 +1,26 @@ +from robot.libraries.BuiltIn import BuiltIn + +from node import Stress +from node.utility import list_node_names_by_role +from node.utility import list_node_names_by_volumes + + +class stress_keywords: + + def __init__(self): + self.stress = Stress() + + def cleanup_stress_helper(self): + self.stress.cleanup() + + def stress_node_cpu_by_role(self, role): + self.stress.cpu(list_node_names_by_role(role)) + + def stress_node_cpu_by_volumes(self, volume_names): + self.stress.cpu(list_node_names_by_volumes(volume_names)) + + def stress_node_memory_by_role(self, role): + self.stress.memory(list_node_names_by_role(role)) + + def stress_node_memory_by_volumes(self, volume_names): + self.stress.memory(list_node_names_by_volumes(volume_names)) diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index 192b1a303f..813a0fddbd 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -1,7 +1,10 @@ -from utility.utility import logging +from node.utility import get_node_by_index +from node.utility import list_node_names_by_role + from utility.utility import generate_volume_name -from utility.utility import get_node, list_nodes -from utility.utility import get_test_pod_running_node, get_test_pod_not_running_node +from utility.utility import get_test_pod_not_running_node +from utility.utility import get_test_pod_running_node +from utility.utility import logging from volume import Volume @@ -11,68 +14,65 @@ class volume_keywords: def __init__(self): self.volume = Volume() - def create_volume(self, size, replica_count): volume_name = generate_volume_name() + logging(f'Creating volume {volume_name}') self.volume.create(volume_name, size, replica_count) - logging(f'Created volume {volume_name}') return volume_name - def attach_volume(self, volume_name): attach_node = get_test_pod_not_running_node() - logging(f'Attached volume {volume_name} to {attach_node}') + logging(f'Attaching volume {volume_name} to {attach_node}') self.volume.attach(volume_name, attach_node) + def detach_volume(self, volume_name): + logging(f'Detaching volume {volume_name}') + self.volume.detach(volume_name) + + def wait_for_volume_expand_to_size(self, volume_name, size): + 
logging(f'Waiting for volume {volume_name} expand to {size}') + return self.volume.wait_for_volume_expand_to_size(volume_name, size) def get_volume_node(self, volume_name): volume = self.volume.get(volume_name) return volume['spec']['nodeID'] - # return volume.controllers[0].hostId - def get_replica_node(self, volume_name): - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") volume_node = self.get_volume_node(volume_name) test_pod_running_node = get_test_pod_running_node() - for node in nodes: - if node != volume_node and node != test_pod_running_node: - return node - + for worker_node in worker_nodes: + if worker_node != volume_node and worker_node != test_pod_running_node: + return worker_node def write_volume_random_data(self, volume_name, size_in_mb): return self.volume.write_random_data(volume_name, size_in_mb) - def keep_writing_data(self, volume_name): self.volume.keep_writing_data(volume_name) - - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): logging(f"Checking volume {volume_name} data with checksum {checksum}") - self.volume.check_data(volume_name, checksum) - + self.volume.check_data_checksum(volume_name, checksum) def delete_replica(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Deleting volume {volume_name}'s replica on node {replica_node}") self.volume.delete_replica(volume_name, replica_node) - def wait_for_replica_rebuilding_start(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Waiting for volume {volume_name}'s replica on node {replica_node} rebuilding started") self.volume.wait_for_replica_rebuilding_start( volume_name, replica_node ) - def wait_for_replica_rebuilding_complete(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Waiting for volume {volume_name}'s replica on node {replica_node} rebuilding completed") self.volume.wait_for_replica_rebuilding_complete( volume_name, @@ -82,8 +82,11 @@ def wait_for_replica_rebuilding_complete(self, volume_name, replica_node): def wait_for_volume_attached(self, volume_name): self.volume.wait_for_volume_attached(volume_name) + def wait_for_volume_detached(self, volume_name): + self.volume.wait_for_volume_detached(volume_name) + def wait_for_volume_healthy(self, volume_name): self.volume.wait_for_volume_healthy(volume_name) def cleanup_volumes(self, volume_names): - self.volume.cleanup(volume_names) \ No newline at end of file + self.volume.cleanup(volume_names) diff --git a/e2e/libs/keywords/workload_keywords.py b/e2e/libs/keywords/workload_keywords.py index cee3fb2c94..51450df9e5 100644 --- a/e2e/libs/keywords/workload_keywords.py +++ b/e2e/libs/keywords/workload_keywords.py @@ -1,5 +1,6 @@ from workload.workload import * + class workload_keywords: def __init__(self): @@ -14,7 +15,7 @@ def cleanup_storageclasses(self): delete_storageclass('longhorn-test-strict-local') def create_deployment(self, volume_type="rwo", option=""): - pvc_name = create_pvc(volume_type, option) + create_pvc(volume_type, option) deployment_name = create_deployment(volume_type, option) return deployment_name @@ -22,9 +23,18 @@ def create_statefulset(self, volume_type="rwo", option=""): statefulset_name = create_statefulset(volume_type, 
option) return statefulset_name + def get_statefulset(self, statefulset_name): + return get_statefulset(statefulset_name) + + def scale_statefulset(self, statefulset_name, replica_count): + return scale_statefulset(statefulset_name, replica_count) + def get_workload_pod_name(self, workload_name): return get_workload_pod_names(workload_name)[0] + def get_workload_pvc_name(self, workload_name): + return get_workload_pvc_name(workload_name) + def get_workload_volume_name(self, workload_name): return get_workload_volume_name(workload_name) @@ -34,8 +44,8 @@ def keep_writing_pod_data(self, pod_name): def write_pod_random_data(self, pod, size_in_mb): return write_pod_random_data(pod, size_in_mb) - def check_pod_data(self, pod_name, checksum): - check_pod_data(pod_name, checksum) + def check_pod_data_checksum(self, pod_name, checksum): + check_pod_data_checksum(pod_name, checksum) def cleanup_deployments(self, deployment_names): for name in deployment_names: @@ -51,3 +61,6 @@ def cleanup_statefulsets(self, statefulset_names): def wait_for_workload_pod_stable(self, workload_name): return wait_for_workload_pod_stable(workload_name) + + def wait_for_statefulset_replicas_ready(self, statefulset_name, expected_ready_count): + return wait_for_statefulset_replicas_ready(statefulset_name, expected_ready_count) diff --git a/e2e/libs/kubelet/kubelet.py b/e2e/libs/kubelet/kubelet.py index c9c5180050..06beb039da 100644 --- a/e2e/libs/kubelet/kubelet.py +++ b/e2e/libs/kubelet/kubelet.py @@ -1,11 +1,11 @@ -from utility.utility import logging import time -from workload.pod import new_pod_manifest from workload.pod import create_pod -from workload.pod import wait_for_pod_status from workload.pod import delete_pod -from workload.pod import IMAGE_UBUNTU +from workload.pod import new_pod_manifest + +from workload.constant import IMAGE_UBUNTU + def restart_kubelet(node_name, stop_time_in_sec=10): manifest = new_pod_manifest( diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py index a9d81b4b3d..b775430f52 100644 --- a/e2e/libs/network/network.py +++ b/e2e/libs/network/network.py @@ -1,29 +1,35 @@ from robot.libraries.BuiltIn import BuiltIn -from utility.utility import get_control_plane_nodes + +from node.utility import list_node_names_by_role + from node_exec import NodeExec + + def get_control_plane_node_network_latency_in_ms(): - latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}")) + latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}", default="0")) return latency_in_ms + def setup_control_plane_network_latency(): latency_in_ms = get_control_plane_node_network_latency_in_ms() if latency_in_ms != 0: - nodes = get_control_plane_nodes() - for node in nodes: + control_plane_nodes = list_node_names_by_role("control-plane") + for control_plane_node in control_plane_nodes: cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) cmd = f"tc qdisc show dev eth0 | grep delay" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) assert res, "setup control plane network latency failed" + def cleanup_control_plane_network_latency(): latency_in_ms = get_control_plane_node_network_latency_in_ms() if latency_in_ms != 0: - nodes = get_control_plane_nodes() - for node in nodes: + control_plane_nodes = 
list_node_names_by_role("control-plane") + for control_plane_node in control_plane_nodes: cmd = "tc qdisc del dev eth0 root" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) cmd = f"tc qdisc show dev eth0 | grep -v delay" - res = NodeExec.get_instance().issue_cmd(node, cmd) - assert res, "cleanup control plane network failed" \ No newline at end of file + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + assert res, "cleanup control plane network failed" diff --git a/e2e/libs/node/constant.py b/e2e/libs/node/constant.py new file mode 100644 index 0000000000..b7dc738512 --- /dev/null +++ b/e2e/libs/node/constant.py @@ -0,0 +1,7 @@ +NODE_STRESS_CPU_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_VM_WORKERS = 1 +NODE_STRESS_TIMEOUT_SECOND = 60 * 60 # 1 hour + +STRESS_HELPER_LABEL = "longhorn-stress-helper" +STRESS_HELPER_POD_NAME_PREFIX = "longhorn-stress-helper-" diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py index 98935bffcd..4938a473ad 100644 --- a/e2e/libs/node/node.py +++ b/e2e/libs/node/node.py @@ -4,7 +4,8 @@ from kubernetes import client -from utility.utility import list_nodes +from node.utility import list_node_names_by_role + from utility.utility import logging from utility.utility import wait_for_cluster_ready @@ -19,7 +20,7 @@ def __init__(self): def reboot_all_nodes(self, shut_down_time_in_sec=60): instance_ids = [value for value in self.mapping.values()] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) @@ -37,7 +38,7 @@ def reboot_all_nodes(self, shut_down_time_in_sec=60): def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60): instance_ids = [self.mapping[reboot_node_name]] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) @@ -52,9 +53,9 @@ def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60): logging(f"Started instances") def reboot_all_worker_nodes(self, shut_down_time_in_sec=60): - instance_ids = [self.mapping[value] for value in list_nodes()] + instance_ids = [self.mapping[value] for value in list_node_names_by_role("worker")] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index f142fca7c5..b293103a5c 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -1,39 +1,57 @@ -from kubernetes import client +from kubernetes.client.rest import ApiException from node.utility import get_node_cpu_cores +from node.constant import NODE_STRESS_CPU_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_VM_WORKERS +from node.constant import NODE_STRESS_TIMEOUT_SECOND +from node.constant import STRESS_HELPER_LABEL +from node.constant import 
STRESS_HELPER_POD_NAME_PREFIX + from utility.utility import logging from workload.pod import create_pod from workload.pod import delete_pod +from workload.pod import get_pod from workload.pod import new_pod_manifest from workload.workload import get_workload_pods -from workload.pod import IMAGE_LITMUX - -NODE_CPU_LOAD_PERCENTAGE = 100 -NODE_MEM_LOAD_PERCENTAGE = 100 -NODE_MEM_VM_WORKERS = 1 -NODE_STRESS_TIMEOUT_SECOND = 300 +from workload.constant import IMAGE_LITMUX -LABEL_STRESS_HELPER = "longhorn-stress-helper" class Stress: def cleanup(self): - for pod in get_workload_pods(LABEL_STRESS_HELPER): + for pod in get_workload_pods(STRESS_HELPER_LABEL): logging(f"Cleaning up stress pod {pod.metadata.name}") delete_pod(pod.metadata.name, pod.metadata.namespace) def cpu(self, node_names): for node_name in node_names: + pod_name = f"{STRESS_HELPER_POD_NAME_PREFIX}{node_name}" + + # If the helper pod creation is called inside of a test case loop, + # we need to check if the pod already running. + try: + pod = get_pod(pod_name) + if pod and pod.status.phase != "Running": + logging(f"Deleting stress pod {pod_name} in phase {pod.status.phase}") + delete_pod(pod_name) + elif pod: + logging(f"Stress pod {pod_name} already running") + continue + except ApiException as e: + assert e.status == 404 + manifest = new_pod_manifest( + pod_name=pod_name, image=IMAGE_LITMUX, command=["stress-ng"], args=['--cpu', str(get_node_cpu_cores(node_name)), - '--cpu-load', str(NODE_CPU_LOAD_PERCENTAGE), + '--cpu-load', str(NODE_STRESS_CPU_LOAD_PERCENTAGE), '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, - labels={'app': LABEL_STRESS_HELPER} + labels={'app': STRESS_HELPER_LABEL} ) pod_name = manifest['metadata']['name'] @@ -42,14 +60,30 @@ def cpu(self, node_names): def memory(self, node_names): for node_name in node_names: + pod_name = f"{STRESS_HELPER_POD_NAME_PREFIX}{node_name}" + + # If the helper pod creation is called inside of a test case loop, + # we need to check if the pod already running. 
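The comment above, repeated for both cpu() and memory(), describes the same get-or-recreate sequence for the per-node helper pod. A minimal sketch of that shared idea, reusing the get_pod, delete_pod, create_pod, and new_pod_manifest helpers from e2e/libs/workload/pod.py in this change; the ensure_stress_helper_pod wrapper itself is hypothetical and not part of the PR:

from kubernetes.client.rest import ApiException

from workload.constant import IMAGE_LITMUX
from workload.pod import create_pod, delete_pod, get_pod, new_pod_manifest


def ensure_stress_helper_pod(pod_name, node_name, args, label):
    # Reuse a running helper pod, replace a non-running one, otherwise create it,
    # so the keyword stays idempotent when it is called inside a test case loop.
    try:
        pod = get_pod(pod_name)
        if pod and pod.status.phase != "Running":
            delete_pod(pod_name)
        elif pod:
            return pod
    except ApiException as e:
        # get_pod() already returns None on 404; this guard only mirrors the original code
        assert e.status == 404

    manifest = new_pod_manifest(
        pod_name=pod_name,
        image=IMAGE_LITMUX,
        command=["stress-ng"],
        args=args,
        node_name=node_name,
        labels={'app': label},
    )
    return create_pod(manifest, is_wait_for_pod_running=True)

Called with the stress-ng CPU or memory arguments used in this file, repeated loop iterations then reuse the same per-node helper pod instead of failing on a name clash.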
+ try: + pod = get_pod(pod_name) + if pod and pod.status.phase != "Running": + logging(f"Deleting stress pod {pod_name} in phase {pod.status.phase}") + delete_pod(pod_name) + elif pod: + logging(f"Stress pod {pod_name} already running") + continue + except ApiException as e: + assert e.status == 404 + manifest = new_pod_manifest( + pod_name=pod_name, image=IMAGE_LITMUX, command=["stress-ng"], - args=['--vm', str(NODE_MEM_VM_WORKERS), - '--vm-bytes', f"{NODE_MEM_LOAD_PERCENTAGE}%", + args=['--vm', str(NODE_STRESS_MEM_VM_WORKERS), + '--vm-bytes', f"{NODE_STRESS_MEM_LOAD_PERCENTAGE}%", '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, - labels={'app': LABEL_STRESS_HELPER} + labels={'app': STRESS_HELPER_LABEL} ) pod_name = manifest['metadata']['name'] diff --git a/e2e/libs/node/utility.py b/e2e/libs/node/utility.py index 571b983b6f..d89c44f1d0 100644 --- a/e2e/libs/node/utility.py +++ b/e2e/libs/node/utility.py @@ -1,9 +1,53 @@ +from robot.libraries.BuiltIn import BuiltIn + from kubernetes import client + +def get_node_by_index(index, role="worker"): + nodes = list_node_names_by_role(role) + return nodes[int(index)] + + def get_node_by_name(node_name): core_api = client.CoreV1Api() return core_api.read_node(node_name) + def get_node_cpu_cores(node_name): node = get_node_by_name(node_name) return node.status.capacity['cpu'] + + +def list_node_names_by_role(role="all"): + if role not in ["all", "control-plane", "worker"]: + raise ValueError("Role must be one of 'all', 'control-plane' or 'worker'") + + def filter_nodes(nodes, condition): + return [node.metadata.name for node in nodes if condition(node)] + + core_api = client.CoreV1Api() + nodes = core_api.list_node().items + + control_plane_labels = ['node-role.kubernetes.io/master', 'node-role.kubernetes.io/control-plane'] + + if role == "all": + return sorted(filter_nodes(nodes, lambda node: True)) + + if role == "control-plane": + condition = lambda node: all(label in node.metadata.labels for label in control_plane_labels) + return sorted(filter_nodes(nodes, condition)) + + if role == "worker": + condition = lambda node: not any(label in node.metadata.labels for label in control_plane_labels) + return sorted(filter_nodes(nodes, condition)) + + +def list_node_names_by_volumes(volume_names): + volume_nodes = {} + volume_keywords = BuiltIn().get_library_instance('volume_keywords') + + for volume_name in volume_names: + volume_node = volume_keywords.get_volume_node(volume_name) + if volume_node not in volume_nodes: + volume_nodes[volume_node] = True + return list(volume_nodes.keys()) diff --git a/e2e/libs/node_exec/constant.py b/e2e/libs/node_exec/constant.py new file mode 100644 index 0000000000..255c49afc4 --- /dev/null +++ b/e2e/libs/node_exec/constant.py @@ -0,0 +1,2 @@ +DEFAULT_POD_TIMEOUT = 180 +DEFAULT_POD_INTERVAL = 1 diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index d01f39988f..dd6aee3c1d 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -7,9 +7,9 @@ from workload.pod import wait_delete_pod from utility.utility import wait_delete_ns +from node_exec.constant import DEFAULT_POD_INTERVAL +from node_exec.constant import DEFAULT_POD_TIMEOUT -DEFAULT_POD_TIMEOUT = 180 -DEFAULT_POD_INTERVAL = 1 class NodeExec: @@ -198,4 +198,4 @@ def launch_pod(self, node_name): break time.sleep(DEFAULT_POD_INTERVAL) self.node_exec_pod[node_name] = pod - return pod \ No newline at end of file + return pod diff --git a/e2e/libs/persistent_volume_claim/__init__.py 
b/e2e/libs/persistent_volume_claim/__init__.py new file mode 100644 index 0000000000..75f06a434b --- /dev/null +++ b/e2e/libs/persistent_volume_claim/__init__.py @@ -0,0 +1 @@ +from persistent_volume_claim.persistent_volume_claim import PersistentVolumeClaim diff --git a/e2e/libs/persistent_volume_claim/base.py b/e2e/libs/persistent_volume_claim/base.py new file mode 100644 index 0000000000..43d45fba0c --- /dev/null +++ b/e2e/libs/persistent_volume_claim/base.py @@ -0,0 +1,12 @@ +from abc import ABC, abstractmethod + + +class Base(ABC): + + @abstractmethod + def get(self, volume_name): + return NotImplemented + + @abstractmethod + def expand(self, claim_name, size, claim_namespace="default"): + return NotImplemented diff --git a/e2e/libs/persistent_volume_claim/constant.py b/e2e/libs/persistent_volume_claim/constant.py new file mode 100644 index 0000000000..82e875b169 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/persistent_volume_claim/crd.py b/e2e/libs/persistent_volume_claim/crd.py new file mode 100644 index 0000000000..bd3f80be07 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/crd.py @@ -0,0 +1,40 @@ +from kubernetes import client + +from persistent_volume_claim.base import Base + +from utility.utility import get_retry_count_and_interval +from utility.utility import logging + + +class CRD(Base): + + def __init__(self): + self.core_v1_api = client.CoreV1Api() + self.retry_count, self.retry_interval = get_retry_count_and_interval() + + def get(self, claim_name, claim_namespace="default"): + return self.core_v1_api.read_namespaced_persistent_volume_claim( + name=claim_name, + namespace=claim_namespace, + ) + + def expand(self, claim_name, size, namespace="default"): + try: + self.core_v1_api.patch_namespaced_persistent_volume_claim( + name=claim_name, + namespace=namespace, + body={ + 'spec': { + 'resources': { + 'requests': { + 'storage': str(size) + } + } + } + } + ) + return size + except client.exceptions.ApiException as e: + logging(f"Exception when expanding PVC: {e}") + + return size diff --git a/e2e/libs/persistent_volume_claim/persistent_volume_claim.py b/e2e/libs/persistent_volume_claim/persistent_volume_claim.py new file mode 100644 index 0000000000..7fce6e1ad2 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/persistent_volume_claim.py @@ -0,0 +1,26 @@ +from strategy import LonghornOperationStrategy + +from persistent_volume_claim.base import Base +from persistent_volume_claim.crd import CRD + +from utility.utility import logging + + +class PersistentVolumeClaim(Base): + + _strategy = LonghornOperationStrategy.CRD + + def __init__(self): + if self._strategy == LonghornOperationStrategy.CRD: + self.pvc = CRD() + + def get(self, claim_name): + return self.pvc.get(claim_name) + + def expand(self, claim_name, size_in_byte): + pvc = self.pvc.get(claim_name) + current_size = int(pvc.spec.resources.requests['storage']) + + target_size = current_size + size_in_byte + logging(f"Expanding PVC {claim_name} from {current_size} to {target_size}") + return self.pvc.expand(claim_name, target_size) diff --git a/e2e/libs/recurring_job/base.py b/e2e/libs/recurring_job/base.py index e74e536745..5e4897fbdf 100644 --- a/e2e/libs/recurring_job/base.py +++ b/e2e/libs/recurring_job/base.py @@ -25,4 +25,4 @@ def check_jobs_work(self, volume_name): @abstractmethod def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git 
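The new persistent_volume_claim library requests expansion by patching spec.resources.requests.storage with an absolute byte value. A short usage sketch, not part of the change itself, assuming init_k8s_api_client() from e2e/libs/utility/utility.py has already been called and that a claim named test-claim exists (the claim name is illustrative):

from persistent_volume_claim import PersistentVolumeClaim

MEBIBYTE = 1024 * 1024

pvc = PersistentVolumeClaim()

# PersistentVolumeClaim.expand() reads the current request, adds the delta,
# and lets CRD.expand() patch the absolute target size into the claim.
target_size = pvc.expand("test-claim", 100 * MEBIBYTE)
print(f"requested claim size is now {target_size} bytes")

Because the current request is parsed with int(), the claim must carry a plain byte value rather than a unit suffix, which is what the later pvc.yaml and statefulset.yaml template changes (3Gi to 3221225472) take care of.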
a/e2e/libs/recurring_job/constant.py b/e2e/libs/recurring_job/constant.py new file mode 100644 index 0000000000..bb5017e701 --- /dev/null +++ b/e2e/libs/recurring_job/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 180 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/recurring_job/crd.py b/e2e/libs/recurring_job/crd.py index 1b10fc8ce5..43ed450e31 100644 --- a/e2e/libs/recurring_job/crd.py +++ b/e2e/libs/recurring_job/crd.py @@ -30,4 +30,4 @@ def check_jobs_work(self, volume_name): def cleanup(self, volume_names): logging("Delegating the cleanup call to API because there is no CRD implementation") - return self.rest.cleanup(volume_names) \ No newline at end of file + return self.rest.cleanup(volume_names) diff --git a/e2e/libs/recurring_job/recurring_job.py b/e2e/libs/recurring_job/recurring_job.py index 9ee52f2347..3616619d73 100644 --- a/e2e/libs/recurring_job/recurring_job.py +++ b/e2e/libs/recurring_job/recurring_job.py @@ -38,4 +38,4 @@ def check_jobs_work(self, volume_name): return self.recurring_job.check_jobs_work(volume_name) def cleanup(self, volume_names): - return self.recurring_job.cleanup(volume_names) \ No newline at end of file + return self.recurring_job.cleanup(volume_names) diff --git a/e2e/libs/recurring_job/rest.py b/e2e/libs/recurring_job/rest.py index d988783f71..603c2457fc 100644 --- a/e2e/libs/recurring_job/rest.py +++ b/e2e/libs/recurring_job/rest.py @@ -10,9 +10,9 @@ from utility.utility import get_longhorn_client from utility.utility import logging +from recurring_job.constant import RETRY_COUNTS +from recurring_job.constant import RETRY_INTERVAL -RETRY_COUNTS = 180 -RETRY_INTERVAL = 1 class Rest(Base): @@ -65,16 +65,21 @@ def _wait_for_volume_recurring_job_delete(self, job_name, volume_name): assert deleted def get_volume_recurring_jobs_and_groups(self, volume_name): - volume = self.client.by_id_volume(volume_name) - list = volume.recurringJobList() - jobs = [] - groups = [] - for item in list: - if item['isGroup']: - groups.append(item['name']) - else: - jobs.append(item['name']) - return jobs, groups + for _ in range(RETRY_COUNTS): + try: + volume = self.client.by_id_volume(volume_name) + list = volume.recurringJobList() + jobs = [] + groups = [] + for item in list: + if item['isGroup']: + groups.append(item['name']) + else: + jobs.append(item['name']) + return jobs, groups + except Exception as e: + logging(f"Getting volume {volume} recurring job list error: {e}") + time.sleep(RETRY_INTERVAL) def _wait_for_cron_job_create(self, job_name): created = False @@ -125,23 +130,26 @@ def _check_snapshot_created_in_time(self, volume_name, job_name, period_in_sec): snapshot_timestamp = 0 for _ in range(period_in_sec * 2): snapshot_list = filter_cr("longhorn.io", "v1beta2", "longhorn-system", "snapshots", label_selector=label_selector) - if len(snapshot_list['items']) > 0: - for item in snapshot_list['items']: - # this snapshot can be created by snapshot or backup recurring job - # but job_name is in spec.labels.RecurringJob - # and crd doesn't support field selector - # so need to filter by ourselves - if 'RecurringJob' in item['status']['labels'] and \ - item['status']['labels']['RecurringJob'] == job_name and \ - item['status']['readyToUse'] == True: - logging(f"Got snapshot {item}") - snapshot_time = item['metadata']['creationTimestamp'] - snapshot_time = datetime.strptime(snapshot_time, '%Y-%m-%dT%H:%M:%SZ') - snapshot_timestamp = snapshot_time.timestamp() - logging(f"Got snapshot time = {snapshot_time}, timestamp = {snapshot_timestamp}") - break - if snapshot_timestamp > 
current_timestamp: - return + try: + if len(snapshot_list['items']) > 0: + for item in snapshot_list['items']: + # this snapshot can be created by snapshot or backup recurring job + # but job_name is in spec.labels.RecurringJob + # and crd doesn't support field selector + # so need to filter by ourselves + if 'RecurringJob' in item['status']['labels'] and \ + item['status']['labels']['RecurringJob'] == job_name and \ + item['status']['readyToUse'] == True: + logging(f"Got snapshot {item}") + snapshot_time = item['metadata']['creationTimestamp'] + snapshot_time = datetime.strptime(snapshot_time, '%Y-%m-%dT%H:%M:%SZ') + snapshot_timestamp = snapshot_time.timestamp() + logging(f"Got snapshot time = {snapshot_time}, timestamp = {snapshot_timestamp}") + break + if snapshot_timestamp > current_timestamp: + return + except Exception as e: + logging(f"Iterating snapshot list error: {e}") time.sleep(1) assert False, f"since {current_time},\ there's no new snapshot created by recurring job \ @@ -156,17 +164,24 @@ def _check_backup_created_in_time(self, volume_name, period_in_sec): backup_timestamp = 0 for _ in range(period_in_sec * 2): backup_list = filter_cr("longhorn.io", "v1beta2", "longhorn-system", "backups", label_selector=label_selector) - if len(backup_list['items']) > 0: - state = backup_list['items'][0]['status']['state'] - if state != "InProgress" and state != "Completed": - continue - backup_time = backup_list['items'][0]['metadata']['creationTimestamp'] - backup_time = datetime.strptime(backup_time, '%Y-%m-%dT%H:%M:%SZ') - backup_timestamp = backup_time.timestamp() - logging(f"Got backup time = {backup_time}, timestamp = {backup_timestamp}") - if backup_timestamp > current_timestamp: - return + try: + if len(backup_list['items']) > 0: + for item in backup_list['items']: + state = item['status']['state'] + if state != "InProgress" and state != "Completed": + continue + backup_time = item['metadata']['creationTimestamp'] + backup_time = datetime.strptime(backup_time, '%Y-%m-%dT%H:%M:%SZ') + backup_timestamp = backup_time.timestamp() + logging(f"Got backup time = {backup_time}, timestamp = {backup_timestamp}") + if backup_timestamp > current_timestamp: + return + except Exception as e: + logging(f"Iterating backup list error: {e}") time.sleep(1) + logging(f"since {current_time},\ + there's no new backup created by recurring job \ + {backup_list}") assert False, f"since {current_time},\ there's no new backup created by recurring job \ {backup_list}" @@ -177,4 +193,4 @@ def cleanup(self, volume_names): jobs, _ = self.get_volume_recurring_jobs_and_groups(volume_name) for job in jobs: logging(f"Deleting recurring job {job}") - self.delete(job, volume_name) \ No newline at end of file + self.delete(job, volume_name) diff --git a/e2e/libs/replica/base.py b/e2e/libs/replica/base.py index 1fdef38f35..2abca02ca5 100644 --- a/e2e/libs/replica/base.py +++ b/e2e/libs/replica/base.py @@ -17,4 +17,4 @@ def wait_for_replica_rebuilding_start(self, volume_name, node_name): @abstractmethod def wait_for_replica_rebuilding_complete(self, volume_name, node_name): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/replica/constant.py b/e2e/libs/replica/constant.py new file mode 100644 index 0000000000..82e875b169 --- /dev/null +++ b/e2e/libs/replica/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/replica/rest.py b/e2e/libs/replica/rest.py index 6ace292c03..8c492c0b56 100644 --- 
a/e2e/libs/replica/rest.py +++ b/e2e/libs/replica/rest.py @@ -3,10 +3,11 @@ from replica.base import Base from utils import common_utils +from utility.utility import logging +from replica.constant import RETRY_COUNTS +from replica.constant import RETRY_INTERVAL -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 class Rest(Base): def __init__(self, node_exec): @@ -22,46 +23,55 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None, f'failed to get rebuilding replica name' started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for status in v.rebuildStatus: - if status.replica == rebuilding_replica_name and\ - status.state == "in_progress": - started = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + for status in v.rebuildStatus: + if status.replica == rebuilding_replica_name and\ + status.state == "in_progress": + started = True + break + if started: break - if started: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f'replica {rebuilding_replica_name} rebuilding starting failed' def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - completed = True - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + completed = True + break + except Exception as e: + logging(f"Failed to get volume {e}") if completed: break time.sleep(RETRY_INTERVAL) - assert completed, f'failed rebuilding replicas' \ No newline at end of file + assert completed, f'failed rebuilding replicas' diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 7b04d260c0..83b07ab944 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -1,17 +1,23 @@ -from kubernetes import config, client, dynamic -from kubernetes.client.rest import 
ApiException -from kubernetes.stream import stream -from longhorn import from_env -import string -import random import os import socket +import string import time +import random import yaml +from longhorn import from_env + +from kubernetes import client +from kubernetes import config +from kubernetes import dynamic +from kubernetes.client.rest import ApiException + from robot.api import logger from robot.libraries.BuiltIn import BuiltIn +from node.utility import get_node_by_index +from node.utility import list_node_names_by_role + def logging(msg, also_report=False): if also_report: @@ -19,16 +25,19 @@ def logging(msg, also_report=False): else: logger.console(msg) + def get_retry_count_and_interval(): retry_count = int(BuiltIn().get_variable_value("${RETRY_COUNT}")) retry_interval = int(BuiltIn().get_variable_value("${RETRY_INTERVAL}")) return retry_count, retry_interval + def generate_name(name_prefix="test-"): return name_prefix + \ ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) + def generate_volume_name(): return generate_name("vol-") @@ -43,25 +52,6 @@ def init_k8s_api_client(): config.load_incluster_config() logging("Initialized in-cluster k8s api client") -def list_nodes(): - core_api = client.CoreV1Api() - obj = core_api.list_node() - nodes = [] - for item in obj.items: - if 'node-role.kubernetes.io/control-plane' not in item.metadata.labels and \ - 'node-role.kubernetes.io/master' not in item.metadata.labels: - nodes.append(item.metadata.name) - return sorted(nodes) - -def get_control_plane_nodes(): - core_api = client.CoreV1Api() - obj = core_api.list_node() - nodes = [] - for item in obj.items: - if 'node-role.kubernetes.io/control-plane' in item.metadata.labels or \ - 'node-role.kubernetes.io/master' in item.metadata.labels: - nodes.append(item.metadata.name) - return sorted(nodes) def wait_for_cluster_ready(): core_api = client.CoreV1Api() @@ -83,30 +73,27 @@ def wait_for_cluster_ready(): time.sleep(retry_interval) assert ready, f"expect cluster's ready but it isn't {resp}" + def wait_for_all_instance_manager_running(): - core_api = client.CoreV1Api() longhorn_client = get_longhorn_client() - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") retry_count, retry_interval = get_retry_count_and_interval() for _ in range(retry_count): logging(f"Waiting for all instance manager running ({_}) ...") - instance_managers = longhorn_client.list_instance_manager() - instance_manager_map = {} try: + instance_managers = longhorn_client.list_instance_manager() + instance_manager_map = {} for im in instance_managers: if im.currentState == "running": instance_manager_map[im.nodeID] = im - if len(instance_manager_map) == len(nodes): + if len(instance_manager_map) == len(worker_nodes): break time.sleep(retry_interval) except Exception as e: logging(f"Getting instance manager state error: {e}") - assert len(instance_manager_map) == len(nodes), f"expect all instance managers running, instance_managers = {instance_managers}, instance_manager_map = {instance_manager_map}" + assert len(instance_manager_map) == len(worker_nodes), f"expect all instance managers running, instance_managers = {instance_managers}, instance_manager_map = {instance_manager_map}" -def get_node(index): - nodes = list_nodes() - return nodes[int(index)] def apply_cr(manifest_dict): dynamic_client = dynamic.DynamicClient(client.api_client.ApiClient()) @@ -125,11 +112,13 @@ def apply_cr(manifest_dict): crd_api.create(body=manifest_dict, namespace=namespace) 
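Several hunks in this change (replica/rest.py, recurring_job/rest.py, and wait_for_all_instance_manager_running() here) wrap Longhorn API reads in the same try, log, sleep, retry loop. A small sketch of that underlying pattern; retry_on_error is a hypothetical name used only to restate what the repeated blocks do, it is not a helper added by this PR:

import time

from utility.utility import get_retry_count_and_interval
from utility.utility import logging


def retry_on_error(fn, description="call"):
    # Retry a flaky read (for example longhorn_client.by_id_volume) until it
    # succeeds or the Robot-level ${RETRY_COUNT} budget is exhausted.
    retry_count, retry_interval = get_retry_count_and_interval()
    for attempt in range(retry_count):
        try:
            return fn()
        except Exception as e:
            logging(f"Retrying {description} ({attempt}): {e}")
        time.sleep(retry_interval)
    assert False, f"{description} did not succeed after {retry_count} retries"


# Example, equivalent to the guarded by_id_volume() reads in volume/rest.py:
# volume = retry_on_error(lambda: longhorn_client.by_id_volume("vol-abc"),
#                         description="get volume vol-abc")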
logging.info(f"{namespace}/{resource_name} created") + def apply_cr_from_yaml(filepath): with open(filepath, 'r') as f: manifest_dict = yaml.safe_load(f) apply_cr(manifest_dict) + def get_cr(group, version, namespace, plural, name): api = client.CustomObjectsApi() try: @@ -138,6 +127,7 @@ def get_cr(group, version, namespace, plural, name): except ApiException as e: logging(f"Getting namespaced custom object error: {e}") + def filter_cr(group, version, namespace, plural, field_selector="", label_selector=""): api = client.CustomObjectsApi() try: @@ -146,6 +136,7 @@ def filter_cr(group, version, namespace, plural, field_selector="", label_select except ApiException as e: logging(f"Listing namespaced custom object: {e}") + def wait_delete_ns(name): api = client.CoreV1Api() retry_count, retry_interval = get_retry_count_and_interval() @@ -161,6 +152,7 @@ def wait_delete_ns(name): time.sleep(retry_interval) assert not found + def get_mgr_ips(): ret = client.CoreV1Api().list_pod_for_all_namespaces( label_selector="app=longhorn-manager", @@ -170,6 +162,7 @@ def get_mgr_ips(): mgr_ips.append(i.status.pod_ip) return mgr_ips + def get_longhorn_client(): retry_count, retry_interval = get_retry_count_and_interval() if os.getenv('LONGHORN_CLIENT_URL'): @@ -204,18 +197,21 @@ def get_longhorn_client(): logging(f"Getting longhorn client error: {e}") time.sleep(retry_interval) + def get_test_pod_running_node(): if "NODE_NAME" in os.environ: return os.environ["NODE_NAME"] else: - return get_node(0) + return get_node_by_index(0) + def get_test_pod_not_running_node(): - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") test_pod_running_node = get_test_pod_running_node() - for node in nodes: - if node != test_pod_running_node: - return node + for worker_node in worker_nodes: + if worker_node != test_pod_running_node: + return worker_node + def get_test_case_namespace(test_name): return test_name.lower().replace(' ', '-') diff --git a/e2e/libs/volume/base.py b/e2e/libs/volume/base.py index c9435828c5..7684928073 100644 --- a/e2e/libs/volume/base.py +++ b/e2e/libs/volume/base.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod + class Base(ABC): @abstractmethod @@ -43,9 +44,9 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): return NotImplemented @abstractmethod - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): return NotImplemented @abstractmethod def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/volume/constant.py b/e2e/libs/volume/constant.py new file mode 100644 index 0000000000..c9d6e4a990 --- /dev/null +++ b/e2e/libs/volume/constant.py @@ -0,0 +1,11 @@ +KIBIBYTE = 1024 +MEBIBYTE = (KIBIBYTE * KIBIBYTE) +GIBIBYTE = (MEBIBYTE * KIBIBYTE) + +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 + +VOLUME_FRONTEND_BLOCKDEV = "blockdev" +VOLUME_FRONTEND_ISCSI = "iscsi" + +DEV_PATH = "/dev/longhorn/" diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 1c9e4a155d..dc635ad74b 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -5,13 +5,13 @@ from utility.utility import get_retry_count_and_interval from utility.utility import logging +from engine.engine import Engine + from volume.base import Base from volume.rest import Rest +from volume.constant import GIBIBYTE -Ki = 2**10 -Mi = 2**20 -Gi = 2**30 class CRD(Base): @@ -38,7 +38,7 @@ def create(self, volume_name, size, replica_count): "spec": { "frontend": "blockdev", 
"replicaAutoBalance": "ignored", - "size": str(int(size) * Gi), + "size": str(int(size) * GIBIBYTE), "numberOfReplicas": int(replica_count) } } @@ -96,9 +96,44 @@ def attach(self, volume_name, node_name): Exception(f'exception for creating volumeattachments:', e) self.wait_for_volume_state(volume_name, "attached") + def detach(self, volume_name): + try: + self.obj_api.patch_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="volumeattachments", + name=volume_name, + body={ + "spec": { + "attachmentTickets": None, + } + } + ) + except Exception as e: + # new CRD: volumeattachments was added since from 1.5.0 + # https://github.com/longhorn/longhorn/issues/3715 + if e.reason != "Not Found": + Exception(f'exception for patching volumeattachments:', e) + + self.obj_api.patch_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="volumes", + name=volume_name, + body={ + "spec": { + "nodeID": "" + } + } + ) + + self.wait_for_volume_state(volume_name, "detached") + def delete(self, volume_name): try: - resp = self.obj_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", @@ -112,7 +147,7 @@ def delete(self, volume_name): def wait_for_volume_delete(self, volume_name): for i in range(self.retry_count): try: - resp = self.obj_api.get_namespaced_custom_object( + self.obj_api.get_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", @@ -152,7 +187,7 @@ def wait_for_volume_robustness(self, volume_name, desired_state): def wait_for_volume_robustness_not(self, volume_name, not_desired_state): for i in range(self.retry_count): - logging(f"Waiting for {volume_name} not {not_desired_state} ({i}) ...") + logging(f"Waiting for {volume_name} robustness not {not_desired_state} ({i}) ...") try: if self.get(volume_name)["status"]["robustness"] != not_desired_state: break @@ -161,6 +196,21 @@ def wait_for_volume_robustness_not(self, volume_name, not_desired_state): time.sleep(self.retry_interval) assert self.get(volume_name)["status"]["robustness"] != not_desired_state + def wait_for_volume_expand_to_size(self, volume_name, expected_size): + engine = None + engine_operation = Engine() + for i in range(self.retry_count): + logging(f"Waiting for {volume_name} expand to {expected_size} ({i}) ...") + + engine = engine_operation.get_engine_by_volume(self.get(volume_name)) + if int(engine['status']['currentSize']) == expected_size: + break + + time.sleep(self.retry_interval) + + assert engine is not None + assert int(engine['status']['currentSize']) == expected_size + def get_endpoint(self, volume_name): logging("Delegating the get_endpoint call to API because there is no CRD implementation") return Rest(self.node_exec).get_endpoint(volume_name) @@ -212,7 +262,7 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): node_name ) - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): node_name = self.get(volume_name)["spec"]["nodeID"] endpoint = self.get_endpoint(volume_name) _checksum = self.node_exec.issue_cmd( @@ -225,4 +275,4 @@ def check_data(self, volume_name, checksum): def cleanup(self, volume_names): for volume_name in volume_names: logging(f"Deleting volume {volume_name}") - self.delete(volume_name) \ No newline at end of file + self.delete(volume_name) diff --git a/e2e/libs/volume/rest.py 
b/e2e/libs/volume/rest.py index 9443bf9961..f626714cc3 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -6,13 +6,12 @@ from volume.base import Base +from volume.constant import DEV_PATH +from volume.constant import RETRY_COUNTS +from volume.constant import RETRY_INTERVAL +from volume.constant import VOLUME_FRONTEND_BLOCKDEV +from volume.constant import VOLUME_FRONTEND_ISCSI -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 - -VOLUME_FRONTEND_BLOCKDEV = "blockdev" -VOLUME_FRONTEND_ISCSI = "iscsi" -DEV_PATH = "/dev/longhorn/" class Rest(Base): @@ -21,7 +20,12 @@ def __init__(self, node_exec): self.node_exec = node_exec def get(self, volume_name): - return self.longhorn_client.by_id_volume(volume_name) + for i in range(RETRY_COUNTS): + try: + return self.longhorn_client.by_id_volume(volume_name) + except Exception as e: + logging(f"Failed to get volume {e}") + time.sleep(RETRY_INTERVAL) def create(self, volume_name, size, replica_count): return NotImplemented @@ -37,7 +41,7 @@ def wait_for_volume_state(self, volume_name, desired_state): def get_endpoint(self, volume_name): endpoint = "" - v = self.longhorn_client.by_id_volume(volume_name) + v = self.get(volume_name) if v.disableFrontend: assert endpoint == "" return endpoint @@ -45,12 +49,15 @@ def get_endpoint(self, volume_name): assert v.frontend == VOLUME_FRONTEND_BLOCKDEV or\ v.frontend == VOLUME_FRONTEND_ISCSI for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - engines = v.controllers - assert len(engines) != 0 - endpoint = engines[0].endpoint - if endpoint != "": - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + engines = v.controllers + assert len(engines) != 0 + endpoint = engines[0].endpoint + if endpoint != "": + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) logging(f"Got volume {volume_name} endpoint = {endpoint}") @@ -73,60 +80,69 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None logging(f"Got rebuilding replica = {rebuilding_replica_name}") started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume rebuild status = {v.rebuildStatus}") - for status in v.rebuildStatus: - for replica in v.replicas: - if status.replica == replica.name and \ - replica.hostId == node_name and \ - status.state == "in_progress": - logging(f"Started {node_name}'s replica {replica.name} rebuilding") - started = True - break - if started: - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume rebuild status = {v.rebuildStatus}") + for status in v.rebuildStatus: + for replica in v.replicas: + if status.replica == replica.name and \ + replica.hostId == node_name and \ + 
status.state == "in_progress": + logging(f"Started {node_name}'s replica {replica.name} rebuilding") + started = True + break + if started: + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f"wait for replica on node {node_name} rebuilding timeout: {v}" def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - logging(f"Completed {node_name}'s replica {replica.name} rebuilding") - completed = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + logging(f"Completed {node_name}'s replica {replica.name} rebuilding") + completed = True + break + if completed: break - if completed: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert completed - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): return NotImplemented def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py index bf88009cd6..d8d81c50aa 100644 --- a/e2e/libs/volume/volume.py +++ b/e2e/libs/volume/volume.py @@ -27,6 +27,9 @@ def create(self, volume_name, size, replica_count): def attach(self, volume_name, node_name): return self.volume.attach(volume_name, node_name) + def detach(self, volume_name): + return self.volume.detach(volume_name) + def delete(self, volume_name): return self.volume.delete(volume_name) @@ -37,10 +40,16 @@ def wait_for_volume_attached(self, volume_name): self.volume.wait_for_volume_state(volume_name, "attached") self.volume.wait_for_volume_robustness_not(volume_name, "unknown") + def wait_for_volume_detached(self, volume_name): + self.volume.wait_for_volume_state(volume_name, "detached") + def wait_for_volume_healthy(self, volume_name): self.volume.wait_for_volume_state(volume_name, "attached") self.volume.wait_for_volume_robustness(volume_name, "healthy") + def wait_for_volume_expand_to_size(self, volume_name, size): + return self.volume.wait_for_volume_expand_to_size(volume_name, size) + def get_endpoint(self, volume_name): return self.volume.get_endpoint(volume_name) @@ -65,8 +74,8 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): node_name ) - def check_data(self, volume_name, checksum): - 
return self.volume.check_data(volume_name, checksum) + def check_data_checksum(self, volume_name, checksum): + return self.volume.check_data_checksum(volume_name, checksum) def cleanup(self, volume_names): - return self.volume.cleanup(volume_names) \ No newline at end of file + return self.volume.cleanup(volume_names) diff --git a/e2e/libs/workload/constant.py b/e2e/libs/workload/constant.py new file mode 100644 index 0000000000..cd7aa90153 --- /dev/null +++ b/e2e/libs/workload/constant.py @@ -0,0 +1,3 @@ +IMAGE_BUSYBOX = 'busybox:1.34.0' +IMAGE_LITMUX = 'litmuschaos/go-runner:latest' +IMAGE_UBUNTU = 'ubuntu:16.04' diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index 84c2d278cf..30f56932ea 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -1,18 +1,20 @@ import time from kubernetes import client +from kubernetes.client import rest from utility.utility import logging from utility.utility import generate_name from utility.utility import get_retry_count_and_interval +from workload.constant import IMAGE_BUSYBOX -IMAGE_BUSYBOX = 'busybox:1.34.0' -IMAGE_LITMUX = 'litmuschaos/go-runner:latest' -IMAGE_UBUNTU = 'ubuntu:16.04' -def new_pod_manifest(image="", command=[], args=[], +def new_pod_manifest(pod_name="", image="", command=[], args=[], claim_name="", node_name="", labels={}): + if pod_name == "": + pod_name = generate_name() + # Set default image and args if image is None: image = IMAGE_BUSYBOX @@ -25,7 +27,7 @@ def new_pod_manifest(image="", command=[], args=[], 'apiVersion': 'v1', 'kind': 'Pod', 'metadata': { - 'name': generate_name(), + 'name': pod_name, 'namespace': 'default', 'labels': labels }, @@ -78,6 +80,7 @@ def new_pod_manifest(image="", command=[], args=[], return manifest + def create_pod(manifest, is_wait_for_pod_running=False): core_api = client.CoreV1Api() @@ -91,12 +94,13 @@ def create_pod(manifest, is_wait_for_pod_running=False): return get_pod(name, namespace=namespace) + def delete_pod(name, namespace='default'): core_api = client.CoreV1Api() try: core_api.delete_namespaced_pod(name=name, namespace=namespace) wait_delete_pod(name) - except ApiException as e: + except rest.ApiException as e: assert e.status == 404 def wait_delete_pod(name, namespace='default'): @@ -114,9 +118,16 @@ def wait_delete_pod(name, namespace='default'): time.sleep(retry_interval) assert not found + def get_pod(name, namespace='default'): - core_api = client.CoreV1Api() - return core_api.read_namespaced_pod(name=name, namespace=namespace) + try: + core_api = client.CoreV1Api() + return core_api.read_namespaced_pod(name=name, namespace=namespace) + except Exception as e: + if e.reason == 'Not Found': + return None + raise e + def wait_for_pod_status(name, status, namespace='default'): retry_count, retry_interval = get_retry_count_and_interval() diff --git a/e2e/libs/workload/workload.py b/e2e/libs/workload/workload.py index 232aa961fd..f484077e45 100644 --- a/e2e/libs/workload/workload.py +++ b/e2e/libs/workload/workload.py @@ -11,6 +11,7 @@ WAIT_FOR_POD_STABLE_MAX_RETRY = 90 + def get_name_suffix(*args): suffix = "" for arg in args: @@ -18,6 +19,7 @@ def get_name_suffix(*args): suffix += f"-{arg}" return suffix + def create_storageclass(name): if name == 'longhorn-test-strict-local': filepath = "./templates/workload/strict_local_storageclass.yaml" @@ -30,6 +32,7 @@ def create_storageclass(name): api = client.StorageV1Api() api.create_storage_class(body=manifest_dict) + def delete_storageclass(name): api = client.StorageV1Api() try: @@ -37,6 +40,7 @@ def 
delete_storageclass(name): except ApiException as e: assert e.status == 404 + def create_deployment(volume_type, option): filepath = f"./templates/workload/deployment.yaml" with open(filepath, 'r') as f: @@ -74,6 +78,7 @@ def create_deployment(volume_type, option): return deployment_name + def delete_deployment(name, namespace='default'): api = client.AppsV1Api() @@ -98,6 +103,7 @@ def delete_deployment(name, namespace='default'): time.sleep(retry_interval) assert deleted + def create_statefulset(volume_type, option): filepath = "./templates/workload/statefulset.yaml" with open(filepath, 'r') as f: @@ -124,21 +130,30 @@ def create_statefulset(volume_type, option): statefulset_name = statefulset.metadata.name replicas = statefulset.spec.replicas - retry_count, retry_interval = get_retry_count_and_interval() - for i in range(retry_count): - statefulset = api.read_namespaced_stateful_set( - name=statefulset_name, - namespace=namespace) - # statefulset is none if statefulset is not yet created - if statefulset is not None and \ - statefulset.status.ready_replicas == replicas: - break - time.sleep(retry_interval) - - assert statefulset.status.ready_replicas == replicas + wait_for_statefulset_replicas_ready(statefulset_name, replicas) return statefulset_name + +def wait_for_statefulset_replicas_ready(statefulset_name, expected_ready_count, namespace='default'): + apps_v1_api = client.AppsV1Api() + + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + logging(f"Waiting for statefulset {statefulset_name} replica ready ({i}) ...") + + statefulset = apps_v1_api.read_namespaced_stateful_set( + name=statefulset_name, + namespace=namespace) + # statefulset is none if statefulset is not yet created + if statefulset is not None and \ + statefulset.status.ready_replicas == expected_ready_count: + break + time.sleep(retry_interval) + + assert statefulset.status.ready_replicas == expected_ready_count + + def delete_statefulset(name, namespace='default'): api = client.AppsV1Api() @@ -163,6 +178,27 @@ def delete_statefulset(name, namespace='default'): time.sleep(retry_interval) assert deleted + +def get_statefulset(name, namespace='default'): + api = client.AppsV1Api() + return api.read_namespaced_stateful_set(name=name, namespace=namespace) + + +def scale_statefulset(name, replica_count, namespace='default'): + logging(f"Scaling statefulset {name} to {replica_count}") + + apps_v1_api = client.AppsV1Api() + + scale = client.V1Scale( + metadata=client.V1ObjectMeta(name=name, namespace=namespace), + spec=client.V1ScaleSpec(replicas=int(replica_count)) + ) + apps_v1_api.patch_namespaced_stateful_set_scale(name=name, namespace=namespace, body=scale) + + statefulset = get_statefulset(name, namespace) + assert statefulset.spec.replicas == int(replica_count) + + def create_pvc(volume_type, option): filepath = "./templates/workload/pvc.yaml" with open(filepath, 'r') as f: @@ -185,6 +221,7 @@ def create_pvc(volume_type, option): return pvc.metadata.name + def delete_pvc(name, namespace='default'): api = client.CoreV1Api() try: @@ -208,6 +245,7 @@ def delete_pvc(name, namespace='default'): time.sleep(retry_interval) assert deleted + def get_workload_pod_names(workload_name): api = client.CoreV1Api() label_selector = f"app={workload_name}" @@ -219,6 +257,7 @@ def get_workload_pod_names(workload_name): pod_names.append(pod.metadata.name) return pod_names + def get_workload_pods(workload_name): api = client.CoreV1Api() label_selector = f"app={workload_name}" @@ -227,6 +266,7 @@ 
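Together with the new statefulset helpers above, the offline expansion flow exercised by the stress suites further down can be sketched in plain Python. This is only an illustration of how the pieces compose, assuming the longhorn-test StorageClass used by the workload templates is already installed; the Robot keywords wire up the same calls:

from persistent_volume_claim import PersistentVolumeClaim
from volume import Volume
from workload.workload import create_statefulset
from workload.workload import get_workload_pvc_name
from workload.workload import get_workload_volume_name
from workload.workload import scale_statefulset
from workload.workload import wait_for_statefulset_replicas_ready

MEBIBYTE = 1024 * 1024

statefulset_name = create_statefulset(volume_type="rwo", option="")
pvc_name = get_workload_pvc_name(statefulset_name)
volume_name = get_workload_volume_name(statefulset_name)

# Offline expansion: detach by scaling to zero, grow the claim, wait for the
# engine to report the new size, then scale back up and wait for the replica.
scale_statefulset(statefulset_name, 0)
Volume().wait_for_volume_detached(volume_name)

expanded_size = PersistentVolumeClaim().expand(pvc_name, 100 * MEBIBYTE)
Volume().wait_for_volume_expand_to_size(volume_name, expanded_size)

scale_statefulset(statefulset_name, 1)
wait_for_statefulset_replicas_ready(statefulset_name, 1)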
def get_workload_pods(workload_name): label_selector=label_selector) return resp.items + def get_workload_volume_name(workload_name): api = client.CoreV1Api() pvc_name = get_workload_pvc_name(workload_name) @@ -234,6 +274,7 @@ def get_workload_volume_name(workload_name): name=pvc_name, namespace='default') return pvc.spec.volume_name + def get_workload_pvc_name(workload_name): api = client.CoreV1Api() pod = get_workload_pods(workload_name)[0] @@ -245,6 +286,7 @@ def get_workload_pvc_name(workload_name): assert pvc_name return pvc_name + def write_pod_random_data(pod_name, size_in_mb, path="/data/random-data"): api = client.CoreV1Api() write_cmd = [ @@ -258,6 +300,7 @@ def write_pod_random_data(pod_name, size_in_mb, path="/data/random-data"): command=write_cmd, stderr=True, stdin=False, stdout=True, tty=False) + def keep_writing_pod_data(pod_name, size_in_mb=256, path="/data/overwritten-data"): api = client.CoreV1Api() write_cmd = [ @@ -273,7 +316,9 @@ def keep_writing_pod_data(pod_name, size_in_mb=256, path="/data/overwritten-data logging(f"Created process to keep writing pod {pod_name}") return res -def check_pod_data(pod_name, checksum, path="/data/random-data"): + +def check_pod_data_checksum(pod_name, checksum, path="/data/random-data"): + logging(f"Checking pod {pod_name} data checksum") api = client.CoreV1Api() cmd = [ '/bin/sh', @@ -284,9 +329,10 @@ def check_pod_data(pod_name, checksum, path="/data/random-data"): api.connect_get_namespaced_pod_exec, pod_name, 'default', command=cmd, stderr=True, stdin=False, stdout=True, tty=False) - logging(f"Got {path} checksum = {_checksum},\ - expected checksum = {checksum}") - assert _checksum == checksum + assert _checksum == checksum, \ + f"Got {path} checksum = {_checksum}\n" \ + f"Expected checksum = {checksum}" + def wait_for_workload_pod_stable(workload_name): stable_pod = None @@ -307,4 +353,4 @@ def wait_for_workload_pod_stable(workload_name): if wait_for_stable_retry == WAIT_FOR_POD_STABLE_MAX_RETRY: return stable_pod time.sleep(retry_interval) - assert False \ No newline at end of file + assert False diff --git a/e2e/templates/workload/pvc.yaml b/e2e/templates/workload/pvc.yaml index 8671b755b1..f42eef2b2c 100644 --- a/e2e/templates/workload/pvc.yaml +++ b/e2e/templates/workload/pvc.yaml @@ -9,4 +9,4 @@ spec: storageClassName: longhorn-test resources: requests: - storage: 3Gi \ No newline at end of file + storage: 3221225472 #3Gi diff --git a/e2e/templates/workload/statefulset.yaml b/e2e/templates/workload/statefulset.yaml index 4688d76f48..71d36a5aad 100644 --- a/e2e/templates/workload/statefulset.yaml +++ b/e2e/templates/workload/statefulset.yaml @@ -36,4 +36,4 @@ spec: storageClassName: longhorn-test resources: requests: - storage: 3Gi + storage: 3221225472 #3Gi diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index 6664b4d6eb..830b674d5b 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -36,27 +36,3 @@ Reboot Replica Node While Replica Rebuilding Then Wait until replica on replica node rebuilt And Check data is intact END - -Stress Volume Node CPU While Replica Rebuilding - Given Create a volume with 5 GB and 3 replicas - And Write data to the volume - - FOR ${i} IN RANGE ${LOOP_COUNT} - When Delete replica on volume node to trigger replica rebuilding - And During replica rebuilding, stress volume node cpu - - Then Wait until replica on volume node rebuilt - And Check data is intact - END - -Stress Volume Node Memory While Replica Rebuilding - Given 
Create a volume with 5 GB and 3 replicas - And Write data to the volume - - FOR ${i} IN RANGE ${LOOP_COUNT} - When Delete replica on volume node to trigger replica rebuilding - And During replica rebuilding, stress volume node memory - - Then Wait until replica on volume node rebuilt - And Check data is intact - END diff --git a/e2e/tests/stress_cpu.robot b/e2e/tests/stress_cpu.robot new file mode 100644 index 0000000000..b7f7998693 --- /dev/null +++ b/e2e/tests/stress_cpu.robot @@ -0,0 +1,76 @@ +*** Settings *** +Documentation Negative Test Cases +Resource ../keywords/stress.resource +Resource ../keywords/volume.resource +Resource ../keywords/workload.resource +Resource ../keywords/common.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** + +Stress Volume Node CPU When Replica Is Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And Stress the CPU of all volume nodes + + Then Wait until replica on volume node rebuilt + And Check data is intact + END + +Stress Volume Node CPU When Volume Is Detaching and Attaching + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Stress the CPU of all volume nodes + + And Detach volume from node + And Attach volume to node + + And Check data is intact + END + +Stress Volume Node CPU When Volume Is Online Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Stress the CPU of all volume nodes + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Check statefulset 0 data is intact + END + +Stress Volume Node CPU When Volume Is Offline Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Scale down statefulset 0 to detach volume + And Stress the CPU of all worker nodes + + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Wait for statefulset 0 volume detached + + And Scale up statefulset 0 to attach volume + And Check statefulset 0 data is intact + END diff --git a/e2e/tests/stress_memory.robot b/e2e/tests/stress_memory.robot new file mode 100644 index 0000000000..8054b967a1 --- /dev/null +++ b/e2e/tests/stress_memory.robot @@ -0,0 +1,76 @@ +*** Settings *** +Documentation Negative Test Cases +Resource ../keywords/stress.resource +Resource ../keywords/volume.resource +Resource ../keywords/workload.resource +Resource ../keywords/common.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** + +Stress Volume Node Memory When Replica Is Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And Stress the memory of all volume nodes + + Then Wait until replica on volume node rebuilt + And Check data is intact + END + +Stress 
Volume Node Memory When Volume Is Detaching and Attaching + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Stress the memory of all volume nodes + + And Detach volume from node + And Attach volume to node + + And Check data is intact + END + +Stress Volume Node Memory When Volume Is Online Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Stress the memory of all volume nodes + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Check statefulset 0 data is intact + END + +Stress Volume Node Memory When Volume Is Offline Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Scale down statefulset 0 to detach volume + And Stress the memory of all worker nodes + + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Wait for statefulset 0 volume detached + + And Scale up statefulset 0 to attach volume + And Check statefulset 0 data is intact + END diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 6f3c45b7f3..8541f5edd9 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -42,5 +42,6 @@ ADD pipelines/helm/scripts/upgrade-longhorn.sh ./pipelines/helm/scripts/upgrade- ADD pipelines/rancher/scripts/upgrade-longhorn.sh ./pipelines/rancher/scripts/upgrade-longhorn.sh ADD pipelines/flux/scripts/upgrade-longhorn.sh ./pipelines/flux/scripts/upgrade-longhorn.sh ADD pipelines/argocd/scripts/upgrade-longhorn.sh ./pipelines/argocd/scripts/upgrade-longhorn.sh +ADD pipelines/fleet/scripts/upgrade-longhorn.sh ./pipelines/fleet/scripts/upgrade-longhorn.sh ENTRYPOINT ["./run.sh"] diff --git a/manager/integration/deploy/test.yaml b/manager/integration/deploy/test.yaml index 7ab9b22f73..eb3a44c8f5 100644 --- a/manager/integration/deploy/test.yaml +++ b/manager/integration/deploy/test.yaml @@ -55,6 +55,8 @@ spec: fieldPath: spec.nodeName - name: MANAGED_K8S_CLUSTER value: "false" + - name: RESOURCE_SUFFIX + value: "" volumeMounts: - name: dev mountPath: /dev diff --git a/manager/integration/pytest.ini b/manager/integration/pytest.ini index a19f7de1a8..d7622d68c0 100644 --- a/manager/integration/pytest.ini +++ b/manager/integration/pytest.ini @@ -17,3 +17,4 @@ markers = system_backup_restore cluster_autoscaler long_running + volume_backup_restore diff --git a/manager/integration/tests/aws.py b/manager/integration/tests/aws.py index 7614ad0c49..0328520567 100644 --- a/manager/integration/tests/aws.py +++ b/manager/integration/tests/aws.py @@ -32,12 +32,17 @@ def __init__(self): region_name=default_region) def instance_id_by_ip(self, instance_ip): + resource_suffix = os.getenv("RESOURCE_SUFFIX") response = aws.ec2_client.describe_instances( Filters=[ { 'Name': 'private-ip-address', 'Values': [instance_ip] }, + { + 'Name': 'tag:Name', + 'Values': [f"*{resource_suffix}*"] + } ], ) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index a5a463331e..28503ae13e 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -66,6 +66,7 @@ RETRY_COUNTS_SHORT = 30 RETRY_COUNTS_LONG = 360 
RETRY_INTERVAL = 1 +RETRY_INTERVAL_SHORT = 0.5 RETRY_INTERVAL_LONG = 2 RETRY_BACKUP_COUNTS = 300 RETRY_BACKUP_INTERVAL = 1 @@ -81,6 +82,7 @@ UPGRADE_TEST_IMAGE_PREFIX = "longhornio/longhorn-test:upgrade-test" ISCSI_DEV_PATH = "/dev/disk/by-path" +ISCSI_PROCESS = "iscsid" VOLUME_FIELD_STATE = "state" VOLUME_STATE_ATTACHED = "attached" @@ -112,6 +114,7 @@ DEFAULT_POD_INTERVAL = 1 DEFAULT_POD_TIMEOUT = 180 +POD_DELETION_TIMEOUT = 600 DEFAULT_STATEFULSET_INTERVAL = 1 DEFAULT_STATEFULSET_TIMEOUT = 180 @@ -120,11 +123,10 @@ DEFAULT_DEPLOYMENT_TIMEOUT = 240 WAIT_FOR_POD_STABLE_MAX_RETRY = 90 - DEFAULT_VOLUME_SIZE = 3 # In Gi EXPANDED_VOLUME_SIZE = 4 # In Gi -DIRECTORY_PATH = '/tmp/longhorn-test/' +DIRECTORY_PATH = '/var/lib/longhorn/longhorn-test/' VOLUME_CONDITION_SCHEDULED = "Scheduled" VOLUME_CONDITION_RESTORE = "Restore" @@ -167,6 +169,8 @@ "allow-volume-creation-with-degraded-availability" SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE = \ "disable-scheduling-on-cordoned-node" +SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED = \ + "detach-manually-attached-volumes-when-cordoned" SETTING_GUARANTEED_INSTANCE_MANAGER_CPU = "guaranteed-instance-manager-cpu" SETTING_PRIORITY_CLASS = "priority-class" SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED = \ @@ -207,6 +211,11 @@ SETTING_BACKUP_COMPRESSION_METHOD = "backup-compression-method" SETTING_BACKUP_CONCURRENT_LIMIT = "backup-concurrent-limit" SETTING_RESTORE_CONCURRENT_LIMIT = "restore-concurrent-limit" +SETTING_V1_DATA_ENGINE = "v1-data-engine" +SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME = \ + "allow-empty-node-selector-volume" +SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY = "replica-disk-soft-anti-affinity" +SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME = "allow-empty-disk-selector-volume" DEFAULT_BACKUP_COMPRESSION_METHOD = "lz4" BACKUP_COMPRESSION_METHOD_LZ4 = "lz4" @@ -282,10 +291,15 @@ FS_TYPE_EXT4 = "ext4" FS_TYPE_XFS = "xfs" +ACCESS_MODE_RWO = "rwo" +ACCESS_MODE_RWX = "rwx" + ATTACHER_TYPE_CSI_ATTACHER = "csi-attacher" ATTACHER_TYPE_LONGHORN_API = "longhorn-api" ATTACHER_TYPE_LONGHORN_UPGRADER = "longhorn-upgrader" +HOST_PROC_DIR = "/host/proc" + # customize the timeout for HDD disktype = os.environ.get('LONGHORN_DISK_TYPE') if disktype == "hdd": @@ -395,9 +409,9 @@ def create_volume_and_backup(client, vol_name, vol_size, backup_data_size): client.create_volume(name=vol_name, numberOfReplicas=1, size=str(vol_size)) - backup_volume = wait_for_volume_detached(client, vol_name) - backup_volume.attach(hostId=get_self_host_id()) - backup_volume = wait_for_volume_healthy(client, vol_name) + volume = wait_for_volume_detached(client, vol_name) + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, vol_name) data = {'pos': 0, 'len': backup_data_size, @@ -405,7 +419,7 @@ def create_volume_and_backup(client, vol_name, vol_size, backup_data_size): _, backup, _, _ = create_backup(client, vol_name, data) - return backup_volume, backup + return volume, backup def create_backup(client, volname, data={}, labels={}): @@ -486,7 +500,8 @@ def delete_backup_volume(client, volume_name): def create_and_check_volume(client, volume_name, num_of_replicas=3, size=SIZE, backing_image="", frontend=VOLUME_FRONTEND_BLOCKDEV, - snapshot_data_integrity=SNAPSHOT_DATA_INTEGRITY_IGNORED): # NOQA + snapshot_data_integrity=SNAPSHOT_DATA_INTEGRITY_IGNORED, # NOQA + access_mode=ACCESS_MODE_RWO): """ Create a new volume with the specified parameters. Assert that the new volume is detached and that all of the requested parameters match. 
@@ -505,7 +520,8 @@ def create_and_check_volume(client, volume_name, client.create_volume(name=volume_name, size=size, numberOfReplicas=num_of_replicas, backingImage=backing_image, frontend=frontend, - snapshotDataIntegrity=snapshot_data_integrity) + snapshotDataIntegrity=snapshot_data_integrity, + accessMode=access_mode) volume = wait_for_volume_detached(client, volume_name) assert volume.name == volume_name assert volume.size == size @@ -523,11 +539,14 @@ def wait_pod(pod_name): pod = None for i in range(DEFAULT_POD_TIMEOUT): - pod = api.read_namespaced_pod( - name=pod_name, - namespace='default') - if pod is not None and pod.status.phase != 'Pending': - break + try: + pod = api.read_namespaced_pod( + name=pod_name, + namespace='default') + if pod is not None and pod.status.phase != 'Pending': + break + except Exception as e: + print(f"Waiting for pod {pod_name} failed: {e}") time.sleep(DEFAULT_POD_INTERVAL) assert pod is not None and pod.status.phase == 'Running' @@ -927,7 +946,7 @@ def size_to_string(volume_size): def wait_delete_pod(api, pod_uid, namespace='default'): - for i in range(DEFAULT_POD_TIMEOUT): + for i in range(POD_DELETION_TIMEOUT): ret = api.list_namespaced_pod(namespace=namespace) found = False for item in ret.items: @@ -1536,31 +1555,33 @@ def finalizer(): @pytest.fixture def crypto_secret(request): - manifest = { - 'apiVersion': 'v1', - 'kind': 'Secret', - 'metadata': { - 'name': 'longhorn-crypto', - 'namespace': 'longhorn-system', - }, - 'stringData': { - 'CRYPTO_KEY_VALUE': 'simple', - 'CRYPTO_KEY_PROVIDER': 'secret' + def get_crypto_secret(namespace=LONGHORN_NAMESPACE): + crypto_secret.manifest = { + 'apiVersion': 'v1', + 'kind': 'Secret', + 'metadata': { + 'name': 'longhorn-crypto', + 'namespace': namespace, + }, + 'stringData': { + 'CRYPTO_KEY_VALUE': 'simple', + 'CRYPTO_KEY_PROVIDER': 'secret' + } } - } + return crypto_secret.manifest def finalizer(): api = get_core_api_client() try: api.delete_namespaced_secret( - name=manifest['metadata']['name'], - namespace=manifest['metadata']['namespace']) + name=crypto_secret.manifest['metadata']['name'], + namespace=crypto_secret.manifest['metadata']['namespace']) except ApiException as e: assert e.status == 404 request.addfinalizer(finalizer) - return manifest + return get_crypto_secret @pytest.fixture @@ -1677,6 +1698,14 @@ def client(request): request.addfinalizer(lambda: cleanup_client()) + if not os.path.exists(DIRECTORY_PATH): + try: + os.makedirs(DIRECTORY_PATH) + except OSError as e: + raise Exception( + f"Failed to create directory {DIRECTORY_PATH}: {e}" + ) + cleanup_client() return client @@ -1766,12 +1795,11 @@ def get_mgr_ips(): def get_self_host_id(): - envs = os.environ - return envs["NODE_NAME"] + return os.environ.get("NODE_NAME") def get_backupstore_url(): - backupstore = os.environ['LONGHORN_BACKUPSTORES'] + backupstore = os.environ.get("LONGHORN_BACKUPSTORES", "") backupstore = backupstore.replace(" ", "") backupstores = backupstore.split(",") @@ -1780,18 +1808,13 @@ def get_backupstore_url(): def get_backupstore_poll_interval(): - poll_interval = os.environ['LONGHORN_BACKUPSTORE_POLL_INTERVAL'] + poll_interval = os.environ.get("LONGHORN_BACKUPSTORE_POLL_INTERVAL", "") assert len(poll_interval) != 0 return poll_interval def get_backupstores(): - # The try is added to avoid the pdoc3 error while publishing this on - # https://longhorn.github.io/longhorn-tests - try: - backupstore = os.environ['LONGHORN_BACKUPSTORES'] - except KeyError: - return [] + backupstore = os.environ.get("LONGHORN_BACKUPSTORES", 
"") try: backupstore = backupstore.replace(" ", "") @@ -2163,7 +2186,7 @@ def wait_for_engine_image_creation(client, image_name): break if found: break - time.sleep(RETRY_INTERVAL) + time.sleep(RETRY_INTERVAL_SHORT) assert found @@ -2178,16 +2201,39 @@ def wait_for_engine_image_state(client, image_name, state): return image +def wait_for_engine_image_incompatible(client, image_name): + wait_for_engine_image_creation(client, image_name) + for i in range(RETRY_COUNTS): + image = client.by_id_engine_image(image_name) + if image.incompatible: + break + time.sleep(RETRY_INTERVAL) + assert image.incompatible + return image + + def wait_for_engine_image_condition(client, image_name, state): """ state: "True", "False" """ + # Indicate many times we want to see the ENGINE_NAME in the STATE. + # This helps to prevent the flaky test case in which the ENGINE_NAME + # is flapping between ready and not ready a few times before settling + # down to the ready state + # https://github.com/longhorn/longhorn-tests/pull/1638 + state_count = 1 + if state == "True": + state_count = 5 + + c = 0 for i in range(RETRY_COUNTS): wait_for_engine_image_creation(client, image_name) image = client.by_id_engine_image(image_name) if image['conditions'][0]['status'] == state: - break - time.sleep(RETRY_INTERVAL_LONG) + c += 1 + if c >= state_count: + break + time.sleep(RETRY_INTERVAL_SHORT) assert image['conditions'][0]['status'] == state return image @@ -2307,13 +2353,18 @@ def crash_replica_processes(client, api, volname, replicas=None, for r in replicas: assert r.instanceManagerName != "" - kill_command = "kill `pgrep -f " + r['dataPath'] + "`" + + pgrep_command = f"pgrep -f {r['dataPath']}" + pid = exec_instance_manager(api, r.instanceManagerName, pgrep_command) + assert pid != "" + + kill_command = f"kill {pid}" exec_instance_manager(api, r.instanceManagerName, kill_command) if wait_to_fail is True: thread = create_assert_error_check_thread( wait_for_replica_failed, - client, volname, r['name'], RETRY_COUNTS*2, RETRY_INTERVAL/2 + client, volname, r['name'], RETRY_COUNTS, RETRY_INTERVAL_SHORT ) threads.append(thread) @@ -2326,10 +2377,11 @@ def exec_instance_manager(api, im_name, cmd): with timeout(seconds=STREAM_EXEC_TIMEOUT, error_message='Timeout on executing stream read'): - stream(api.connect_get_namespaced_pod_exec, - im_name, - LONGHORN_NAMESPACE, command=exec_cmd, - stderr=True, stdin=False, stdout=True, tty=False) + output = stream(api.connect_get_namespaced_pod_exec, + im_name, + LONGHORN_NAMESPACE, command=exec_cmd, + stderr=True, stdin=False, stdout=True, tty=False) + return output def wait_for_replica_failed(client, volname, replica_name, @@ -2639,11 +2691,22 @@ def get_iscsi_lun(iscsi): return iscsi_endpoint[2] -def exec_nsenter(cmd): - dockerd_pid = find_dockerd_pid() or "1" - exec_cmd = ["nsenter", "--mount=/host/proc/{}/ns/mnt".format(dockerd_pid), - "--net=/host/proc/{}/ns/net".format(dockerd_pid), - "bash", "-c", cmd] +def exec_nsenter(cmd, process_name=None): + if process_name: + proc_pid = find_process_pid(process_name) + cmd_parts = cmd.split() + else: + proc_pid = find_dockerd_pid() or "1" + cmd_parts = ["bash", "-c", cmd] + + exec_cmd = ["nsenter", "--mount=/host/proc/{}/ns/mnt".format(proc_pid), + "--net=/host/proc/{}/ns/net".format(proc_pid)] + exec_cmd.extend(cmd_parts) + return subprocess.check_output(exec_cmd) + + +def exec_local(cmd): + exec_cmd = cmd.split() return subprocess.check_output(exec_cmd) @@ -2654,10 +2717,10 @@ def iscsi_login(iscsi_ep): lun = get_iscsi_lun(iscsi_ep) # discovery 
cmd_discovery = "iscsiadm -m discovery -t st -p " + ip - exec_nsenter(cmd_discovery) + exec_nsenter(cmd_discovery, ISCSI_PROCESS) # login cmd_login = "iscsiadm -m node -T " + target + " -p " + ip + " --login" - exec_nsenter(cmd_login) + exec_nsenter(cmd_login, ISCSI_PROCESS) blk_name = "ip-%s:%s-iscsi-%s-lun-%s" % (ip, port, target, lun) wait_for_device_login(ISCSI_DEV_PATH, blk_name) dev = os.path.realpath(ISCSI_DEV_PATH + "/" + blk_name) @@ -2668,9 +2731,9 @@ def iscsi_logout(iscsi_ep): ip = get_iscsi_ip(iscsi_ep) target = get_iscsi_target(iscsi_ep) cmd_logout = "iscsiadm -m node -T " + target + " -p " + ip + " --logout" - exec_nsenter(cmd_logout) + exec_nsenter(cmd_logout, ISCSI_PROCESS) cmd_rm_discovery = "iscsiadm -m discovery -p " + ip + " -o delete" - exec_nsenter(cmd_rm_discovery) + exec_nsenter(cmd_rm_discovery, ISCSI_PROCESS) def get_process_info(p_path): @@ -2707,6 +2770,40 @@ def find_dockerd_pid(): return find_ancestor_process_by_name("dockerd") +def find_process_pid(process_name): + for file in os.listdir(HOST_PROC_DIR): + if not os.path.isdir(os.path.join(HOST_PROC_DIR, file)): + continue + + # Check if file name is an integer + if not file.isdigit(): + continue + + with open(os.path.join(HOST_PROC_DIR, file, 'status'), 'r') as file: + status_content = file.readlines() + + proc_status_content = None + name_pattern = re.compile(r'^Name:\s+(.+)$') + + for line in status_content: + name_match = name_pattern.match(line) + if name_match and name_match.group(1) == process_name: + proc_status_content = status_content + break + + if proc_status_content is None: + continue + + pid_pattern = re.compile(r'^Pid:\s+(\d+)$') + + for line in proc_status_content: + pid_match = pid_pattern.match(line) + if pid_match: + return int(pid_match.group(1)) + + raise Exception(f"Failed to find the {process_name} PID") + + def generate_random_pos(size, used={}): for i in range(RETRY_COUNTS): pos = 0 @@ -3271,7 +3368,11 @@ def get_k8s_zone_label(): def cleanup_test_disks(client): - del_dirs = os.listdir(DIRECTORY_PATH) + try: + del_dirs = os.listdir(DIRECTORY_PATH) + except FileNotFoundError: + del_dirs = [] + host_id = get_self_host_id() node = client.by_id_node(host_id) disks = node.disks @@ -3399,6 +3500,13 @@ def reset_settings(client): # resetting this to an empty default value. if setting_name == "storage-network": continue + # The test CI deploys Longhorn with the setting value longhorn-critical + # for the setting priority-class. Don't reset it to empty (which is + # the default value defined in longhorn-manager code) because this will + # restart Longhorn managed components and fail the test cases. + # https://github.com/longhorn/longhorn/issues/7413#issuecomment-1881707958 + if setting.name == SETTING_PRIORITY_CLASS: + continue # The version of the support bundle kit will be specified by a command # option when starting the manager. And setting requires a value. 
@@ -3533,7 +3641,7 @@ def wait_for_all_instance_manager_running(client): node_to_instance_manager_map = {} try: for im in instance_managers: - if im.managerType == "aio" and im.currentState == "running": + if im.managerType == "aio": node_to_instance_manager_map[im.nodeID] = im else: print("\nFound unknown instance manager:", im) @@ -3651,7 +3759,7 @@ def find_backup(client, vol_name, snap_name): def find_backup_volume(): bvs = client.list_backupVolume() for bv in bvs: - if bv.name == vol_name: + if bv.name == vol_name and bv.created != "": return bv return None @@ -3828,9 +3936,9 @@ def wait_statefulset(statefulset_manifest): assert s_set.status.ready_replicas == replicas -def create_crypto_secret(secret_manifest): +def create_crypto_secret(secret_manifest, namespace=LONGHORN_NAMESPACE): api = get_core_api_client() - api.create_namespaced_secret(namespace=LONGHORN_NAMESPACE, + api.create_namespaced_secret(namespace, body=secret_manifest) @@ -4063,17 +4171,17 @@ def create_pv_for_volume(client, core_api, volume, pv_name, fs_type="ext4"): wait_volume_kubernetes_status(client, volume.name, ks) -def create_pvc_for_volume(client, core_api, volume, pvc_name): - volume.pvcCreate(namespace="default", pvcName=pvc_name) +def create_pvc_for_volume(client, core_api, volume, pvc_name, pvc_namespace="default"): # NOQA + volume.pvcCreate(namespace=pvc_namespace, pvcName=pvc_name) for i in range(RETRY_COUNTS): - if check_pvc_existence(core_api, pvc_name): + if check_pvc_existence(core_api, pvc_name, pvc_namespace): break time.sleep(RETRY_INTERVAL) - assert check_pvc_existence(core_api, pvc_name) + assert check_pvc_existence(core_api, pvc_name, pvc_namespace) ks = { 'pvStatus': 'Bound', - 'namespace': 'default', + 'namespace': pvc_namespace, 'lastPVCRefAt': '', } wait_volume_kubernetes_status(client, volume.name, ks) @@ -4888,7 +4996,8 @@ def prepare_statefulset_with_data_in_mb( def prepare_pod_with_data_in_mb( client, core_api, csi_pv, pvc, pod_make, volume_name, volume_size=str(1*Gi), num_of_replicas=3, data_path="/data/test", - data_size_in_mb=DATA_SIZE_IN_MB_1, add_liveness_probe=True):# NOQA: + data_size_in_mb=DATA_SIZE_IN_MB_1, add_liveness_probe=True, + access_mode=ACCESS_MODE_RWO):# NOQA: pod_name = volume_name + "-pod" pv_name = volume_name @@ -4913,7 +5022,8 @@ def prepare_pod_with_data_in_mb( create_and_check_volume(client, volume_name, num_of_replicas=num_of_replicas, - size=volume_size) + size=volume_size, + access_mode=access_mode) core_api.create_persistent_volume(csi_pv) core_api.create_namespaced_persistent_volume_claim( body=pvc, namespace='default') @@ -4976,11 +5086,14 @@ def wait_for_pod_restart(core_api, pod_name, namespace="default"): def wait_for_pod_phase(core_api, pod_name, pod_phase, namespace="default"): is_phase = False for _ in range(RETRY_COUNTS): - pod = core_api.read_namespaced_pod(name=pod_name, - namespace=namespace) - if pod.status.phase == pod_phase: - is_phase = True - break + try: + pod = core_api.read_namespaced_pod(name=pod_name, + namespace=namespace) + if pod.status.phase == pod_phase: + is_phase = True + break + except Exception as e: + print(f"Waiting for pod {pod_name} {pod_phase} failed: {e}") time.sleep(RETRY_INTERVAL_LONG) assert is_phase @@ -5999,3 +6112,13 @@ def create_volume_and_write_data(client, volume_name, volume_size=SIZE): volume_data = write_volume_random_data(volume) return volume, volume_data + + +def wait_for_instance_manager_count(client, number, retry_counts=120): + for _ in range(retry_counts): + ims = client.list_instance_manager() + if 
len(ims) == number: + break + time.sleep(RETRY_INTERVAL_LONG) + + return len(ims) diff --git a/manager/integration/tests/requirements.txt b/manager/integration/tests/requirements.txt index e7e5e58a6f..51fdbdb828 100644 --- a/manager/integration/tests/requirements.txt +++ b/manager/integration/tests/requirements.txt @@ -11,3 +11,4 @@ six==1.12.0 minio==5.0.10 pyyaml==5.4.1 pandas +prometheus_client diff --git a/manager/integration/tests/run.sh b/manager/integration/tests/run.sh index e37745dd24..8eb3b99115 100755 --- a/manager/integration/tests/run.sh +++ b/manager/integration/tests/run.sh @@ -1,3 +1,4 @@ #!/bin/bash +export PYTHONUNBUFFERED=1 pytest -v "$@" diff --git a/manager/integration/tests/test_backing_image.py b/manager/integration/tests/test_backing_image.py index 2c2285d253..5fac7c272b 100644 --- a/manager/integration/tests/test_backing_image.py +++ b/manager/integration/tests/test_backing_image.py @@ -38,10 +38,14 @@ from common import wait_for_backing_image_status from common import wait_for_backing_image_in_disk_fail from common import get_disk_uuid +from common import write_volume_dev_random_mb_data, get_device_checksum +from common import check_backing_image_disk_map_status from common import LONGHORN_NAMESPACE, RETRY_EXEC_COUNTS, RETRY_INTERVAL from common import BACKING_IMAGE_QCOW2_CHECKSUM from common import BACKING_IMAGE_STATE_READY from common import BACKING_IMAGE_STATE_FAILED_AND_CLEANUP +from common import BACKING_IMAGE_STATE_IN_PROGRESS +from common import RETRY_COUNTS_LONG import time @@ -73,9 +77,10 @@ def backing_image_basic_operation_test(client, volume_name, bi_name, bi_url): # 8. Delete the backing image. """ - volume = create_and_check_volume( - client, volume_name, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) lht_host_id = get_self_host_id() volume.attach(hostId=lht_host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -140,9 +145,10 @@ def backing_image_content_test(client, volume_name_prefix, bi_name, bi_url): # lht_host_id = get_self_host_id() volume_name1 = volume_name_prefix + "-1" - volume1 = create_and_check_volume( - client, volume_name1, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume1 = create_and_check_volume(client, volume_name1, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) volume1.attach(hostId=lht_host_id) volume1 = wait_for_volume_healthy(client, volume_name1) assert volume1.backingImage == bi_name @@ -172,9 +178,10 @@ def backing_image_content_test(client, volume_name_prefix, bi_name, bi_url): # check_volume_data(volume1, data) volume_name2 = volume_name_prefix + "-2" - volume2 = create_and_check_volume( - client, volume_name2, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume2 = create_and_check_volume(client, volume_name2, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) volume2.attach(hostId=lht_host_id) volume2 = wait_for_volume_healthy(client, volume_name2) assert volume1.backingImage == bi_name @@ -524,10 +531,10 @@ def test_backing_image_auto_resync(bi_url, client, volume_name): # NOQA client, BACKING_IMAGE_NAME, bi_url) # Step 2 - volume = create_and_check_volume( - client, volume_name, 3, - str(BACKING_IMAGE_EXT4_SIZE), - BACKING_IMAGE_NAME) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=BACKING_IMAGE_NAME) # Step 3 lht_host_id 
= get_self_host_id() @@ -581,13 +588,13 @@ def backing_image_cleanup(core_api, client): # NOQA # Step 2 lht_host_id = get_self_host_id() - volume1 = create_and_check_volume( - client, volume_name="vol-1", size=str(1 * Gi), - backing_image=backing_img1_name) + volume1 = create_and_check_volume(client, "vol-1", + size=str(1 * Gi), + backing_image=backing_img1_name) - volume2 = create_and_check_volume( - client, volume_name="vol-2", size=str(1 * Gi), - backing_image=backing_img2_name) + volume2 = create_and_check_volume(client, "vol-2", + size=str(1 * Gi), + backing_image=backing_img2_name) # Step 3 volume1.attach(hostId=lht_host_id) @@ -630,3 +637,63 @@ def test_backing_image_with_wrong_md5sum(bi_url, client): # NOQA wait_for_backing_image_status(client, BACKING_IMAGE_NAME, BACKING_IMAGE_STATE_FAILED_AND_CLEANUP) + + +def test_volume_wait_for_backing_image_condition(client): # NOQA + """ + Test the volume condition "WaitForBackingImage" + + Given + - Create a BackingImage + + When + - Creating the Volume with the BackingImage while it is still in progress + + Then + - The condition "WaitForBackingImage" of the Volume + would be first True and then change to False when + the BackingImage is ready and all the replicas are in running state. + """ + # Create a large volume and export as backingimage + lht_host_id = get_self_host_id() + + volume1_name = "vol1" + volume1 = create_and_check_volume(client, volume1_name, + num_of_replicas=3, + size=str(1 * Gi)) + volume1.attach(hostId=lht_host_id) + volume1 = wait_for_volume_healthy(client, volume1_name) + volume_endpoint = get_volume_endpoint(volume1) + write_volume_dev_random_mb_data(volume_endpoint, 1, 500) + vol1_cksum = get_device_checksum(volume_endpoint) + + backing_img_name = 'bi-test' + backing_img = client.create_backing_image( + name=backing_img_name, + sourceType=BACKING_IMAGE_SOURCE_TYPE_FROM_VOLUME, + parameters={"export-type": "qcow2", "volume-name": volume1_name}, + expectedChecksum="") + + # Create volume with that backing image + volume2_name = "vol2" + volume2 = create_and_check_volume(client, volume2_name, + size=str(1 * Gi), + backing_image=backing_img["name"]) + + volume2.attach(hostId=lht_host_id) + + if check_backing_image_disk_map_status(client, + backing_img_name, + 1, + BACKING_IMAGE_STATE_IN_PROGRESS): + volume2 = client.by_id_volume(volume2_name) + assert volume2.conditions.WaitForBackingImage.status == "True" + + # Check volume healthy, and backing image ready + volume2 = wait_for_volume_healthy(client, volume2_name, RETRY_COUNTS_LONG) + assert volume2.conditions.WaitForBackingImage.status == "False" + check_backing_image_disk_map_status(client, backing_img_name, 3, "ready") + + volume_endpoint = get_volume_endpoint(volume2) + vol2_cksum = get_device_checksum(volume_endpoint) + assert vol1_cksum == vol2_cksum diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index 4088e9fe8d..bedd863e03 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -100,6 +100,7 @@ from common import BACKUP_COMPRESSION_METHOD_NONE from common import create_and_wait_deployment from common import get_custom_object_api_client +from common import RETRY_COUNTS_SHORT from backupstore import backupstore_delete_volume_cfg_file from backupstore import backupstore_cleanup @@ -270,8 +271,10 @@ def volume_basic_test(client, volume_name, backing_image=""): # NOQA numberOfReplicas=2, frontend="invalid_frontend") - volume = create_and_check_volume(client, volume_name, 
num_replicas, SIZE, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=num_replicas, + size=SIZE, + backing_image=backing_image) assert volume.restoreRequired is False def validate_volume_basic(expected, actual): @@ -348,8 +351,11 @@ def test_volume_iscsi_basic(client, volume_name): # NOQA def volume_iscsi_basic_test(client, volume_name, backing_image=""): # NOQA host_id = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 3, SIZE, - backing_image, VOLUME_FRONTEND_ISCSI) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=SIZE, + backing_image=backing_image, + frontend=VOLUME_FRONTEND_ISCSI) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -558,8 +564,10 @@ def test_backup_status_for_unavailable_replicas(set_random_backupstore, client, def backup_status_for_unavailable_replicas_test(client, volume_name, # NOQA size, backing_image=""): # NOQA - volume = create_and_check_volume(client, volume_name, 2, str(size), - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(size), + backing_image=backing_image) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -745,7 +753,9 @@ def test_dr_volume_activated_with_failed_replica(set_random_backupstore, client, backupstore_cleanup(client) host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -820,7 +830,9 @@ def test_dr_volume_with_backup_block_deletion(set_random_backupstore, client, co host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -917,7 +929,9 @@ def test_dr_volume_with_backup_block_deletion_abort_during_backup_in_progress(se host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -996,7 +1010,9 @@ def test_dr_volume_with_all_backup_blocks_deleted(set_random_backupstore, client host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -1321,8 +1337,10 @@ def test_backup(set_random_backupstore, client, volume_name): # NOQA def backup_test(client, volume_name, size, backing_image="", compression_method=DEFAULT_BACKUP_COMPRESSION_METHOD): # NOQA - volume = create_and_check_volume(client, volume_name, 2, size, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -1381,8 +1399,10 @@ def test_backup_labels(set_random_backupstore, client, random_labels, volume_nam def backup_labels_test(client, random_labels, volume_name, size=SIZE, backing_image=""): # NOQA host_id = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 2, size, - 
backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -1432,7 +1452,9 @@ def test_restore_inc(set_random_backupstore, client, core_api, volume_name, pod) def restore_inc_test(client, core_api, volume_name, pod): # NOQA - std_volume = create_and_check_volume(client, volume_name, 2, SIZE) + std_volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) lht_host_id = get_self_host_id() std_volume.attach(hostId=lht_host_id) std_volume = common.wait_for_volume_healthy(client, volume_name) @@ -1848,8 +1870,10 @@ def test_volume_multinode(client, volume_name): # NOQA assert len(volumes) == 0 -@pytest.mark.skip(reason="TODO") -def test_pvc_storage_class_name_from_backup_volume(): # NOQA +def test_pvc_storage_class_name_from_backup_volume(set_random_backupstore, # NOQA + core_api, client, volume_name, # NOQA + pvc_name, pvc, pod_make, # NOQA + storage_class): # NOQA """ Test the storageClasName of the restored volume's PV/PVC should be from the backup volume @@ -1860,14 +1884,13 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA kind: StorageClass apiVersion: storage.k8s.io/v1 metadata: - name: longhorn-sc-name-recorded + name: longhorn-test provisioner: driver.longhorn.io allowVolumeExpansion: true reclaimPolicy: Delete volumeBindingMode: Immediate parameters: numberOfReplicas: "3" - staleReplicaTimeout: "2880" ``` - Create a PVC to use this SC ``` @@ -1878,10 +1901,10 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA spec: accessModes: - ReadWriteOnce - storageClassName: longhorn-sc-name-recorded + storageClassName: longhorn-test resources: requests: - storage: 5Gi + storage: 300Mi ``` - Attach the Volume and write some data @@ -1890,17 +1913,89 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA Then - the backupvolume's status.storageClassName should be - longhorn-sc-name-recorded + longhorn-test When - Restore the backup to a new volume - Create PV/PVC from the new volume with create new PVC option Then - - The new PVC's storageClassName should still be longhorn-sc-name-recorded + - The new PVC's storageClassName should still be longhorn-test - Verify the restored data is the same as original one """ - pass + volume_size = str(300 * Mi) + create_storage_class(storage_class) + + pod_name = "pod-" + pvc_name + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['resources']['requests']['storage'] = volume_size + common.create_pvc(pvc) + + pv = common.wait_and_get_pv_for_pvc(core_api, pvc_name) + assert pv.status.phase == "Bound" + + test_pod = pod_make(pod_name) + test_pod['metadata']['name'] = pod_name + test_pod['spec']['volumes'] = [{ + 'name': test_pod['spec']['containers'][0]['volumeMounts'][0]['name'], + 'persistentVolumeClaim': {'claimName': pvc_name}, + }] + create_and_wait_pod(core_api, test_pod) + + test_data = generate_random_data(VOLUME_RWTEST_SIZE) + write_pod_volume_data(core_api, pod_name, test_data) + + volume_name = pv.spec.csi.volume_handle + volume_id = client.by_id_volume(volume_name) + snapshot = volume_id.snapshotCreate() + + volume_id.snapshotBackup(name=snapshot.name) + wait_for_backup_completion(client, volume_name, snapshot.name) + + # in nfs backupstore, bv.storageClassName sometimes were empty + # due to timing issue + for i in range(RETRY_COMMAND_COUNT): + bv, b 
= find_backup(client, volume_name, snapshot.name) + if bv.storageClassName != "": + break + time.sleep(RETRY_INTERVAL) + assert bv.storageClassName == storage_class['metadata']['name'] + + restore_name = generate_volume_name() + volume = client.create_volume(name=restore_name, size=volume_size, + numberOfReplicas=3, + fromBackup=b.url) + + volume = common.wait_for_volume_restoration_completed(client, restore_name) + volume = common.wait_for_volume_detached(client, restore_name) + assert volume.name == restore_name + assert volume.size == volume_size + assert volume.numberOfReplicas == 3 + assert volume.state == "detached" + + create_pv_for_volume(client, core_api, volume, restore_name) + create_pvc_for_volume(client, core_api, volume, restore_name) + + claim = core_api.\ + read_namespaced_persistent_volume_claim(name=restore_name, + namespace='default') + + assert claim.spec.storage_class_name == storage_class['metadata']['name'] + + backup_pod = pod_make(name="backup-pod") + restore_volume_pod_name = "pod-" + restore_name + backup_pod['metadata']['name'] = restore_volume_pod_name + backup_pod['spec']['volumes'] = [{ + 'name': backup_pod['spec']['containers'][0]['volumeMounts'][0]['name'], # NOQA + 'persistentVolumeClaim': { + 'claimName': restore_name, + }, + }] + create_and_wait_pod(core_api, backup_pod) + + resp = read_volume_data(core_api, restore_volume_pod_name) + assert resp == test_data @pytest.mark.coretest # NOQA @@ -2003,7 +2098,8 @@ def test_volume_update_replica_count(client, volume_name): # NOQA host_id = get_self_host_id() replica_count = 2 - volume = create_and_check_volume(client, volume_name, replica_count) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=replica_count) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -2108,11 +2204,8 @@ def test_storage_class_from_backup(set_random_backupstore, volume_name, pvc_name pv_name = pvc_name - volume = create_and_check_volume( - client, - volume_name, - size=VOLUME_SIZE - ) + volume = create_and_check_volume(client, volume_name, + size=VOLUME_SIZE) wait_for_volume_detached(client, volume_name) @@ -2330,7 +2423,9 @@ def test_expansion_with_size_round_up(client, core_api, volume_name): # NOQA 5. Check if size round up '2147483648' and the written data. 
""" - volume = create_and_check_volume(client, volume_name, 2, str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(1 * Gi)) self_hostId = get_self_host_id() volume.attach(hostId=self_hostId, disableFrontend=False) @@ -2392,7 +2487,9 @@ def test_restore_inc_with_offline_expansion(set_random_backupstore, client, core """ lht_host_id = get_self_host_id() - std_volume = create_and_check_volume(client, volume_name, 2, SIZE) + std_volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) std_volume.attach(hostId=lht_host_id) std_volume = common.wait_for_volume_healthy(client, volume_name) @@ -3077,7 +3174,8 @@ def test_backup_lock_deletion_during_restoration(set_random_backupstore, client, wait_for_backup_completion(client, std_volume_name, snap1.name) _, b = common.find_backup(client, std_volume_name, snap1.name) - client.create_volume(name=restore_volume_name, fromBackup=b.url) + client.create_volume(name=restore_volume_name, fromBackup=b.url, + numberOfReplicas=3) wait_for_volume_restoration_start(client, restore_volume_name, b.name) backup_volume = client.by_id_backupVolume(std_volume_name) @@ -3172,7 +3270,8 @@ def test_backup_lock_deletion_during_backup(set_random_backupstore, client, core b1 = None assert b1 is None - client.create_volume(name=restore_volume_name_1, fromBackup=b2.url) + client.create_volume(name=restore_volume_name_1, fromBackup=b2.url, + numberOfReplicas=3) wait_for_volume_restoration_completed(client, restore_volume_name_1) restore_volume_1 = wait_for_volume_detached(client, restore_volume_name_1) @@ -4118,7 +4217,9 @@ def test_expand_pvc_with_size_round_up(client, core_api, volume_name): # NOQA setting = client.update(setting, value=static_sc_name) assert setting.value == static_sc_name - volume = create_and_check_volume(client, volume_name, 2, str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(1 * Gi)) create_pv_for_volume(client, core_api, volume, volume_name) create_pvc_for_volume(client, core_api, volume, volume_name) @@ -5547,7 +5648,7 @@ def test_backuptarget_invalid(apps_api, # NOQA snap = create_snapshot(client, volume_name) volume.snapshotBackup(name=snap.name) - for i in range(RETRY_COMMAND_COUNT): + for i in range(RETRY_COUNTS_SHORT): api = get_custom_object_api_client() backups = api.list_namespaced_custom_object("longhorn.io", "v1beta2", diff --git a/manager/integration/tests/test_cluster_autoscaler.py b/manager/integration/tests/test_cluster_autoscaler.py index 1404719701..6a103674c0 100644 --- a/manager/integration/tests/test_cluster_autoscaler.py +++ b/manager/integration/tests/test_cluster_autoscaler.py @@ -147,9 +147,8 @@ def finalizer(): nodes = client.list_node() scale_size = len(nodes)-1 - volume = create_and_check_volume( - client, volume_name, num_of_replicas=scale_size - ) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=scale_size) create_pv_for_volume(client, core_api, volume, volume.name) create_pvc_for_volume(client, core_api, volume, volume.name) diff --git a/manager/integration/tests/test_csi.py b/manager/integration/tests/test_csi.py index d9e9e16c00..84930ffbb0 100644 --- a/manager/integration/tests/test_csi.py +++ b/manager/integration/tests/test_csi.py @@ -280,7 +280,8 @@ def test_csi_encrypted_block_volume(client, core_api, storage_class, crypto_secr 7. 
Validate the data in `pod2` is consistent with `test_data` """ - create_crypto_secret(crypto_secret) + secret = crypto_secret(LONGHORN_NAMESPACE) + create_crypto_secret(secret) storage_class['reclaimPolicy'] = 'Retain' storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-name'] = 'longhorn-crypto' # NOQA @@ -891,6 +892,7 @@ def test_csi_minimal_volume_size( csi_pv['metadata']['name'] = pv_name csi_pv['spec']['csi']['volumeHandle'] = vol_name csi_pv['spec']['capacity']['storage'] = min_storage + csi_pv['spec']['persistentVolumeReclaimPolicy'] = 'Retain' core_api.create_persistent_volume(csi_pv) pvc_name = vol_name + "-pvc" diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py index 073f69dac9..2c16d6a8c1 100644 --- a/manager/integration/tests/test_engine_upgrade.py +++ b/manager/integration/tests/test_engine_upgrade.py @@ -9,6 +9,7 @@ from common import wait_for_volume_detached from common import wait_for_engine_image_deletion from common import wait_for_engine_image_ref_count, wait_for_engine_image_state +from common import wait_for_engine_image_incompatible from common import get_volume_engine, write_volume_random_data from common import check_volume_endpoint from common import wait_for_volume_replicas_mode @@ -450,8 +451,8 @@ def test_engine_image_incompatible(client, core_api, volume_name): # NOQA ctl_v, ctl_minv, data_v, data_minv) img = client.create_engine_image(image=fail_cli_v_image) - img = wait_for_engine_image_state(client, img.name, "incompatible") - assert img.state == "incompatible" + img = wait_for_engine_image_incompatible(client, img.name) + assert img.incompatible assert img.cliAPIVersion == cli_minv - 1 assert img.cliAPIMinVersion == cli_minv - 1 client.delete(img) @@ -462,8 +463,8 @@ def test_engine_image_incompatible(client, core_api, volume_name): # NOQA ctl_v, ctl_minv, data_v, data_minv) img = client.create_engine_image(image=fail_cli_minv_image) - img = wait_for_engine_image_state(client, img.name, "incompatible") - assert img.state == "incompatible" + img = wait_for_engine_image_incompatible(client, img.name) + assert img.incompatible assert img.cliAPIVersion == cli_v + 1 assert img.cliAPIMinVersion == cli_v + 1 client.delete(img) diff --git a/manager/integration/tests/test_ha.py b/manager/integration/tests/test_ha.py index 0137916556..6c1a794515 100644 --- a/manager/integration/tests/test_ha.py +++ b/manager/integration/tests/test_ha.py @@ -95,6 +95,8 @@ SMALL_RETRY_COUNTS = 30 BACKUPSTORE = get_backupstores() +REPLICA_FAILURE_MODE_CRASH = "replica_failure_mode_crash" +REPLICA_FAILURE_MODE_DELETE = "replica_failure_mode_delete" @pytest.mark.coretest # NOQA def test_ha_simple_recovery(client, volume_name): # NOQA @@ -111,8 +113,10 @@ def test_ha_simple_recovery(client, volume_name): # NOQA def ha_simple_recovery_test(client, volume_name, size, backing_image=""): # NOQA - volume = create_and_check_volume(client, volume_name, 2, size, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) @@ -246,7 +250,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_AUTO_SALVAGE assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 2, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, backing_image=backing_image) host_id = get_self_host_id() @@ -289,7 +294,8 @@ def 
ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_AUTO_SALVAGE assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 2, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, backing_image=backing_image) volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -337,7 +343,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_DISABLE_REVISION_COUNTER assert setting.value == "true" - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, backing_image=backing_image) host_id = get_self_host_id() @@ -381,7 +388,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == "disable-revision-counter" assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, backing_image=backing_image) host_id = get_self_host_id() @@ -505,7 +513,8 @@ def test_ha_prohibit_deleting_last_replica(client, volume_name): # NOQA FIXME: Move out of test_ha.py """ - volume = create_and_check_volume(client, volume_name, 1) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=1) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) @@ -537,7 +546,9 @@ def test_ha_recovery_with_expansion(client, volume_name, request): # NOQA """ original_size = str(3 * Gi) expand_size = str(4 * Gi) - volume = create_and_check_volume(client, volume_name, 2, original_size) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=original_size) host_id = get_self_host_id() volume.attach(hostId=host_id) @@ -800,92 +811,32 @@ def test_rebuild_replica_and_from_replica_on_the_same_node(client, core_api, vol def test_rebuild_with_restoration(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA """ - [HA] Test if the rebuild is disabled for the restoring volume + [HA] Test if the rebuild is disabled for the restoring volume. + + This is similar to test_single_replica_restore_failure and + test_single_replica_unschedulable_restore_failure. In this version, a + replica is deleted. We expect a new replica to be rebuilt in its place and + the restore to complete. + 1. Setup a random backupstore. - 2. Create a pod with a volume and wait for pod to start. - 3. Write data to the volume and get the md5sum. - 4. Create a backup for the volume. - 5. Restore a volume from the backup. - 6. Delete one replica during the restoration. - 7. Wait for the restoration complete and the volume detached. - 8. Check if the replica is rebuilt for the auto detachment. - 9. Create PV/PVC/Pod for the restored volume and wait for the pod start. - 10. Check if the restored volume is state `Healthy` + 2. Do cleanup for the backupstore. + 3. Create a pod with a volume and wait for pod to start. + 4. Write data to the pod volume and get the md5sum. + 5. Create a backup for the volume. + 6. Restore a volume from the backup. + 7. Wait for the volume restore start. + 8. Delete one replica during the restoration. + 9. Wait for the restoration complete and the volume detached. + 10. Check if the replica is rebuilt. + 11. Create PV/PVC/Pod for the restored volume and wait for the pod start. + 12. Check if the restored volume is state `Healthy` after the attachment. - 11. Check md5sum of the data in the restored volume. - 12. Do cleanup. + 13. 
Check md5sum of the data in the restored volume. + 14. Do cleanup. """ - update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false") - - original_volume_name = volume_name + "-origin" - data_path = "/data/test" - original_pod_name, original_pv_name, original_pvc_name, original_md5sum = \ - prepare_pod_with_data_in_mb( - client, core_api, csi_pv, pvc, pod_make, original_volume_name, - volume_size=str(2*Gi), data_path=data_path, - data_size_in_mb=3*DATA_SIZE_IN_MB_3) - - original_volume = client.by_id_volume(original_volume_name) - snap = create_snapshot(client, original_volume_name) - original_volume.snapshotBackup(name=snap.name) - wait_for_backup_completion(client, - original_volume_name, - snap.name, - retry_count=600) - bv, b = find_backup(client, original_volume_name, snap.name) - - restore_volume_name = volume_name + "-restore" - client.create_volume(name=restore_volume_name, size=str(2 * Gi), - numberOfReplicas=3, fromBackup=b.url) - wait_for_volume_creation(client, restore_volume_name) - - restoring_replica = wait_for_volume_restoration_start( - client, restore_volume_name, b.name) - restore_volume = client.by_id_volume(restore_volume_name) - restore_volume.replicaRemove(name=restoring_replica) - client.list_backupVolume() - - # Wait for the rebuild start - running_replica_count = 0 - for i in range(RETRY_COUNTS): - running_replica_count = 0 - restore_volume = client.by_id_volume(restore_volume_name) - for r in restore_volume.replicas: - if r['running'] and not r['failedAt']: - running_replica_count += 1 - if running_replica_count == 3: - break - time.sleep(RETRY_INTERVAL) - assert running_replica_count == 3 - - wait_for_volume_restoration_completed(client, restore_volume_name) - restore_volume = wait_for_volume_detached(client, restore_volume_name) - assert len(restore_volume.replicas) == 3 - for r in restore_volume.replicas: - assert restoring_replica != r.name - assert r['failedAt'] == "" - - restore_pod_name = restore_volume_name + "-pod" - restore_pv_name = restore_volume_name + "-pv" - restore_pvc_name = restore_volume_name + "-pvc" - restore_pod = pod_make(name=restore_pod_name) - create_pv_for_volume(client, core_api, restore_volume, restore_pv_name) - create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name) - restore_pod['spec']['volumes'] = [create_pvc_spec(restore_pvc_name)] - create_and_wait_pod(core_api, restore_pod) - - restore_volume = client.by_id_volume(restore_volume_name) - assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY - - md5sum = get_pod_data_md5sum(core_api, restore_pod_name, data_path) - assert original_md5sum == md5sum - - # cleanup the backupstore so we don't impact other tests - # since we crashed the replica that initiated the restore - # it's backupstore lock will still be present, so we need to - # wait till the lock is expired, before we can delete the backups - backupstore_wait_for_lock_expiration() - backupstore_cleanup(client) + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, False, False, + REPLICA_FAILURE_MODE_DELETE) def test_rebuild_with_inc_restoration(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA @@ -1504,121 +1455,65 @@ def test_single_replica_restore_failure(set_random_backupstore, client, core_api becoming Degraded, and if the restore volume is still usable after the failure. - Notice that this case is similar to test_rebuild_with_restoration(). - But the way to fail the replica is different. 
- test_rebuild_with_restoration() directly crash the replica process - hence there is no error in the restore status. + This is similar to test_rebuild_with_restoration and + test_single_replica_unschedulable_restore_failure. In this version, a + replica is crashed. We expect the crashed replica to be rebuilt and the + restore to complete. - 1. Enable auto-salvage. - 2. Set the a random backupstore. - 3. Do cleanup for the backupstore. + 1. Setup a random backupstore. + 2. Do cleanup for the backupstore. + 3. Create a pod with a volume and wait for pod to start. + 4. Write data to the pod volume and get the md5sum. + 5. Create a backup for the volume. + 6. Restore a volume from the backup. + 7. Wait for the volume restore start. + 8. Crash one replica during the restoration. + 9. Wait for the restoration complete and the volume detached. + 10. Check if the replica is rebuilt. + 11. Create PV/PVC/Pod for the restored volume and wait for the pod start. + 12. Check if the restored volume is state `Healthy` + after the attachment. + 13. Check md5sum of the data in the restored volume. + 14. Do cleanup. + """ + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, False, False, + REPLICA_FAILURE_MODE_CRASH) + + +def test_single_replica_unschedulable_restore_failure(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA + """ + [HA] Test if the restore can complete if a restoring replica is killed + while it is ongoing and cannot be recovered. + + This is similar to test_rebuild_with_restoration and + test_single_replica_restore_failure. In this version, a replica is crashed + and not allowed to recover. However, we enable + allow-volume-creation-with-degraded-availability, so we expect the restore + to complete anyway. + + 1. Setup a random backupstore. + 2. Do cleanup for the backupstore. + 3. Enable allow-volume-creation-with-degraded-availability (to allow + restoration to complete without all replicas). 4. Create a pod with a volume and wait for pod to start. 5. Write data to the pod volume and get the md5sum. 6. Create a backup for the volume. 7. Restore a volume from the backup. - 8. Wait for the volume restore start by checking if: - 8.1. `volume.restoreStatus` shows the related restore info. - 8.2. `volume.conditions[Restore].status == True && - volume.conditions[Restore].reason == "RestoreInProgress"`. - 8.3. `volume.ready == false`. - 9. Find a way to fail just one replica restore. - e.g. Use iptable to block the restore. - 10. Wait for the restore volume Degraded. - 11. Wait for the volume restore & rebuild complete and check if: - 11.1. `volume.ready == true` - 11.2. `volume.conditions[Restore].status == False && - volume.conditions[Restore].reason == ""`. + 8. Wait for the volume restore start. + 9. Disable replica rebuilding (to ensure the killed replica cannot + recover). + 10. Crash one replica during the restoration. + 11. Wait for the restoration complete and the volume detached. 12. Create PV/PVC/Pod for the restored volume and wait for the pod start. 13. Check if the restored volume is state `Healthy` after the attachment. 14. Check md5sum of the data in the restored volume. 15. Do cleanup. 
""" - auto_salvage_setting = client.by_id_setting(SETTING_AUTO_SALVAGE) - assert auto_salvage_setting.name == SETTING_AUTO_SALVAGE - assert auto_salvage_setting.value == "true" - - update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false") - - backupstore_cleanup(client) - - data_path = "/data/test" - - pod_name, pv_name, pvc_name, md5sum = \ - prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, - pod_make, - volume_name, - data_size_in_mb=DATA_SIZE_IN_MB_2, - data_path=data_path) - - volume = client.by_id_volume(volume_name) - snap = create_snapshot(client, volume_name) - volume.snapshotBackup(name=snap.name) - wait_for_backup_completion(client, volume_name, snap.name) - bv, b = find_backup(client, volume_name, snap.name) - - res_name = "res-" + volume_name - - client.create_volume(name=res_name, fromBackup=b.url) - wait_for_volume_condition_restore(client, res_name, - "status", "True") - wait_for_volume_condition_restore(client, res_name, - "reason", "RestoreInProgress") - - res_volume = client.by_id_volume(res_name) - assert res_volume.ready is False - - res_volume = wait_for_volume_healthy_no_frontend(client, res_name) - - failed_replica = res_volume.replicas[0] - crash_replica_processes(client, core_api, res_name, - replicas=[failed_replica], - wait_to_fail=False) - wait_for_volume_degraded(client, res_name) - - # Wait for the rebuild start - running_replica_count = 0 - for i in range(RETRY_COUNTS): - running_replica_count = 0 - res_volume = client.by_id_volume(res_name) - for r in res_volume.replicas: - if r['running'] and not r['failedAt']: - running_replica_count += 1 - if running_replica_count == 3: - break - time.sleep(RETRY_INTERVAL) - assert running_replica_count == 3 - - wait_for_volume_restoration_completed(client, res_name) - wait_for_volume_condition_restore(client, res_name, - "status", "False") - res_volume = wait_for_volume_detached(client, res_name) - assert res_volume.ready is True - - res_pod_name = res_name + "-pod" - pv_name = res_name + "-pv" - pvc_name = res_name + "-pvc" - - create_pv_for_volume(client, core_api, res_volume, pv_name) - create_pvc_for_volume(client, core_api, res_volume, pvc_name) - - res_pod = pod_make(name=res_pod_name) - res_pod['spec']['volumes'] = [create_pvc_spec(pvc_name)] - create_and_wait_pod(core_api, res_pod) - - res_volume = client.by_id_volume(res_name) - assert res_volume[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY - - res_md5sum = get_pod_data_md5sum(core_api, res_pod_name, data_path) - assert md5sum == res_md5sum - - # cleanup the backupstore so we don't impact other tests - # since we crashed the replica that initiated the restore - # it's backupstore lock will still be present, so we need to - # wait till the lock is expired, before we can delete the backups - backupstore_wait_for_lock_expiration() - backupstore_cleanup(client) - + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, True, True, + REPLICA_FAILURE_MODE_CRASH) def test_dr_volume_with_restore_command_error(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA """ @@ -1995,7 +1890,8 @@ def test_rebuild_after_replica_file_crash(client, volume_name): # NOQA 6. Read the data from the volume and verify the md5sum. 
""" replica_count = 3 - volume = create_and_check_volume(client, volume_name, replica_count) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=replica_count) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -2602,7 +2498,9 @@ def test_replica_failure_during_attaching(settings_reset, client, core_api, volu node = common.wait_for_disk_update(client, node.name, len(update_disks)) volume_name_2 = volume_name + '-2' - volume_2 = create_and_check_volume(client, volume_name_2, 3, str(1 * Gi)) + volume_2 = create_and_check_volume(client, volume_name_2, + num_of_replicas=3, + size=str(1 * Gi)) volume_2.attach(hostId=host_id) volume_2 = wait_for_volume_healthy(client, volume_name_2) write_volume_random_data(volume_2) @@ -2961,7 +2859,8 @@ def test_engine_image_not_fully_deployed_perform_replica_scheduling(client, core node2 = common.wait_for_node_update(client, node2.id, "allowScheduling", False) - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(3 * Gi)) volume1.attach(hostId=node3.id) @@ -3008,10 +2907,12 @@ def test_engine_image_not_fully_deployed_perform_auto_upgrade_engine(client, cor """ prepare_engine_not_fully_deployed_environment(client, core_api) - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(3 * Gi)) - volume2 = create_and_check_volume(client, "vol-2", num_of_replicas=2, + volume2 = create_and_check_volume(client, "vol-2", + num_of_replicas=2, size=str(3 * Gi)) default_img = common.get_default_engine_image(client) @@ -3085,7 +2986,8 @@ def test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume(c prepare_engine_not_fully_deployed_environment(client, core_api) # step 1 - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(1 * Gi)) # node1: tainted node, node2: self host node, node3: the last one @@ -3402,3 +3304,142 @@ def test_recovery_from_im_deletion(client, core_api, volume_name, make_deploymen # Step8 assert test_data == to_be_verified_data + + +@pytest.mark.skip(reason="TODO") # NOQA +def test_retain_potentially_useful_replicas_in_autosalvage_loop(): + """ + Related issue: + https://github.com/longhorn/longhorn/issues/7425 + + Related manual test steps: + https://github.com/longhorn/longhorn-manager/pull/2432#issuecomment-1894675916 + + Steps: + 1. Create a volume with numberOfReplicas=2 and staleReplicaTimeout=1. + Consider its two replicas ReplicaA and ReplicaB. + 2. Attach the volume to a node. + 3. Write data to the volume. + 4. Exec into the instance-manager for ReplicaB and delete all .img.meta + files. This makes it impossible to restart ReplicaB successfully. + 5. Cordon the node for Replica A. This makes it unavailable for + autosalvage. + 6. Crash the instance-managers for both ReplicaA and ReplicaB. + 7. Wait one minute and fifteen seconds. This is longer than + staleReplicaTimeout. + 8. Confirm the volume is not healthy. + 9. Confirm ReplicaA was not deleted. + 10. Delete ReplicaB. + 11. Wait for the volume to become healthy. + 12. Verify the data. 
+ """ + +def restore_with_replica_failure(client, core_api, volume_name, csi_pv, # NOQA + pvc, pod_make, # NOQA + allow_degraded_availability, + disable_rebuild, replica_failure_mode): + """ + restore_with_replica_failure is reusable by a number of similar tests. + In general, it attempts a volume restore, kills one of the restoring + replicas, and verifies the restore can still complete. The manner in which + a replica is killed and the settings enabled at the time vary with the + parameters. + """ + + backupstore_cleanup(client) + + update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, + str(allow_degraded_availability).lower()) + + data_path = "/data/test" + _, _, _, md5sum = \ + prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, + pod_make, + volume_name, + volume_size=str(2 * Gi), + data_size_in_mb=DATA_SIZE_IN_MB_4, + data_path=data_path) + + volume = client.by_id_volume(volume_name) + snap = create_snapshot(client, volume_name) + volume.snapshotBackup(name=snap.name) + wait_for_backup_completion(client, volume_name, snap.name, retry_count=600) + _, b = find_backup(client, volume_name, snap.name) + + restore_volume_name = volume_name + "-restore" + client.create_volume(name=restore_volume_name, size=str(2 * Gi), + fromBackup=b.url) + + _ = wait_for_volume_restoration_start(client, restore_volume_name, b.name) + restore_volume = client.by_id_volume(restore_volume_name) + failed_replica = restore_volume.replicas[0] + + if disable_rebuild: + common.update_setting( + client, + common.SETTING_CONCURRENT_REPLICA_REBUILD_PER_NODE_LIMIT, "0") + + if replica_failure_mode == REPLICA_FAILURE_MODE_CRASH: + crash_replica_processes(client, core_api, restore_volume_name, + replicas=[failed_replica], + wait_to_fail=False) + if replica_failure_mode == REPLICA_FAILURE_MODE_DELETE: + restore_volume.replicaRemove(name=failed_replica.name) + + if not disable_rebuild: + # If disable_rebuild then we expect the volume to quickly finish + # restoration and detach. We MIGHT be able to catch it degraded before, + # but trying can lead to flakes. Check degraded at the end of test, + # since no rebuilds are allowed. 
+ wait_for_volume_degraded(client, restore_volume_name) + running_replica_count = 0 + for i in range(RETRY_COUNTS): + running_replica_count = 0 + for r in restore_volume.replicas: + if r['running'] and not r['failedAt']: + running_replica_count += 1 + if running_replica_count == 3: + break + time.sleep(RETRY_INTERVAL) + assert running_replica_count == 3 + + wait_for_volume_restoration_completed(client, restore_volume_name) + wait_for_volume_condition_restore(client, restore_volume_name, + "status", "False") + restore_volume = wait_for_volume_detached(client, restore_volume_name) + assert restore_volume.ready + + if disable_rebuild and replica_failure_mode == REPLICA_FAILURE_MODE_DELETE: + assert len(restore_volume.replicas) == 3 + for r in restore_volume.replicas: + assert r['failedAt'] == "" + assert failed_replica.name != r.name + + restore_pod_name = restore_volume_name + "-pod" + restore_pv_name = restore_volume_name + "-pv" + restore_pvc_name = restore_volume_name + "-pvc" + create_pv_for_volume(client, core_api, restore_volume, restore_pv_name) + create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name) + + restore_pod = pod_make(name=restore_pod_name) + restore_pod['spec']['volumes'] = [create_pvc_spec(restore_pvc_name)] + create_and_wait_pod(core_api, restore_pod) + + restore_volume = client.by_id_volume(restore_volume_name) + if disable_rebuild: + # Restoration should be complete, but without one replica. + assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == \ + VOLUME_ROBUSTNESS_DEGRADED + else: + assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == \ + VOLUME_ROBUSTNESS_HEALTHY + + restore_md5sum = get_pod_data_md5sum(core_api, restore_pod_name, data_path) + assert restore_md5sum == md5sum + + # cleanup the backupstore so we don't impact other tests + # since we crashed the replica that initiated the restore + # it's backupstore lock will still be present, so we need to + # wait till the lock is expired, before we can delete the backups + backupstore_wait_for_lock_expiration() + backupstore_cleanup(client) diff --git a/manager/integration/tests/test_infra.py b/manager/integration/tests/test_infra.py index 87cf0f5d1f..6842db3090 100644 --- a/manager/integration/tests/test_infra.py +++ b/manager/integration/tests/test_infra.py @@ -141,7 +141,6 @@ def is_infra_k3s(): @pytest.fixture def reset_cluster_ready_status(request): yield - node_worker_label = 'node-role.kubernetes.io/worker' node_controlplane_label = 'node-role.kubernetes.io/control-plane' node_ip_annotation = "flannel.alpha.coreos.com/public-ip" @@ -149,27 +148,16 @@ def reset_cluster_ready_status(request): longhorn_api_client = get_longhorn_api_client() cloudprovider = detect_cloudprovider() - k3s = is_infra_k3s() - print('==> test completed! 
reset cluster ready status ...') for node_item in k8s_api_client.list_node().items: - if k3s is True: - if node_controlplane_label not in node_item.metadata.labels: - node_name = node_item.metadata.name - node_ip = node_item.metadata.annotations[node_ip_annotation] - node = cloudprovider.instance_id_by_ip(node_ip) - else: - continue - + if node_controlplane_label not in node_item.metadata.labels: + node_name = node_item.metadata.name + node_ip = node_item.metadata.annotations[node_ip_annotation] + node = cloudprovider.instance_id_by_ip(node_ip) else: - if node_worker_label in node_item.metadata.labels and \ - node_item.metadata.labels[node_worker_label] == 'true': - node_name = node_item.metadata.name - node = cloudprovider.instance_id(node_name) - else: - continue + continue if is_node_ready_k8s(node_name, k8s_api_client) is False: @@ -199,7 +187,6 @@ def test_offline_node(reset_cluster_ready_status): 1. Bring down one of the nodes in Kuberntes cluster (avoid current node) 2. Make sure the Longhorn node state become `down` """ - node_worker_label = 'node-role.kubernetes.io/worker' pod_lable_selector = "longhorn-test=test-job" node_controlplane_label = 'node-role.kubernetes.io/control-plane' node_ip_annotation = "flannel.alpha.coreos.com/public-ip" @@ -214,27 +201,15 @@ def test_offline_node(reset_cluster_ready_status): if pod.metadata.name == "longhorn-test": longhorn_test_node_name = pod.spec.node_name - k3s = is_infra_k3s() - for node_item in k8s_api_client.list_node().items: - if k3s is True: - if node_controlplane_label not in node_item.metadata.labels: - node_name = node_item.metadata.name - node_ip = node_item.metadata.annotations[node_ip_annotation] - if node_name == longhorn_test_node_name: - continue - else: - node = cloudprovider.instance_id_by_ip(node_ip) - break - else: - if node_worker_label in node_item.metadata.labels and \ - node_item.metadata.labels[node_worker_label] == 'true': - node_name = node_item.metadata.name - if node_name == longhorn_test_node_name: - continue - else: - node = cloudprovider.instance_id(node_name) - break + if node_controlplane_label not in node_item.metadata.labels: + node_name = node_item.metadata.name + node_ip = node_item.metadata.annotations[node_ip_annotation] + if node_name == longhorn_test_node_name: + continue + else: + node = cloudprovider.instance_id_by_ip(node_ip) + break print(f'==> stop node: {node_name}') diff --git a/manager/integration/tests/test_metric.py b/manager/integration/tests/test_metric.py new file mode 100644 index 0000000000..3210cf1f00 --- /dev/null +++ b/manager/integration/tests/test_metric.py @@ -0,0 +1,523 @@ +import pytest +import requests +import time + +from collections import defaultdict +from prometheus_client.parser import text_string_to_metric_families + +from common import client, core_api, volume_name # NOQA + +from common import delete_replica_processes +from common import create_pv_for_volume +from common import create_pvc_for_volume +from common import create_snapshot +from common import create_and_check_volume +from common import generate_random_data +from common import get_self_host_id +from common import wait_for_volume_degraded +from common import wait_for_volume_detached +from common import wait_for_volume_detached_unknown +from common import wait_for_volume_expansion +from common import wait_for_volume_faulted +from common import wait_for_volume_healthy +from common import write_volume_data +from common import write_volume_random_data +from common import set_node_scheduling +from common import 
set_node_cordon +from common import Mi +from common import LONGHORN_NAMESPACE +from common import RETRY_COUNTS +from common import RETRY_INTERVAL +from common import DEFAULT_DISK_PATH + +# The dictionaries use float type of value because the value obtained from +# prometheus_client is in float type. +# https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994 +longhorn_volume_state = { + "creating": 1.0, + "attached": 2.0, + "detached": 3.0, + "attaching": 4.0, + "detaching": 5.0, + "deleting": 6.0, + } + +longhorn_volume_robustness = { + "unknown": 0.0, + "healthy": 1.0, + "degraded": 2.0, + "faulted": 3.0, +} + + +def get_metrics(core_api, metric_node_id): # NOQA + pods = core_api.list_namespaced_pod(namespace=LONGHORN_NAMESPACE, + label_selector="app=longhorn-manager") + for pod in pods.items: + if pod.spec.node_name == metric_node_id: + manager_ip = pod.status.pod_ip + break + + metrics = requests.get("http://{}:9500/metrics".format(manager_ip)).content + string_data = metrics.decode('utf-8') + result = text_string_to_metric_families(string_data) + return result + + +def find_metric(metric_data, metric_name): + return find_metrics(metric_data, metric_name)[0] + + +def find_metrics(metric_data, metric_name): + metrics = [] + + # Find the metric with the given name in the provided metric data + for family in metric_data: + for sample in family.samples: + if sample.name == metric_name: + metrics.append(sample) + + return metrics + + +def check_metric_with_condition(core_api, metric_name, metric_labels, expected_value=None, metric_node_id=get_self_host_id()): # NOQA) + """ + Some metric have multiple conditions, for exameple metric + longhorn_node_status have condition + - allowScheduling + - mountpropagation + - ready + - schedulable + metric longhorn_disk_status have conditions + - ready + - schedulable + Use this function to get specific condition of a mertic + """ + metric_data = get_metrics(core_api, metric_node_id) + + found_metric = next( + (sample for family in metric_data for sample in family.samples + if sample.name == metric_name and + sample.labels.get("condition") == metric_labels.get("condition")), + None + ) + + assert found_metric is not None + + examine_metric_value(found_metric, metric_labels, expected_value) + + +def check_metric(core_api, metric_name, metric_labels, expected_value=None, metric_node_id=get_self_host_id()): # NOQA + metric_data = get_metrics(core_api, metric_node_id) + + found_metric = None + for family in metric_data: + found_metric = next((sample for sample in family.samples if sample.name == metric_name), None) # NOQA + if found_metric: + break + + assert found_metric is not None + + examine_metric_value(found_metric, metric_labels, expected_value) + + +def examine_metric_value(found_metric, metric_labels, expected_value=None): + for key, value in metric_labels.items(): + assert found_metric.labels[key] == value + + assert isinstance(found_metric.value, float) + + if expected_value is not None: + assert found_metric.value == expected_value + else: + assert found_metric.value >= 0.0 + + +def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None): # NOQA + # Initialize total_metrics to store the sum of the metric values. + total_metrics = {"labels": defaultdict(None), "value": 0.0} + + # Initialize the total_metric_values to store the sum of the + # metric label values. + total_metric_values = total_metrics["labels"] + + # Find the metric based on the given labels. 
+ def filter_metric_by_labels(metrics, labels): + for metric in metrics: + is_matched = True + for key, value in labels.items(): + if type(value) in (float, int): + continue + + if metric.labels[key] != value: + is_matched = False + break + + if is_matched: + return metric + + raise AssertionError("Cannot find the metric matching the labels") + + for node in client.list_node(): + metric_data = get_metrics(core_api, node.name) + + metrics = find_metrics(metric_data, metric_name) + if len(metrics) == 0: + continue + + filtered_metric = filter_metric_by_labels(metrics, expected_labels) + + assert isinstance(filtered_metric.value, float) + + for key, value in expected_labels.items(): + value_type = type(value) + + if key not in total_metric_values: + total_metric_values[key] = value_type( + filtered_metric.labels[key] + ) + # Accumulate the metric label values. + elif isinstance(value, (float, int)): + total_metric_values[key] += value_type( + filtered_metric.labels[key] + ) + + # Accumulate the metric values. + total_metrics["value"] += filtered_metric.value + + for key, value in expected_labels.items(): + assert total_metric_values[key] == value + + if expected_value is not None: + assert total_metrics["value"] == expected_value + else: + assert total_metrics["value"] >= 0.0 + + +def wait_for_metric_count_all_nodes(client, core_api, metric_name, metric_labels, expected_count): # NOQA + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + try: + check_metric_count_all_nodes(client, core_api, metric_name, + metric_labels, expected_count) + return + except AssertionError: + continue + + check_metric_count_all_nodes(client, core_api, metric_name, + metric_labels, expected_count) + + +def check_metric_count_all_nodes(client, core_api, metric_name, metric_labels, expected_count): # NOQA + # Find the metrics based on the given labels. + def filter_metrics_by_labels(metrics, labels): + filtered_metrics = [] + for metric in metrics: + is_matched = True + for key, value in labels.items(): + if type(value) in (float, int): + continue + + if metric.labels[key] != value: + is_matched = False + break + + if is_matched: + filtered_metrics.append(metric) + + print(filtered_metrics) + return filtered_metrics + + filtered_metrics = [] + for node in client.list_node(): + metric_data = get_metrics(core_api, node.name) + + metrics = find_metrics(metric_data, metric_name) + if len(metrics) == 0: + continue + + filtered_metrics.extend( + filter_metrics_by_labels(metrics, metric_labels) + ) + + assert len(filtered_metrics) == expected_count + + + +@pytest.mark.parametrize("pvc_namespace", [LONGHORN_NAMESPACE, "default"]) # NOQA +def test_volume_metrics(client, core_api, volume_name, pvc_namespace): # NOQA + """ + https://longhorn.io/docs/master/monitoring/metrics/#volume + + The goal of this test case is to verify that the accuracy + of volume metrics by sending HTTP requests to + http://{longhorn-manager IP}:9500/metrics and use + prometheus_client to validate the return value. 
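For reference, a minimal standalone sketch of the scrape-and-parse pattern that get_metrics and the check_metric* helpers above are built on; the manager pod IP is illustrative, since the real tests discover it from the longhorn-manager pod on the target node:

```python
import requests
from prometheus_client.parser import text_string_to_metric_families

# Scrape the longhorn-manager metrics endpoint (port 9500) and print one
# sample per volume for a single metric family.
raw = requests.get("http://10.42.0.5:9500/metrics").content.decode("utf-8")
for family in text_string_to_metric_families(raw):
    for sample in family.samples:
        if sample.name == "longhorn_volume_actual_size_bytes":
            print(sample.labels["volume"], sample.value)
```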
+ """ + lht_hostId = get_self_host_id() + pv_name = volume_name + "-pv" + pvc_name = volume_name + "-pvc" + volume_size = str(500 * Mi) + volume = create_and_check_volume(client, + volume_name, + num_of_replicas=3, + size=volume_size) + + volume = client.by_id_volume(volume_name) + create_pv_for_volume(client, core_api, volume, pv_name) + create_pvc_for_volume(client, core_api, volume, pvc_name, pvc_namespace) + + volume = client.by_id_volume(volume_name) + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_healthy(client, volume_name) + write_volume_random_data(volume) + volume = client.by_id_volume(volume_name) + actual_size = float(volume.controllers[0].actualSize) + capacity_size = float(volume.size) + + metric_labels = { + "node": lht_hostId, + "pvc": pvc_name, + "volume": volume_name, + "pvc_namespace": pvc_namespace + } + + # check volume metric basic + check_metric(core_api, "longhorn_volume_actual_size_bytes", + metric_labels, actual_size) + check_metric(core_api, "longhorn_volume_capacity_bytes", + metric_labels, capacity_size) + check_metric(core_api, "longhorn_volume_read_throughput", + metric_labels) + check_metric(core_api, "longhorn_volume_write_throughput", + metric_labels) + check_metric(core_api, "longhorn_volume_read_iops", + metric_labels) + check_metric(core_api, "longhorn_volume_write_iops", + metric_labels) + check_metric(core_api, "longhorn_volume_read_latency", + metric_labels) + check_metric(core_api, "longhorn_volume_write_latency", + metric_labels) + + # verify longhorn_volume_robustness when volume is healthy, + # degraded, faulted or unknown + volume.detach() + volume = wait_for_volume_detached_unknown(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["unknown"]) + + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_healthy(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["healthy"]) + + volume.updateReplicaCount(replicaCount=4) + volume = wait_for_volume_degraded(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["degraded"]) + + volume.updateReplicaCount(replicaCount=3) + volume = wait_for_volume_healthy(client, volume_name) + delete_replica_processes(client, core_api, volume_name) + volume = wait_for_volume_faulted(client, volume_name) + + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["faulted"]) + + # verify longhorn_volume_state when volume is attached or detached + volume = wait_for_volume_healthy(client, volume_name) + check_metric(core_api, "longhorn_volume_state", + metric_labels, longhorn_volume_state["attached"]) + + volume.detach() + volume = wait_for_volume_detached(client, volume_name) + check_metric(core_api, "longhorn_volume_state", + metric_labels, longhorn_volume_state["detached"]) + + +def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_name): # NOQA + """ + Scenario: test metric longhorn_snapshot_actual_size_bytes + + Issue: https://github.com/longhorn/longhorn/issues/5869 + + Given a volume + + When 1 snapshot is created by user + And 1 snapshot is created by system + Then has a metric longhorn_snapshot_actual_size_bytes value equals to the + size of the user created snapshot, + and volume label is the volume name + and user_created label is true + And has a metric longhorn_snapshot_actual_size_bytes value equals to the + size of the system 
created snapshot, + and volume label is the volume name + and user_created label is false + + When 3 snapshot is created by user + Then has 4 metrics longhorn_snapshot_actual_size_bytes with + volume label is the volume name + and user_created label is true + And has 1 metrics longhorn_snapshot_actual_size_bytes with + volume label is the volume name + and user_created label is false + """ + self_hostId = get_self_host_id() + + # create a volume and attach it to a node. + volume_size = 50 * Mi + client.create_volume(name=volume_name, + numberOfReplicas=1, + size=str(volume_size)) + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=self_hostId) + volume = wait_for_volume_healthy(client, volume_name) + + # create the user snapshot. + data_size = 10 * Mi + user_snapshot_data_0 = {'pos': 0, + 'len': data_size, + 'content': generate_random_data(data_size)} + write_volume_data(volume, user_snapshot_data_0) + + create_snapshot(client, volume_name) + + # create the system snapshot by expanding the volume. + system_snapshot_data_0 = {'pos': 0, + 'len': data_size, + 'content': generate_random_data(data_size)} + write_volume_data(volume, system_snapshot_data_0) + + volume_size_expanded_0 = str(volume_size * 2) + volume.expand(size=volume_size_expanded_0) + wait_for_volume_expansion(client, volume_name) + volume = client.by_id_volume(volume_name) + assert volume.size == volume_size_expanded_0 + + # get the snapshot sizes. + user_snapshot_size = 0 + system_snapshot_size = 0 + snapshots = volume.snapshotList() + for snapshot in snapshots: + if snapshot.name == "volume-head": + continue + + if snapshot.usercreated: + user_snapshot_size = int(snapshot.size) + else: + system_snapshot_size = int(snapshot.size) + assert user_snapshot_size > 0 + assert system_snapshot_size > 0 + + # assert the metric values for the user snapshot. + user_snapshot_metric_labels = { + "volume": volume_name, + "user_created": "true", + } + check_metric_sum_on_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + user_snapshot_metric_labels, + user_snapshot_size) + + # assert the metric values for the system snapshot. + system_snapshot_metric_labels = { + "volume": volume_name, + "user_created": "false", + } + check_metric_sum_on_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + system_snapshot_metric_labels, + system_snapshot_size) + + # create 3 more user snapshots. 
+ create_snapshot(client, volume_name) + create_snapshot(client, volume_name) + create_snapshot(client, volume_name) + + wait_for_metric_count_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + user_snapshot_metric_labels, 4) + wait_for_metric_count_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + system_snapshot_metric_labels, 1) + + +def test_node_metrics(client, core_api): # NOQA + lht_hostId = get_self_host_id() + node = client.by_id_node(lht_hostId) + disks = node.disks + for _, disk in iter(disks.items()): + if disk.path == DEFAULT_DISK_PATH: + default_disk = disk + break + assert default_disk is not None + + metric_labels = {} + check_metric(core_api, "longhorn_node_count_total", + metric_labels, expected_value=3.0) + + metric_labels = { + "node": lht_hostId, + } + check_metric(core_api, "longhorn_node_cpu_capacity_millicpu", + metric_labels) + check_metric(core_api, "longhorn_node_cpu_usage_millicpu", + metric_labels) + check_metric(core_api, "longhorn_node_memory_capacity_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_memory_usage_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_storage_capacity_bytes", + metric_labels, default_disk.storageMaximum) + check_metric(core_api, "longhorn_node_storage_usage_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_storage_reservation_bytes", + metric_labels, default_disk.storageReserved) + + # check longhorn_node_status by 4 different conditions + metric_labels = { + "condition": "mountpropagation", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "ready", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "allowScheduling", + "condition_reason": "", + "node": lht_hostId, + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + node = client.by_id_node(lht_hostId) + set_node_scheduling(client, node, allowScheduling=False, retry=True) + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 0.0) + + metric_labels = { + "condition": "schedulable", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "schedulable", + "condition_reason": "KubernetesNodeCordoned", + "node": lht_hostId + } + set_node_cordon(core_api, lht_hostId, True) + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 0.0) diff --git a/manager/integration/tests/test_migration.py b/manager/integration/tests/test_migration.py index d8dced2a0e..cea8ab15e9 100644 --- a/manager/integration/tests/test_migration.py +++ b/manager/integration/tests/test_migration.py @@ -404,10 +404,9 @@ def test_migration_with_restore_volume(core_api, # NOQA """ # Step 1 lht_host_id = get_self_host_id() - volume = create_and_check_volume(client, - volume_name, - REPLICA_COUNT, - SIZE) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=REPLICA_COUNT, + size=SIZE) attachment_id = common.generate_attachment_ticket_id() volume.attach(attachmentID=attachment_id, hostId=lht_host_id) diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 2e6af2282d..1e4ad5dd32 100644 --- a/manager/integration/tests/test_node.py 
+++ b/manager/integration/tests/test_node.py @@ -19,14 +19,14 @@ SETTING_DEFAULT_DATA_PATH, \ SETTING_CREATE_DEFAULT_DISK_LABELED_NODES, \ DEFAULT_STORAGE_OVER_PROVISIONING_PERCENTAGE, \ - SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE + SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE, \ + SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED from common import get_volume_endpoint from common import get_update_disks from common import wait_for_disk_status, wait_for_disk_update, \ wait_for_disk_conditions, wait_for_node_tag_update, \ cleanup_node_disks, wait_for_disk_storage_available, \ wait_for_disk_uuid, wait_for_node_schedulable_condition -from common import exec_nsenter from common import SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY from common import volume_name # NOQA @@ -1425,14 +1425,17 @@ def test_replica_datapath_cleanup(client): # NOQA # data path should exist now for data_path in data_paths: - assert exec_nsenter("ls {}".format(data_path)) + assert os.listdir(data_path) cleanup_volume_by_name(client, vol_name) # data path should be gone due to the cleanup of replica for data_path in data_paths: - with pytest.raises(subprocess.CalledProcessError): - exec_nsenter("ls {}".format(data_path)) + try: + os.listdir(data_path) + raise AssertionError(f"data path {data_path} should be gone") + except FileNotFoundError: + pass node = client.by_id_node(lht_hostId) disks = node.disks @@ -2626,6 +2629,14 @@ def test_disk_eviction_with_node_level_soft_anti_affinity_disabled(client, # NOQ replica_path = test_disk_path + '/replicas' assert os.path.isdir(replica_path) + # Since https://github.com/longhorn/longhorn-manager/pull/2138, the node + # controller is responsible for triggering replica eviction. If the timing + # of the node controller and node monitor are off, the node controller + # may take extra time to do so. Wait for evidence eviction is in progress + # before proceeding. + wait_for_volume_replica_count(client, volume.name, + volume.numberOfReplicas + 1) + for i in range(common.RETRY_COMMAND_COUNT): if len(os.listdir(replica_path)) > 0: break @@ -2668,3 +2679,146 @@ def finalizer(): common.cleanup_all_volumes(client) request.addfinalizer(finalizer) + +@pytest.mark.skip(reason="TODO") # NOQA +def test_drain_with_block_for_eviction_success(): + """ + Test drain completes after evicting replica with node-drain-policy + block-for-eviction + + 1. Set `node-drain-policy` to `block-for-eviction`. + 2. Create a volume. + 3. Ensure (through soft anti-affinity, low replica count, and/or enough + disks) that an evicted replica of the volume can be scheduled elsewhere. + 4. Write data to the volume. + 5. Drain a node one of the volume's replicas is scheduled to. + 6. While the drain is ongoing: + - Verify that the volume never becomes degraded. + - Verify that `node.status.autoEvicting == true`. + - Optionally verify that `replica.spec.evictionRequested == true`. + 7. Verify the drain completes. + 8. Uncordon the node. + 9. Verify the replica on the drained node has moved to a different one. + 10. Verify that `node.status.autoEvicting == false`. + 11. Verify that `replica.spec.evictionRequested == false`. + 12. Verify the volume's data. + """ + +@pytest.mark.skip(reason="TODO") # NOQA +def test_drain_with_block_for_eviction_if_contains_last_replica_success(): + """ + Test drain completes after evicting replicas with node-drain-policy + block-for-eviction-if-contains-last-replica + + 1. Set `node-drain-policy` to + `block-for-eviction-if-contains-last-replica`. + 2. 
Create one volume with a single replica and another volume with three
+       replicas.
+    3. Ensure (through soft anti-affinity, low replica count, and/or enough
+       disks) that evicted replicas of both volumes can be scheduled elsewhere.
+    4. Write data to the volumes.
+    5. Drain a node both volumes have a replica scheduled to.
+    6. While the drain is ongoing:
+       - Verify that the volume with one replica never becomes degraded.
+       - Verify that the volume with three replicas becomes degraded.
+       - Verify that `node.status.autoEvicting == true`.
+       - Optionally verify that `replica.spec.evictionRequested == true` on the
+         replica for the volume that only has one.
+       - Optionally verify that `replica.spec.evictionRequested == false` on
+         the replica for the volume that has three.
+    7. Verify the drain completes.
+    8. Uncordon the node.
+    9. Verify the replica for the volume with one replica has moved to a
+       different node.
+    10. Verify the replica for the volume with three replicas has not moved.
+    11. Verify that `node.status.autoEvicting == false`.
+    12. Verify that `replica.spec.evictionRequested == false` on all replicas.
+    13. Verify the data in both volumes.
+    """
+
+@pytest.mark.skip(reason="TODO")  # NOQA
+def test_drain_with_block_for_eviction_failure():
+    """
+    Test drain never completes with node-drain-policy block-for-eviction
+
+    1. Set `node-drain-policy` to `block-for-eviction`.
+    2. Create a volume.
+    3. Ensure (through soft anti-affinity, high replica count, and/or not
+       enough disks) that an evicted replica of the volume cannot be scheduled
+       elsewhere.
+    4. Write data to the volume.
+    5. Drain a node one of the volume's replicas is scheduled to.
+    6. While the drain is ongoing:
+       - Verify that `node.status.autoEvicting == true`.
+       - Verify that `replica.spec.evictionRequested == true`.
+    7. Verify the drain never completes.
+    """
+
+@pytest.mark.node  # NOQA
+def test_auto_detach_volume_when_node_is_cordoned(client, core_api, volume_name):  # NOQA
+    """
+    Test auto detach volume when node is cordoned
+
+    1. Set `detach-manually-attached-volumes-when-cordoned` to `false`.
+    2. Create a volume and attach it to the node through the API (manually).
+    3. Cordon the node.
+    4. Set `detach-manually-attached-volumes-when-cordoned` to `true`.
+    5. The volume will be detached automatically.
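Step 3 above relies on the suite's set_node_cordon helper. At the Kubernetes API level, cordoning is just a patch of spec.unschedulable; a minimal sketch of that call (the helper name here is illustrative):

```python
def cordon_node(core_api, node_name, cordon=True):
    # Cordon (or uncordon) a node by toggling spec.unschedulable via the
    # Kubernetes CoreV1 API, which is what "kubectl cordon" does.
    core_api.patch_node(node_name, {"spec": {"unschedulable": cordon}})
```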
+ """ + + # Set `Detach Manually Attached Volumes When Cordoned` to false + detach_manually_attached_volumes_when_cordoned = \ + client.by_id_setting( + SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED) + client.update(detach_manually_attached_volumes_when_cordoned, + value="false") + + # Create a volume + volume = client.create_volume(name=volume_name, + size=SIZE, + numberOfReplicas=3) + volume = common.wait_for_volume_detached(client, + volume_name) + assert volume.restoreRequired is False + + # Attach to the node + host_id = get_self_host_id() + volume.attach(hostId=host_id) + volume = common.wait_for_volume_healthy(client, volume_name) + assert volume.restoreRequired is False + + # Cordon the node + set_node_cordon(core_api, host_id, True) + + # Volume is still attached for a while + time.sleep(NODE_UPDATE_WAIT_INTERVAL) + volume = common.wait_for_volume_healthy(client, volume_name) + assert volume.restoreRequired is False + + # Set `Detach Manually Attached Volumes When Cordoned` to true + client.update(detach_manually_attached_volumes_when_cordoned, value="true") + + # Volume should be detached + volume = common.wait_for_volume_detached(client, volume_name) + assert volume.restoreRequired is False + + # Delete the Volume + client.delete(volume) + common.wait_for_volume_delete(client, volume_name) + + volumes = client.list_volume().data + assert len(volumes) == 0 + +@pytest.mark.skip(reason="TODO") # NOQA +def test_do_not_react_to_brief_kubelet_restart(): + """ + Test the node controller ignores Ready == False due to KubeletNotReady for + ten seconds before reacting. + + Repeat the following five times: + 1. Verify status.conditions[type == Ready] == True for the Longhorn node we + are running on. + 2. Kill the kubelet process (e.g. `pkill kubelet`). + 3. Verify status.conditions[type == Ready] != False for the Longhorn node + we are running on at any point for at least ten seconds. 
+ """ diff --git a/manager/integration/tests/test_orphan.py b/manager/integration/tests/test_orphan.py index 9ee0715704..9a951b990e 100644 --- a/manager/integration/tests/test_orphan.py +++ b/manager/integration/tests/test_orphan.py @@ -3,13 +3,13 @@ import time import random import string +import shutil from common import core_api, client # NOQA from common import Gi, SIZE from common import volume_name # NOQA from common import SETTING_ORPHAN_AUTO_DELETION from common import RETRY_COUNTS, RETRY_INTERVAL_LONG -from common import exec_nsenter from common import get_self_host_id from common import get_update_disks, wait_for_disk_update, cleanup_node_disks from common import create_and_check_volume, wait_for_volume_healthy @@ -18,6 +18,7 @@ from common import wait_for_node_update from common import wait_for_disk_status from common import update_node_disks +from common import exec_local def generate_random_id(num_bytes): @@ -60,7 +61,9 @@ def create_volume_with_replica_on_host(client, volume_name): # NOQA nodes = client.list_node() - volume = create_and_check_volume(client, volume_name, len(nodes), SIZE) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=len(nodes), + size=SIZE) volume.attach(hostId=lht_hostId, disableFrontend=False) wait_for_volume_healthy(client, volume_name) @@ -78,7 +81,7 @@ def create_orphaned_directories_on_host(volume, disk_paths, num_orphans): # NOQ replica_dir_name = volume.name + "-" + generate_random_id(8) path = os.path.join(disk_path, "replicas", replica_dir_name) paths.append(path) - exec_nsenter("cp -a {} {}".format(replica.dataPath, path)) + exec_local("cp -a {} {}".format(replica.dataPath, path)) return paths @@ -120,18 +123,16 @@ def wait_for_orphan_count(client, number, retry_counts=120): # NOQA def wait_for_file_count(path, number, retry_counts=120): for _ in range(retry_counts): - count = exec_nsenter("ls {} | wc -l".format(path)) - if int(count) == number: + if len(os.listdir(path)) == number: break time.sleep(RETRY_INTERVAL_LONG) - count = exec_nsenter("ls {} | wc -l".format(path)) - return int(count) + return len(os.listdir(path)) def delete_orphaned_directory_on_host(directories): # NOQA for path in directories: - exec_nsenter("rm -rf {}".format(path)) + exec_local("rm -rf {}".format(path)) def delete_extra_disks_on_host(client, disk_names): # NOQA @@ -190,26 +191,24 @@ def test_orphaned_dirs_with_wrong_naming_format(client, volume_name, request): # Create invalid orphaned directories. 
# 8-byte random id missing - exec_nsenter("mkdir -p {}".format(os.path.join(replica.diskPath, - "replicas", - volume_name))) + os.makedirs(os.path.join(replica.diskPath, "replicas", volume_name)) + # wrong random id length - exec_nsenter("mkdir -p {}".format( - os.path.join(replica.diskPath, - "replicas", - volume_name + "-" + generate_random_id(4)))) + os.makedirs(os.path.join(replica.diskPath, "replicas", + volume_name + "-" + generate_random_id(4))) + # volume.meta missing - path = os.path.join(replica.diskPath, - "replicas", + path = os.path.join(replica.diskPath, "replicas", volume_name + "-" + generate_random_id(8)) - exec_nsenter("cp -a {} {}; rm -f {}".format( - replica.dataPath, path, os.path.join(path, "volume.meta"))) + shutil.copytree(replica.dataPath, path) + os.remove(os.path.join(path, "volume.meta")) + # corrupted volume.meta - path = os.path.join(replica.diskPath, - "replicas", + path = os.path.join(replica.diskPath, "replicas", volume_name + "-" + generate_random_id(8)) - exec_nsenter("cp -a {} {}; echo xxx > {}".format( - replica.dataPath, path, os.path.join(path, "volume.meta"))) + shutil.copytree(replica.dataPath, path) + with open(os.path.join(path, "volume.meta"), 'w') as file: + file.write("xxx") # Step 5 cleanup_volume_by_name(client, volume_name) @@ -535,7 +534,7 @@ def test_orphaned_dirs_in_duplicated_disks(client, volume_name, request): # NOQ disks = node.disks disk_path = os.path.join(disk_paths[0], disk_names[1]) disk_paths.append(disk_path) - exec_nsenter("mkdir -p {}".format(disk_path)) + os.makedirs(disk_path) disk2 = {"path": disk_path, "allowScheduling": True} update_disk = get_update_disks(disks) diff --git a/manager/integration/tests/test_recurring_job.py b/manager/integration/tests/test_recurring_job.py index aeacf8c705..3e98353adb 100644 --- a/manager/integration/tests/test_recurring_job.py +++ b/manager/integration/tests/test_recurring_job.py @@ -17,6 +17,7 @@ from common import random_labels, volume_name # NOQA from common import storage_class, statefulset, pvc # NOQA from common import make_deployment_with_pvc # NOQA +from common import generate_volume_name from common import get_self_host_id @@ -68,6 +69,8 @@ from common import wait_for_cron_job_create from common import wait_for_cron_job_delete +from common import ACCESS_MODE_RWO +from common import ACCESS_MODE_RWX from common import JOB_LABEL from common import KUBERNETES_STATUS_LABEL from common import LONGHORN_NAMESPACE @@ -1991,9 +1994,9 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien back1 = BACKUP + "1" back2 = BACKUP + "2" group1 = "group01" - volume_name1 = "record-recurring-job" - rvolume_name1 = "restore-record-recurring-job-01" - rvolume_name2 = "restore-record-recurring-job-02" + volume_name1 = "record-recur" + "-" + generate_volume_name() + rvolume_name1 = "restore-01" + "-" + generate_volume_name() + rvolume_name2 = "restore-02" + "-" + generate_volume_name() recurring_jobs = { back1: { @@ -2048,8 +2051,8 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien complete_backup_1_count = 0 restore_snapshot_name = "" - volume = client.by_id_volume(volume_name1) wait_for_backup_completion(client, volume_name1) + volume = client.by_id_volume(volume_name1) for b in volume.backupStatus: if back1+"-" in b.snapshot: complete_backup_1_count += 1 @@ -2081,7 +2084,8 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien @pytest.mark.recurring_job # NOQA -def test_recurring_job_filesystem_trim(client, 
core_api, batch_v1_api, volume_name, csi_pv, pvc, pod_make): # NOQA +@pytest.mark.parametrize("access_mode", [ACCESS_MODE_RWO, ACCESS_MODE_RWX]) # NOQA +def test_recurring_job_filesystem_trim(client, core_api, batch_v1_api, volume_name, csi_pv, pvc, pod_make, access_mode): # NOQA """ Scenario: test recurring job filesystem-trim @@ -2102,7 +2106,8 @@ def test_recurring_job_filesystem_trim(client, core_api, batch_v1_api, volume_na """ pod_name, _, _, _ = \ prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, pod_make, - volume_name, data_size_in_mb=10) + volume_name, data_size_in_mb=10, + access_mode=access_mode) volume = client.by_id_volume(volume_name) diff --git a/manager/integration/tests/test_rwx.py b/manager/integration/tests/test_rwx.py index d01f840449..79ea321117 100644 --- a/manager/integration/tests/test_rwx.py +++ b/manager/integration/tests/test_rwx.py @@ -6,20 +6,24 @@ from common import create_and_wait_pod, read_volume_data from common import get_apps_api_client, wait_statefulset from common import create_and_wait_deployment, delete_and_wait_pod +from common import delete_and_wait_deployment +from common import delete_and_wait_pvc from common import prepare_pod_with_data_in_mb, DATA_SIZE_IN_MB_1 from common import create_snapshot, wait_for_backup_completion from common import find_backup, Gi, volume_name, csi_pv, pod_make # NOQA from common import wait_for_volume_creation, DATA_SIZE_IN_MB_3 from common import create_pv_for_volume, create_pvc_for_volume from common import DEFAULT_STATEFULSET_TIMEOUT, DEFAULT_STATEFULSET_INTERVAL +from common import wait_delete_pod, wait_for_pod_remount from common import get_core_api_client, write_pod_volume_random_data from common import create_pvc_spec, make_deployment_with_pvc # NOQA -from common import wait_for_pod_phase from common import core_api, statefulset, pvc, pod, client # NOQA from common import RETRY_COUNTS, RETRY_INTERVAL from common import EXPANDED_VOLUME_SIZE from common import expand_and_wait_for_pvc, wait_for_volume_expansion from common import wait_deployment_replica_ready, wait_for_volume_healthy +from common import crypto_secret, storage_class # NOQA +from common import create_crypto_secret, create_storage_class from backupstore import set_random_backupstore # NOQA from multiprocessing import Pool @@ -344,9 +348,10 @@ def test_rwx_delete_share_manager_pod(core_api, statefulset): # NOQA 2. Wait for StatefulSet to come up healthy. 3. Write data and compute md5sum. 4. Delete the share manager pod. - 5. Check the data md5sum in statefulSet. - 6. Write more data to it and compute md5sum. - 7. Check the data md5sum in share manager volume. + 5. Wait for a new pod to be created and volume getting attached. + 6. Check the data md5sum in statefulSet. + 7. Write more data to it and compute md5sum. + 8. Check the data md5sum in share manager volume. 
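The filesystem-trim hunk above parametrizes the recurring-job test over volume access modes. A self-contained sketch of the same pytest pattern; the constant values are assumptions, since the real test imports them from common:

```python
import pytest

ACCESS_MODE_RWO = "rwo"  # assumed values; the suite imports these from common
ACCESS_MODE_RWX = "rwx"


@pytest.mark.parametrize("access_mode", [ACCESS_MODE_RWO, ACCESS_MODE_RWX])
def test_trim_runs_per_access_mode(access_mode):
    # pytest generates one test invocation per access mode, so the same
    # trim scenario is exercised for both RWO and RWX volumes.
    assert access_mode in (ACCESS_MODE_RWO, ACCESS_MODE_RWX)
```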
""" statefulset_name = 'statefulset-delete-share-manager-pods-test' @@ -377,8 +382,10 @@ def test_rwx_delete_share_manager_pod(core_api, statefulset): # NOQA delete_and_wait_pod(core_api, share_manager_name, namespace=LONGHORN_NAMESPACE) - wait_for_pod_phase(core_api, share_manager_name, - namespace=LONGHORN_NAMESPACE, pod_phase="Running") + target_pod = core_api.read_namespaced_pod(name=pod_name, + namespace='default') + wait_delete_pod(core_api, target_pod.metadata.uid) + wait_for_pod_remount(core_api, pod_name) test_data_2 = generate_random_data(VOLUME_RWTEST_SIZE) write_pod_volume_data(core_api, pod_name, test_data_2, filename='test2') @@ -518,7 +525,7 @@ def test_restore_rwo_volume_to_rwx(set_random_backupstore, client, core_api, vol @pytest.mark.skip(reason="TODO") -def test_rwx_onine_expansion(): # NOQA +def test_rwx_online_expansion(): # NOQA """ Related issue : https://github.com/longhorn/longhorn/issues/2181 @@ -634,3 +641,110 @@ def test_rwx_offline_expansion(client, core_api, pvc, make_deployment_with_pvc): pod_name, 'default') assert int(data_size_in_pod)/1024/1024 == data_size_in_mb + + +def test_encrypted_rwx_volume(core_api, statefulset, storage_class, crypto_secret, pvc, make_deployment_with_pvc): # NOQA + """ + Test creating encrypted rwx volume and use the secret in + non longhorn-system namespace. + + 1. Create crypto secret in non longhorn-system namespace. + 2. Create a storage class. + 3. Create a deployment with a PVC and the pods should be able to running. + """ + + namespace = 'default' + # Create crypto secret + secret = crypto_secret(namespace) + create_crypto_secret(secret, namespace) + + # Create storage class + storage_class['reclaimPolicy'] = 'Delete' + storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-namespace'] = namespace # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-publish-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-publish-secret-namespace'] = namespace # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-stage-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-stage-secret-namespace'] = namespace # NOQA + create_storage_class(storage_class) + + # Create deployment with PVC + pvc_name = 'pvc-deployment-with-encrypted-rwx-volume' + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['accessModes'] = ['ReadWriteMany'] + + core_api.create_namespaced_persistent_volume_claim( + body=pvc, namespace='default') + + deployment = make_deployment_with_pvc( + 'pvc-deployment-with-encrypted-rwx-volume', pvc_name, replicas=3) + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + # Clean up deployment and volume + delete_and_wait_deployment(apps_api, deployment["metadata"]["name"]) + delete_and_wait_pvc(core_api, pvc_name) + + +def test_rwx_volume_mount_options(core_api, storage_class, pvc, make_deployment_with_pvc): # NOQA + """ + Test creating rwx volume with custom mount options + non longhorn-system namespace. + + 1. Create a storage class with nfsOptions parameter. + 2. Create a deployment with a PVC and the pods should be able to run. + 3. Check the mounts on the deployment pods. 
+ """ + + # Create storage class + storage_class['reclaimPolicy'] = 'Delete' + storage_class['parameters']['nfsOptions'] = 'vers=4.2,soft,noresvport,timeo=600,retrans=4' # NOQA + create_storage_class(storage_class) + + # Create deployment with PVC + pvc_name = 'pvc-deployment-with-custom-mount-options-volume' + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['accessModes'] = ['ReadWriteMany'] + + core_api.create_namespaced_persistent_volume_claim( + body=pvc, namespace='default') + + deployment = make_deployment_with_pvc( + 'deployment-with-custom-mount-options-volume', pvc_name, replicas=2) + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + # Check mount options on deployment pods + deployment_label_selector = "name=" + \ + deployment["metadata"]["labels"]["name"] + + deployment_pod_list = \ + core_api.list_namespaced_pod(namespace="default", + label_selector=deployment_label_selector) + + assert deployment_pod_list.items.__len__() == 2 + + pod_name_1 = deployment_pod_list.items[0].metadata.name + pod_name_2 = deployment_pod_list.items[1].metadata.name + + command = "cat /proc/mounts | grep 'nfs'" + mount_options_1 = exec_command_in_pod(core_api, command, + pod_name_1, + 'default') + mount_options_2 = exec_command_in_pod(core_api, command, + pod_name_2, + 'default') + + # print(f'mount_options_1={mount_options_1}') + # print(f'mount_options_2={mount_options_2}') + + assert "vers=4.2" in mount_options_1 + assert "vers=4.2" in mount_options_2 + + # Clean up deployment and volume + delete_and_wait_deployment(apps_api, deployment["metadata"]["name"]) + delete_and_wait_pvc(core_api, pvc_name) diff --git a/manager/integration/tests/test_scheduling.py b/manager/integration/tests/test_scheduling.py index 5b801b703f..2c164cad38 100644 --- a/manager/integration/tests/test_scheduling.py +++ b/manager/integration/tests/test_scheduling.py @@ -55,6 +55,12 @@ from common import wait_for_replica_running from common import crash_engine_process_with_sigkill +from common import set_node_tags +from common import wait_for_node_tag_update +from common import wait_for_volume_condition_scheduled +from common import cleanup_host_disks +from common import wait_for_volume_delete +from common import wait_for_disk_update from common import Mi, Gi from common import DATA_SIZE_IN_MB_2 @@ -70,6 +76,11 @@ from common import update_setting, delete_replica_on_test_node from common import VOLUME_FRONTEND_BLOCKDEV, SNAPSHOT_DATA_INTEGRITY_IGNORED from common import VOLUME_ROBUSTNESS_DEGRADED, RETRY_COUNTS_SHORT +from common import SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME +from common import SIZE, CONDITION_STATUS_FALSE, CONDITION_STATUS_TRUE +from common import SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY +from common import SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY +from common import SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME from time import sleep @@ -1082,8 +1093,7 @@ def test_data_locality_basic(client, core_api, volume_name, pod, settings_reset) pod1['metadata']['name'] = pod1_name - volume1 = create_and_check_volume(client, - volume1_name, + volume1 = create_and_check_volume(client, volume1_name, num_of_replicas=1, size=volume1_size) @@ -1724,8 +1734,118 @@ def finalizer(): wait_for_statefulset_pods_healthy(statefulset) -@pytest.mark.skip(reason="TODO") -def test_global_disk_soft_anti_affinity(): # NOQA +def test_allow_empty_node_selector_volume_setting(client, volume_name): # NOQA + """ + Test the global setting 
allow-empty-node-selector-volume + + If true, a replica of the volume without node selector + can be scheduled on node with tags. + + If false, a replica of the volume without node selector + can not be scheduled on node with tags. + + Setup + - Prepare 3 nodes + - Add `AVAIL` tag to nodes + - Set allow-empty-node-selector-volume to `false` + + When + - Create a Volume with 3 replicas without tag + + Then + - All replicas can not be scheduled to the nodes + + When + - Remove `AVAIL` tag from one of the node + - Set allow-empty-node-selector-volume to `true` + + Then + - Wait for a while for controller to resync the volume, + all replicas can be scheduled to the nodes + """ + # Setup + node_tag = ["AVAIL"] + for node in client.list_node(): + set_node_tags(client, node, tags=node_tag, retry=True) + wait_for_node_tag_update(client, node.name, node_tag) + + update_setting(client, SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME, "false") + + # Check volume can not be scehduled + client.create_volume(name=volume_name, size=SIZE) + volume = wait_for_volume_detached(client, volume_name) + + volume = client.by_id_volume(volume.name) + volume = wait_for_volume_condition_scheduled(client, volume_name, + "status", + CONDITION_STATUS_FALSE) + + # Rremove tag from 1 node and set setting allow-empty-node-selector-volume + # to true + node = client.by_id_node(get_self_host_id()) + set_node_tags(client, node, tags=[], retry=True) + update_setting(client, SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME, "true") + + # Volume can be schedule + volume = wait_for_volume_condition_scheduled(client, volume_name, "status", + CONDITION_STATUS_TRUE) + assert volume.ready + + # All replicas schedule to nodes + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) + + +def prepare_for_affinity_tests(client, volume_name, request): # NOQA + """ + For 'test_global_disk_soft_anti_affinity' and + 'test_volume_disk_soft_anti_affinity' use, they have identical + the same preparation steps as below: + + Given + - One node has three disks + - The three disks have very different sizes + - Only two disks are available for scheduling + - No other node is available for scheduling + """ + def finalizer(): + volume = client.by_id_volume(volume_name) + volume.detach(hostId=lht_hostId) + wait_for_volume_detached(client, volume_name) + client.delete(volume) + wait_for_volume_delete(client, volume.name) + cleanup_host_disks(client, 'vol-disk-1', 'vol-disk-2') + request.addfinalizer(finalizer) + + # Preparation + lht_hostId = get_self_host_id() + node = client.by_id_node(lht_hostId) + disks = node.disks + disk_path1 = create_host_disk(client, 'vol-disk-1', + str(2 * Gi), lht_hostId) + disk1 = {"path": disk_path1, "allowScheduling": True} + disk_path2 = create_host_disk(client, 'vol-disk-2', + str(4 * Gi), lht_hostId) + disk2 = {"path": disk_path2, "allowScheduling": False} + + update_disk = get_update_disks(disks) + update_disk["disk1"] = disk1 + update_disk["disk2"] = disk2 + + node = update_node_disks(client, node.name, disks=update_disk, retry=True) + node = wait_for_disk_update(client, lht_hostId, len(update_disk)) + assert len(node.disks) == len(update_disk) + + # Make only current node schedulable + nodes = client.list_node() + for node in nodes: + if node.id != lht_hostId: + set_node_scheduling(client, node, allowScheduling=False) + + return disk_path1, disk_path2 + + +def test_global_disk_soft_anti_affinity(client, volume_name, request): # NOQA """ 1. 
When Replica Disk Soft Anti-Affinity is false, it should be impossible to schedule replicas to the same disk. @@ -1771,11 +1891,131 @@ def test_global_disk_soft_anti_affinity(): # NOQA - Verify all three replicas are healthy - Verify all three replicas have a different spec.diskID """ - pass + # Preparation + disk_path1, disk_path2 = prepare_for_affinity_tests(client, + volume_name, + request) + + # Test start + update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY, "false") + + lht_hostId = get_self_host_id() + client.create_volume(name=volume_name, size=str(500*Mi)) + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_degraded(client, volume_name) + + num_running = 0 + for replica in volume.replicas: + if replica.running: + num_running += 1 + else: + assert replica.hostId == "" + + assert num_running == 2 + + # After enable SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY to true, + # replicas can schedule on the same disk, threrefore volume become healthy + update_setting(client, SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY, "true") + + volume = wait_for_volume_healthy(client, volume_name) + + node = client.by_id_node(lht_hostId) + disks = node.disks + for fsid, disk in iter(disks.items()): + if disk.path == disk_path2: + disk.allowScheduling = True + + # Enable disk2 + update_disks = get_update_disks(disks) + update_node_disks(client, node.name, disks=update_disks, retry=True) + + # Delete one of the two replicas with the same diskID + disk_id = [] + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + else: + volume.replicaRemove(name=replica.name) + + volume = wait_for_volume_degraded(client, volume_name) + volume = wait_for_volume_healthy(client, volume_name) + + # Replcas should located on 3 different disks on current node + disk_id.clear() + for replica in volume.replicas: + assert replica.diskID not in disk_id + disk_id.append(replica.diskID) + + +def test_allow_empty_disk_selector_volume_setting(client, volume_name): # NOQA + """ + Test the global setting allow-empty-disk-selector-volume + + If true, a replica of the volume without disk selector + can be scheduled on disk with tags. + + If false, a replica of the volume without disk selector + can not be scheduled on disk with tags. 
+ + Setup + - Prepare 3 nodes each with one disk + - Add `AVAIL` tag to every disk + - Set allow-empty-disk-selector-volume to `false` + + When + - Create a Volume with 3 replicas without tag + + Then + - All replicas can not be scheduled to the disks on the nodes + + When + - Remove `AVAIL` tag from one of the node + - Set allow-empty-disk-selector-volume to `true` + + Then + - Wait for a while for controller to resync the volume, + all replicas can be scheduled to the disks on the nodes + """ + # Preparation + nodes = client.list_node() + for node in nodes: + disks = get_update_disks(node.disks) + disks[list(disks)[0]].tags = ["AVAIL"] + update_node_disks(client, node.name, disks=disks) + + update_setting(client, SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME, "false") + + # Check volume can not be scehduled + client.create_volume(name=volume_name, size=SIZE) + volume = wait_for_volume_detached(client, volume_name) + + volume = client.by_id_volume(volume.name) + volume = wait_for_volume_condition_scheduled(client, volume_name, + "status", + CONDITION_STATUS_FALSE) + + # Remove tag from current node + host_id = get_self_host_id() + node = client.by_id_node(host_id) + disks = get_update_disks(node.disks) + disks[list(disks)[0]].tags = [] + update_node_disks(client, node.name, disks=disks) + update_setting(client, SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME, "true") + + # Volume can be schedule + volume = wait_for_volume_condition_scheduled(client, volume_name, "status", + CONDITION_STATUS_TRUE) + assert volume.ready + # All replicas schedule to disks on nodes + volume.attach(hostId=host_id) + volume = wait_for_volume_healthy(client, volume_name) -@pytest.mark.skip(reason="TODO") -def test_volume_disk_soft_anti_affinity(): # NOQA + +def test_volume_disk_soft_anti_affinity(client, volume_name, request): # NOQA """ 1. When Replica Disk Soft Anti-Affinity is disabled, it should be impossible to schedule replicas to the same disk. 
@@ -1818,6 +2058,73 @@ def test_volume_disk_soft_anti_affinity(): # NOQA Then - Verify the volume is in a healthy state - Verify all three replicas are healthy - - Verify all three replicas have a different `replica.HostID` + - Verify all three replicas have a different diskID """ - pass + # Preparation + disk_path1, disk_path2 = prepare_for_affinity_tests(client, + volume_name, + request) + + # Test start + update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY, "true") + + lht_hostId = get_self_host_id() + client.create_volume(name=volume_name, size=str(500*Mi), + replicaDiskSoftAntiAffinity="disabled") + volume = wait_for_volume_detached(client, volume_name) + assert volume.replicaDiskSoftAntiAffinity == "disabled" + + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_degraded(client, volume_name) + + num_running = 0 + for replica in volume.replicas: + if replica.running: + num_running += 1 + else: + assert replica.hostId == "" + + assert num_running == 2 + + # After set update volume.updateReplicaDiskSoftAntiAffinity to enabled, + # replicas can schedule on the same disk, threrefore volume become healthy + volume = volume.updateReplicaDiskSoftAntiAffinity( + replicaDiskSoftAntiAffinity="enabled") + assert volume.replicaDiskSoftAntiAffinity == "enabled" + + volume = wait_for_volume_healthy(client, volume_name) + + disk_id = [] + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + + assert len(disk_id) == 2 + + node = client.by_id_node(lht_hostId) + disks = node.disks + for fsid, disk in iter(disks.items()): + if disk.path == disk_path2: + disk.allowScheduling = True + + # Enable disk2 + update_disks = get_update_disks(disks) + update_node_disks(client, node.name, disks=update_disks, retry=True) + + # Delete one of the two replicas with the same diskID + disk_id.clear() + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + else: + volume.replicaRemove(name=replica.name) + + volume = wait_for_volume_degraded(client, volume_name) + volume = wait_for_volume_healthy(client, volume_name) + + # Replcas should located on 3 different disks on current node + disk_id.clear() + for replica in volume.replicas: + assert replica.diskID not in disk_id + disk_id.append(replica.diskID) diff --git a/manager/integration/tests/test_settings.py b/manager/integration/tests/test_settings.py index ce54a31b46..1f025b2fae 100644 --- a/manager/integration/tests/test_settings.py +++ b/manager/integration/tests/test_settings.py @@ -22,7 +22,7 @@ get_engine_image_status_value, create_volume, create_volume_and_backup, cleanup_volume_by_name, wait_for_volume_restoration_completed, wait_for_backup_restore_completed, - get_engine_host_id, + get_engine_host_id, wait_for_instance_manager_count, Gi, Mi, LONGHORN_NAMESPACE, @@ -32,6 +32,7 @@ SETTING_DEFAULT_REPLICA_COUNT, SETTING_BACKUP_TARGET, SETTING_CONCURRENT_VOLUME_BACKUP_RESTORE, + SETTING_V1_DATA_ENGINE, RETRY_COUNTS, RETRY_INTERVAL, RETRY_INTERVAL_LONG, update_setting, BACKING_IMAGE_QCOW2_URL, BACKING_IMAGE_NAME, create_backing_image_with_matching_url, BACKING_IMAGE_EXT4_SIZE, @@ -105,8 +106,7 @@ def test_setting_toleration(): 2. Verify the request fails. 3. Create a volume and attach it. 4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". - 5. Verify that cannot update toleration setting when any volume is - attached. + 5. 
Verify that can update toleration setting when any volume is attached. 6. Generate and write `data1` into the volume. 7. Detach the volume. 8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". @@ -155,10 +155,8 @@ def test_setting_toleration(): "effect": "NoExecute" }, ] - with pytest.raises(Exception) as e: - client.update(setting, value=setting_value_str) - assert 'cannot modify toleration setting before all volumes are detached' \ - in str(e.value) + setting = client.update(setting, value=setting_value_str) + assert setting.value == setting_value_str data1 = write_volume_random_data(volume) check_volume_data(volume, data1) @@ -166,8 +164,6 @@ def test_setting_toleration(): volume.detach() wait_for_volume_detached(client, volume_name) - setting = client.update(setting, value=setting_value_str) - assert setting.value == setting_value_str wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts) client, node = wait_for_longhorn_node_ready() @@ -493,8 +489,8 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla for the Setting. 2. Create a new Priority Class in Kubernetes. 3. Create and attach a Volume. - 4. Verify that the Priority Class Setting cannot be updated with an - attached Volume. + 4. Verify that the Priority Class Setting can be updated with an attached + volume. 5. Generate and write `data1`. 6. Detach the Volume. 7. Update the Priority Class Setting to the new Priority Class. @@ -528,10 +524,8 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla volume.attach(hostId=get_self_host_id()) volume = wait_for_volume_healthy(client, volume_name) - with pytest.raises(Exception) as e: - client.update(setting, value=name) - assert 'cannot modify priority class setting before all volumes are ' \ - 'detached' in str(e.value) + setting = client.update(setting, value=name) + assert setting.value == name data1 = write_volume_random_data(volume) check_volume_data(volume, data1) @@ -539,9 +533,6 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla volume.detach() wait_for_volume_detached(client, volume_name) - setting = client.update(setting, value=name) - assert setting.value == name - wait_for_priority_class_update(core_api, apps_api, count, priority_class) client, node = wait_for_longhorn_node_ready() @@ -657,9 +648,10 @@ def test_setting_backing_image_auto_cleanup(client, core_api, volume_name): # N ] for volume_name in volume_names: - create_and_check_volume( - client, volume_name, 3, str(BACKING_IMAGE_EXT4_SIZE), - BACKING_IMAGE_NAME) + create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=BACKING_IMAGE_NAME) # Step 4 lht_host_id = get_self_host_id() @@ -941,7 +933,7 @@ def setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(cli str(concurrent_limit)) _, backup = create_volume_and_backup(client, volname + "-with-backup", - 500 * Mi, 300 * Mi) + 1000 * Mi, 600 * Mi) nodes = client.list_node() restore_volume_names = [] @@ -1238,6 +1230,66 @@ def test_setting_update_with_invalid_value_via_configmap(core_api, request): # [SETTING_BACKUP_TARGET, SETTING_TAINT_TOLERATION], [target, - ""]) + "key1=value1:NoSchedule"]) cleanup_volume_by_name(client, vol_name) + + +def test_setting_v1_data_engine(client, request): # NOQA + """ + Test that the v1 data engine setting works correctly. + 1. Create a volume and attach it. + 2. Set v1 data engine setting to false. 
The setting should be rejected. + 3. Detach the volume. + 4. Set v1 data engine setting to false again. The setting should be + accepted. Then, attach the volume. The volume is unable to attach. + 5. set v1 data engine setting to true. The setting should be accepted. + 6. Attach the volume. + """ + + setting = client.by_id_setting(SETTING_V1_DATA_ENGINE) + + # Step 1 + volume_name = "test-v1-vol" # NOQA + volume = create_and_check_volume(client, volume_name) + + def finalizer(): + cleanup_volume(client, volume) + client.update(setting, value="true") + + request.addfinalizer(finalizer) + + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) + + # Step 2 + with pytest.raises(Exception) as e: + client.update(setting, value="false") + assert 'cannot apply v1-data-engine setting to Longhorn workloads when ' \ + 'there are attached v1 volumes' in str(e.value) + + # Step 3 + volume.detach() + wait_for_volume_detached(client, volume_name) + + # Step 4 + setting = client.by_id_setting(SETTING_V1_DATA_ENGINE) + client.update(setting, value="false") + + count = wait_for_instance_manager_count(client, 0) + assert count == 0 + + volume.attach(hostId=get_self_host_id()) + with pytest.raises(Exception) as e: + wait_for_volume_healthy(client, volume_name) + assert 'volume[key]=detached' in str(e.value) + + # Step 5 + client.update(setting, value="true") + nodes = client.list_node() + count = wait_for_instance_manager_count(client, len(nodes)) + assert count == len(nodes) + + # Step 6 + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) diff --git a/manager/integration/tests/test_snapshot.py b/manager/integration/tests/test_snapshot.py index 4d4a70edb5..da8c7dc8ce 100644 --- a/manager/integration/tests/test_snapshot.py +++ b/manager/integration/tests/test_snapshot.py @@ -321,7 +321,9 @@ def detect_and_repair_corrupted_replica(client, volume_name, data_integrity_mode """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, size=str(2 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(2 * Gi)) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -572,7 +574,8 @@ def check_hashed_and_with_immediate_hash(client, volume_name, snapshot_data_inte """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(volume_size * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -604,7 +607,8 @@ def check_hashed_and_without_immediate_hash(client, volume_name, snapshot_data_i """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(16 * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -644,7 +648,8 @@ def check_per_volume_hash_disable(client, volume_name, snapshot_data_integrity): """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(16 * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -694,7 +699,9 @@ def test_snapshot_cr(client, volume_name, settings_reset): # NOQA client.update(setting, value="true") lht_hostId = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 3, size=str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(1 * Gi)) volume 
= volume.attach(hostId=lht_hostId) wait_for_volume_healthy(client, volume_name) volume = client.by_id_volume(volume_name) diff --git a/manager/integration/tests/test_upgrade.py b/manager/integration/tests/test_upgrade.py index 428d41c7b2..249af8caf5 100644 --- a/manager/integration/tests/test_upgrade.py +++ b/manager/integration/tests/test_upgrade.py @@ -205,6 +205,12 @@ def longhorn_upgrade(longhorn_install_method, longhorn_repo_url, longhorn_repo_branch], shell=False) + elif longhorn_install_method == "fleet": + command = "./pipelines/fleet/scripts/upgrade-longhorn.sh" + process = subprocess.Popen([command, + longhorn_repo_url, + longhorn_repo_branch], + shell=False) process.wait() if process.returncode == 0: @@ -343,11 +349,9 @@ def test_upgrade(longhorn_upgrade_type, set_backupstore_nfs(client) mount_nfs_backupstore(client) backup_vol_name = "backup-vol" - backup_vol = create_and_check_volume( - client, - backup_vol_name, - 2, - str(DEFAULT_VOLUME_SIZE * Gi)) + backup_vol = create_and_check_volume(client, backup_vol_name, + num_of_replicas=2, + size=str(DEFAULT_VOLUME_SIZE * Gi)) backup_vol.attach(hostId=host_id) backup_vol = wait_for_volume_healthy(client, backup_vol_name) data0 = {'pos': 0, 'len': BACKUP_BLOCK_SIZE, diff --git a/manager/integration/tests/test_zone.py b/manager/integration/tests/test_zone.py index 6fcc66f5ff..5965a7bf39 100644 --- a/manager/integration/tests/test_zone.py +++ b/manager/integration/tests/test_zone.py @@ -8,8 +8,12 @@ from common import pvc, pod # NOQA from common import volume_name # NOQA +from common import cleanup_node_disks from common import get_self_host_id +from common import get_update_disks +from common import update_node_disks + from common import create_and_wait_pod from common import create_pv_for_volume from common import create_pvc_for_volume @@ -141,7 +145,8 @@ def test_zone_tags(client, core_api, volume_name, k8s_node_zone_tags): # NOQA wait_longhorn_node_zone_updated(client) - volume = create_and_check_volume(client, volume_name, num_of_replicas=2) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2) host_id = get_self_host_id() @@ -503,6 +508,166 @@ def test_replica_auto_balance_zone_best_effort(client, core_api, volume_name): assert z3_r_count == 2 +def test_replica_auto_balance_when_disabled_disk_scheduling_in_zone(client, core_api, volume_name): # NOQA + """ + Scenario: replica auto-balance when disk scheduling is disabled on nodes + in a zone. + + Issue: https://github.com/longhorn/longhorn/issues/6508 + + Given `replica-soft-anti-affinity` setting is `true`. + And node-1 is in zone-1. + node-2 is in zone-2. + node-3 is in zone-3. + And disk scheduling is disabled on node-3. + And create a volume with 3 replicas. + And attach the volume to test pod node. + And 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + + When set `replica-auto-balance` to `best-effort`. + + Then 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + And replica count remains stable across zones and nodes. + """ + # Set `replica-soft-anti-affinity` to `true`. 
+ update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + + # Assign nodes to respective zones + node1, node2, node3 = client.list_node() + set_k8s_node_zone_label(core_api, node1.name, ZONE1) + set_k8s_node_zone_label(core_api, node2.name, ZONE2) + set_k8s_node_zone_label(core_api, node3.name, ZONE3) + wait_longhorn_node_zone_updated(client) + + # Disable disk scheduling on node 3 + cleanup_node_disks(client, node3.name) + + # Create a volume with 3 replicas + num_of_replicas = 3 + volume = client.create_volume(name=volume_name, + numberOfReplicas=num_of_replicas) + + # Wait for the volume to detach and attach it to the test pod node + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=get_self_host_id()) + + # Define a function to assert replica count + def assert_replica_count(is_stable=False): + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + zone3_replica_count = get_zone_replica_count( + client, volume_name, ZONE3, chk_running=True) + assert zone3_replica_count == 0 + + total_replica_count = \ + get_zone_replica_count( + client, volume_name, ZONE1, chk_running=True) + \ + get_zone_replica_count( + client, volume_name, ZONE2, chk_running=True) + + if is_stable: + assert total_replica_count == num_of_replicas + elif total_replica_count == num_of_replicas: + break + + assert total_replica_count == 3 + + # Perform the initial assertion to ensure the replica count is as expected + assert_replica_count() + + # Update the replica-auto-balance setting to `best-effort` + update_setting(client, SETTING_REPLICA_AUTO_BALANCE, "best-effort") + + # Perform the final assertion to ensure the replica count is as expected, + # and stable after the setting update + assert_replica_count(is_stable=True) + + +def test_replica_auto_balance_when_no_storage_available_in_zone(client, core_api, volume_name): # NOQA + """ + Scenario: replica auto-balance when there is no storage available on nodes + in a zone. + + Issue: https://github.com/longhorn/longhorn/issues/6671 + + Given `replica-soft-anti-affinity` setting is `true`. + And node-1 is in zone-1. + node-2 is in zone-2. + node-3 is in zone-3. + And fill up the storage on node-3. + And create a volume with 3 replicas. + And attach the volume to test pod node. + And 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + + When set `replica-auto-balance` to `best-effort`. + + Then 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + And replica count remains stable across zones and nodes. + """ + # Set `replica-soft-anti-affinity` to `true`. 
+ update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + + # Assign nodes to respective zones + node1, node2, node3 = client.list_node() + set_k8s_node_zone_label(core_api, node1.name, ZONE1) + set_k8s_node_zone_label(core_api, node2.name, ZONE2) + set_k8s_node_zone_label(core_api, node3.name, ZONE3) + wait_longhorn_node_zone_updated(client) + + # Fill up the storage on node 3 + for _, disk in node3.disks.items(): + disk.storageReserved = disk.storageMaximum + + update_disks = get_update_disks(node3.disks) + update_node_disks(client, node3.name, disks=update_disks, retry=True) + + # Create a volume with 3 replicas + num_of_replicas = 3 + volume = client.create_volume(name=volume_name, + numberOfReplicas=num_of_replicas) + + # Wait for the volume to detach and attach it to the test pod node + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=get_self_host_id()) + + # Define a function to assert replica count + def assert_replica_count(is_stable=False): + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + zone3_replica_count = get_zone_replica_count( + client, volume_name, ZONE3, chk_running=True) + assert zone3_replica_count == 0 + + total_replica_count = \ + get_zone_replica_count( + client, volume_name, ZONE1, chk_running=True) + \ + get_zone_replica_count( + client, volume_name, ZONE2, chk_running=True) + + if is_stable: + assert total_replica_count == num_of_replicas + elif total_replica_count == num_of_replicas: + break + + assert total_replica_count == 3 + + # Perform the initial assertion to ensure the replica count is as expected + assert_replica_count() + + # Update the replica-auto-balance setting to `best-effort` + update_setting(client, SETTING_REPLICA_AUTO_BALANCE, "best-effort") + + # Perform the final assertion to ensure the replica count is as expected, + # and stable after the setting update + assert_replica_count(is_stable=True) + + def test_replica_auto_balance_when_replica_on_unschedulable_node(client, core_api, volume_name, request): # NOQA """ Scenario: replica auto-balance when replica already running on diff --git a/mirror_csi_images/scripts/publish.sh b/mirror_csi_images/scripts/publish.sh index 5f3cb322f3..4d918fb099 100755 --- a/mirror_csi_images/scripts/publish.sh +++ b/mirror_csi_images/scripts/publish.sh @@ -13,13 +13,21 @@ if [[ -n "${LONGHORN_IMAGES_FILE_URL}" ]]; then CSI_IMAGE=$(echo "${LINE}" | sed -e "s/longhornio\///g") IFS=: read -ra IMAGE_TAG_PAIR <<< "${CSI_IMAGE}" echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + elif [[ "${LINE}" =~ "support-bundle-kit" ]]; then + SUPPORT_BUNDLE_KIT_IMAGE=$(echo "${LINE}" | sed -e "s/longhornio\///g") + IFS=: read -ra IMAGE_TAG_PAIR <<< "${SUPPORT_BUNDLE_KIT_IMAGE}" + echo "rancher/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" fi done < "${LONGHORN_IMAGES_FILE}" else IFS=, read -ra CSI_IMAGES_ARR <<< "${CSI_IMAGES}" for CSI_IMAGE in "${CSI_IMAGES_ARR[@]}"; do IFS=: read -ra IMAGE_TAG_PAIR <<< "$CSI_IMAGE" - echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + if [[ "${CSI_IMAGE}" =~ "csi-" ]]; then + echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + elif [[ "${CSI_IMAGE}" =~ "support-bundle-kit" ]]; then + echo "rancher/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" 
"${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + fi done fi diff --git a/pipelines/e2e/Dockerfile.setup b/pipelines/e2e/Dockerfile.setup index 4b79eafa60..5fd0be7b8c 100644 --- a/pipelines/e2e/Dockerfile.setup +++ b/pipelines/e2e/Dockerfile.setup @@ -25,7 +25,7 @@ RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_V wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev py3-virtualenv && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ chmod 700 get_helm.sh && \ diff --git a/pipelines/fleet/Dockerfile.setup b/pipelines/fleet/Dockerfile.setup new file mode 100644 index 0000000000..18deaed96b --- /dev/null +++ b/pipelines/fleet/Dockerfile.setup @@ -0,0 +1,34 @@ +From alpine:latest + +ARG KUBECTL_VERSION=v1.20.2 + +ARG RKE_VERSION=v1.3.4 + +ARG TERRAFORM_VERSION=1.3.5 + +ARG YQ_VERSION=v4.24.2 + +ENV WORKSPACE /src/longhorn-tests + +WORKDIR $WORKSPACE + +RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl && \ + mv kubectl /usr/local/bin/kubectl && \ + chmod +x /usr/local/bin/kubectl && \ + wget -q https://github.com/rancher/rke/releases/download/$RKE_VERSION/rke_linux-amd64 && \ + mv rke_linux-amd64 /usr/bin/rke && \ + chmod +x /usr/bin/rke && \ + wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip && rm terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + mv terraform /usr/bin/terraform && \ + chmod +x /usr/bin/terraform && \ + wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ + mv yq_linux_amd64 /usr/local/bin/yq && \ + chmod +x /usr/local/bin/yq && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip && \ + ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ + chmod 700 get_helm.sh && \ + ./get_helm.sh + +COPY [".", "$WORKSPACE"] diff --git a/pipelines/fleet/Jenkinsfile b/pipelines/fleet/Jenkinsfile new file mode 100644 index 0000000000..d2fdf43a09 --- /dev/null +++ b/pipelines/fleet/Jenkinsfile @@ -0,0 +1,163 @@ +def imageName = "${JOB_BASE_NAME}-${env.BUILD_NUMBER}" +def summary +def WORKSPACE = "/src/longhorn-tests" +def BUILD_TRIGGER_BY = "\n${currentBuild.getBuildCauses()[0].shortDescription}" + +// define optional parameters +def SELINUX_MODE = params.SELINUX_MODE ? params.SELINUX_MODE : "" + +def CREDS_ID = JOB_BASE_NAME == "longhorn-tests-regression" ? "AWS_CREDS_RANCHER_QA" : "AWS_CREDS" +def REGISTRATION_CODE_ID = params.ARCH == "amd64" ? "REGISTRATION_CODE" : "REGISTRATION_CODE_ARM64" + +// parameters for air gap installation +def AIR_GAP_INSTALLATION = params.AIR_GAP_INSTALLATION ? params.AIR_GAP_INSTALLATION : false +def CIS_HARDENING = params.CIS_HARDENING ? params.CIS_HARDENING : false +def REGISTRY_URL +def REGISTRY_USERNAME +def REGISTRY_PASSWORD + +// parameter for hdd test +def USE_HDD = params.USE_HDD ? 
params.USE_HDD : false + +node { + + withCredentials([ + usernamePassword(credentialsId: CREDS_ID, passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY'), + string(credentialsId: REGISTRATION_CODE_ID, variable: 'REGISTRATION_CODE'), + ]) { + + if (params.SEND_SLACK_NOTIFICATION) { + notifyBuild('STARTED', BUILD_TRIGGER_BY, params.NOTIFY_SLACK_CHANNEL) + } + + checkout scm + + try { + + stage('build') { + + echo "Using credentials: $CREDS_ID" + echo "Using registration code: $REGISTRATION_CODE_ID" + + sh "pipelines/fleet/scripts/build.sh" + sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ + --env REGISTRY_URL=${REGISTRY_URL} \ + --env REGISTRY_USERNAME=${REGISTRY_USERNAME} \ + --env REGISTRY_PASSWORD=${REGISTRY_PASSWORD} \ + --env FLEET_REPO_URI=${FLEET_REPO_URI} \ + --env FLEET_REPO_VERSION=${FLEET_REPO_VERSION} \ + --env LONGHORN_TESTS_CUSTOM_IMAGE=${LONGHORN_TESTS_CUSTOM_IMAGE} \ + --env DISTRO=${DISTRO} \ + --env FLEET_REPO_STABLE_VERSION=${FLEET_REPO_STABLE_VERSION} \ + --env FLEET_REPO_TRANSIENT_VERSION=${FLEET_REPO_TRANSIENT_VERSION} \ + --env LONGHORN_TEST_CLOUDPROVIDER=${LONGHORN_TEST_CLOUDPROVIDER} \ + --env LONGHORN_UPGRADE_TEST=${LONGHORN_UPGRADE_TEST} \ + --env PYTEST_CUSTOM_OPTIONS="${PYTEST_CUSTOM_OPTIONS}" \ + --env BACKUP_STORE_TYPE="${BACKUP_STORE_TYPE}" \ + --env TF_VAR_use_hdd=${USE_HDD} \ + --env TF_VAR_arch=${ARCH} \ + --env TF_VAR_k8s_distro_name=${K8S_DISTRO_NAME} \ + --env TF_VAR_k8s_distro_version=${K8S_DISTRO_VERSION} \ + --env TF_VAR_aws_availability_zone=${AWS_AVAILABILITY_ZONE} \ + --env TF_VAR_aws_region=${AWS_REGION} \ + --env TF_VAR_os_distro_version=${DISTRO_VERSION} \ + --env TF_VAR_lh_aws_access_key=${AWS_ACCESS_KEY} \ + --env TF_VAR_lh_aws_instance_name_controlplane="${JOB_BASE_NAME}-ctrl" \ + --env TF_VAR_lh_aws_instance_name_worker="${JOB_BASE_NAME}-wrk" \ + --env TF_VAR_lh_aws_instance_type_controlplane=${CONTROLPLANE_INSTANCE_TYPE} \ + --env TF_VAR_lh_aws_instance_type_worker=${WORKER_INSTANCE_TYPE}\ + --env TF_VAR_lh_aws_secret_key=${AWS_SECRET_KEY} \ + --env TF_VAR_selinux_mode=${SELINUX_MODE} \ + --env TF_VAR_registration_code=${REGISTRATION_CODE} \ + --env TF_VAR_cis_hardening=${CIS_HARDENING} \ + ${imageName} + """ + } + + timeout(60) { + stage ('terraform') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/terraform_setup.sh" + } + } + + stage ('longhorn setup & tests') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/fleet/scripts/longhorn-setup.sh" + } + + stage ('download support bundle') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/download_support_bundle.sh ${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip" + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip ." + archiveArtifacts allowEmptyArchive: true, artifacts: '**/*.zip', followSymlinks: false + } + + stage ('report generation') { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-junit-report.xml ." + + if(params.LONGHORN_UPGRADE_TEST && params.FLEET_REPO_TRANSIENT_VERSION) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-transient-junit-report.xml ." 
+ summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-upgrade-from-transient-junit-report.xml, longhorn-test-junit-report.xml' + } + else if(params.LONGHORN_UPGRADE_TEST) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." + summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-junit-report.xml' + } + else { + summary = junit 'longhorn-test-junit-report.xml' + } + } + + } catch (e) { + currentBuild.result = "FAILED" + throw e + } finally { + stage ('releasing resources') { + + if (sh (script: "docker container inspect ${JOB_BASE_NAME}-${BUILD_NUMBER} > /dev/null 2>&1", returnStatus: true) == 0) { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/cleanup.sh" + sh "docker stop ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rm -v ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rmi ${imageName}" + } + + if (summary) { + summary_msg = "\nTest Summary - Failures: ${summary.failCount}, Skipped: ${summary.skipCount}, Passed: ${summary.passCount} -- Job completed in ${currentBuild.durationString.replace(' and counting', '')}" + } else { + summary_msg = "\n Test Failed: No Junit report" + } + + if(params.SEND_SLACK_NOTIFICATION){ + notifyBuild(currentBuild.result, summary_msg, params.NOTIFY_SLACK_CHANNEL) + } + } + } + } + +} + + +def notifyBuild(String buildStatus = 'STARTED', String summary_msg, String slack_channel) { + // build status of null means successful + buildStatus = buildStatus ?: 'SUCCESSFUL' + + // Default values + def colorName = 'RED' + def colorCode = '#FF0000' + def subject = "${buildStatus}: Job '${env.JOB_BASE_NAME} [${env.BUILD_NUMBER}]'" + def summary = "${subject} (${env.BUILD_URL})" + summary_msg + + // Override default values based on build status + if (buildStatus == 'STARTED') { + color = 'YELLOW' + colorCode = '#FFFF00' + } else if (buildStatus == 'SUCCESSFUL') { + color = 'GREEN' + colorCode = '#00FF00' + } else { + color = 'RED' + colorCode = '#FF0000' + } + + // Send notifications + slackSend (color: colorCode, message: summary, channel: slack_channel, tokenCredentialId: 'longhorn-tests-slack-token') +} diff --git a/pipelines/fleet/scripts/build.sh b/pipelines/fleet/scripts/build.sh new file mode 100755 index 0000000000..0e4b6813ac --- /dev/null +++ b/pipelines/fleet/scripts/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build --no-cache -f ./pipelines/fleet/Dockerfile.setup -t "${JOB_BASE_NAME}-${BUILD_NUMBER}" . diff --git a/pipelines/fleet/scripts/longhorn-setup.sh b/pipelines/fleet/scripts/longhorn-setup.sh new file mode 100755 index 0000000000..4332155876 --- /dev/null +++ b/pipelines/fleet/scripts/longhorn-setup.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +set -x + +source pipelines/utilities/kubeconfig.sh +source pipelines/utilities/selinux_workaround.sh +source pipelines/utilities/install_csi_snapshotter.sh +source pipelines/utilities/create_aws_secret.sh +source pipelines/utilities/install_backupstores.sh +source pipelines/utilities/create_longhorn_namespace.sh +source pipelines/utilities/fleet.sh +source pipelines/utilities/run_longhorn_test.sh + + +export LONGHORN_NAMESPACE="longhorn-system" +export LONGHORN_INSTALL_METHOD="fleet" + + +main(){ + set_kubeconfig + + if [[ ${DISTRO} == "rhel" ]] || [[ ${DISTRO} == "rockylinux" ]] || [[ ${DISTRO} == "oracle" ]]; then + apply_selinux_workaround + fi + + # set debugging mode off to avoid leaking aws secrets to the logs. + # DON'T REMOVE! 
+ set +x + create_aws_secret + set -x + + create_longhorn_namespace + install_backupstores + install_csi_snapshotter + + install_fleet + + if [[ "${LONGHORN_UPGRADE_TEST}" == true ]]; then + create_fleet_git_repo "${FLEET_REPO_STABLE_VERSION}" + LONGHORN_UPGRADE_TYPE="from_stable" + LONGHORN_UPGRADE_TEST_POD_NAME="longhorn-test-upgrade-from-stable" + if [[ -n "${FLEET_REPO_TRANSIENT_VERSION}" ]]; then + UPGRADE_LH_REPO_URL="${FLEET_REPO_URI}" + UPGRADE_LH_REPO_BRANCH="${FLEET_REPO_TRANSIENT_VERSION}" + UPGRADE_LH_ENGINE_IMAGE="longhornio/longhorn-engine:${FLEET_REPO_TRANSIENT_VERSION}" + run_longhorn_upgrade_test + LONGHORN_UPGRADE_TYPE="from_transient" + LONGHORN_UPGRADE_TEST_POD_NAME="longhorn-test-upgrade-from-transient" + fi + UPGRADE_LH_REPO_URL="${FLEET_REPO_URI}" + UPGRADE_LH_REPO_BRANCH="${FLEET_REPO_VERSION}" + UPGRADE_LH_ENGINE_IMAGE="longhornio/longhorn-engine:${FLEET_REPO_VERSION}" + run_longhorn_upgrade_test + run_longhorn_test + else + create_fleet_git_repo + run_longhorn_test + fi +} + +main diff --git a/pipelines/fleet/scripts/upgrade-longhorn.sh b/pipelines/fleet/scripts/upgrade-longhorn.sh new file mode 100755 index 0000000000..13571ec608 --- /dev/null +++ b/pipelines/fleet/scripts/upgrade-longhorn.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -x + +export FLEET_REPO_URI="${1}" +export FLEET_REPO_VERSION="${2}" + +source pipelines/utilities/fleet.sh + +export LONGHORN_NAMESPACE="longhorn-system" + +create_fleet_git_repo diff --git a/pipelines/utilities/argocd.sh b/pipelines/utilities/argocd.sh index fcfdb890be..cb6d6c0e8d 100755 --- a/pipelines/utilities/argocd.sh +++ b/pipelines/utilities/argocd.sh @@ -34,7 +34,27 @@ init_argocd(){ create_argocd_app(){ REVISION="${1:-${LONGHORN_INSTALL_VERSION}}" - argocd app create longhorn --repo "${LONGHORN_REPO_URI}" --revision "${REVISION}" --path chart --dest-server https://kubernetes.default.svc --dest-namespace "${LONGHORN_NAMESPACE}" + cat > longhorn-application.yaml < longhorn-gitrepo.yaml <&1 | awk '{print $1}' | grep csi-` ]] || \ + [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep engine-image-` ]] || \ [[ -n `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $3}' | grep -v "Running\|Completed"` ]]; do echo "Longhorn is still installing ... re-checking in 1m" sleep ${RETRY_INTERVAL} diff --git a/pipelines/utilities/run_longhorn_e2e_test.sh b/pipelines/utilities/run_longhorn_e2e_test.sh index 589bdc4d52..c819dfdd35 100755 --- a/pipelines/utilities/run_longhorn_e2e_test.sh +++ b/pipelines/utilities/run_longhorn_e2e_test.sh @@ -74,6 +74,8 @@ run_longhorn_e2e_test_out_of_cluster(){ export LONGHORN_BACKUPSTORE_POLL_INTERVAL="30" cd e2e + python3 -m venv . 
+ source bin/activate pip install -r requirements.txt eval "ROBOT_COMMAND_ARGS=($PYTEST_CUSTOM_OPTIONS)" diff --git a/test_framework/Dockerfile.setup b/test_framework/Dockerfile.setup index a507e256c2..d4c035e2f1 100644 --- a/test_framework/Dockerfile.setup +++ b/test_framework/Dockerfile.setup @@ -25,8 +25,7 @@ RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_V wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssl openssh-client ca-certificates git rsync bash curl jq chromium chromium-chromedriver python3 py3-pip && \ - pip3 install -U selenium==3.141.0 && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ chmod 700 get_helm.sh && \ diff --git a/test_framework/Jenkinsfile b/test_framework/Jenkinsfile index 56b1a081bf..3f2c7bd508 100644 --- a/test_framework/Jenkinsfile +++ b/test_framework/Jenkinsfile @@ -18,6 +18,7 @@ def RANCHER_CHART_GIT_BRANCH = params.RANCHER_CHART_GIT_BRANCH ? params.RANCHER_ def RANCHER_CHART_INSTALL_VERSION = params.RANCHER_CHART_INSTALL_VERSION ? params.RANCHER_CHART_INSTALL_VERSION : "" def LONGHORN_TRANSIENT_VERSION = params.LONGHORN_TRANSIENT_VERSION ? params.LONGHORN_TRANSIENT_VERSION : "" def CIS_HARDENING = params.CIS_HARDENING ? params.CIS_HARDENING : false +def CUSTOM_SSH_PUBLIC_KEY = params.CUSTOM_SSH_PUBLIC_KEY ? params.CUSTOM_SSH_PUBLIC_KEY : "" def REGISTRY_URL def REGISTRY_USERNAME def REGISTRY_PASSWORD @@ -136,6 +137,7 @@ node { --env TF_VAR_azure_tenant_id=${AZURE_TENANT_ID} \ --env TF_VAR_azure_subscription_id=${AZURE_SUBSCRIPTION_ID} \ --env TF_VAR_cis_hardening=${CIS_HARDENING} \ + --env TF_VAR_custom_ssh_public_key="${CUSTOM_SSH_PUBLIC_KEY}" \ ${imageName} """ diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index e39c04924a..691dedabf3 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -56,9 +56,13 @@ install_cluster_autoscaler(){ install_csi_snapshotter_crds(){ CSI_SNAPSHOTTER_REPO_URL="https://github.com/kubernetes-csi/external-snapshotter.git" - CSI_SNAPSHOTTER_REPO_BRANCH="v6.2.1" CSI_SNAPSHOTTER_REPO_DIR="${TMPDIR}/k8s-csi-external-snapshotter" + [[ "${LONGHORN_REPO_URI}" =~ https://([^/]+)/([^/]+)/([^/.]+)(.git)? 
]] + wget "https://raw.githubusercontent.com/${BASH_REMATCH[2]}/${BASH_REMATCH[3]}/${LONGHORN_REPO_BRANCH}/deploy/longhorn-images.txt" -O "/tmp/longhorn-images.txt" + IFS=: read -ra IMAGE_TAG_PAIR <<< $(grep csi-snapshotter /tmp/longhorn-images.txt) + CSI_SNAPSHOTTER_REPO_BRANCH="${IMAGE_TAG_PAIR[1]}" + git clone --single-branch \ --branch "${CSI_SNAPSHOTTER_REPO_BRANCH}" \ "${CSI_SNAPSHOTTER_REPO_URL}" \ @@ -89,9 +93,10 @@ install_rancher() { get_rancher_api_key() { - python3 "${TF_VAR_tf_workspace}/scripts/rancher/webdriver/main.py" "${RANCHER_HOSTNAME}" "${RANCHER_BOOTSTRAP_PASSWORD}" - RANCHER_ACCESS_KEY=`cat "${PWD}/access_key"` - RANCHER_SECRET_KEY=`cat "${PWD}/secret_key"` + TOKEN=$(curl -X POST -s -k "https://${RANCHER_HOSTNAME}/v3-public/localproviders/local?action=login" -H 'Content-Type: application/json' -d "{\"username\":\"admin\", \"password\":\"${RANCHER_BOOTSTRAP_PASSWORD}\", \"responseType\": \"json\"}" | jq -r '.token' | tr -d '"') + ARR=(${TOKEN//:/ }) + RANCHER_ACCESS_KEY=${ARR[0]} + RANCHER_SECRET_KEY=${ARR[1]} } @@ -116,10 +121,11 @@ wait_longhorn_status_running(){ local RETRY_COUNTS=10 # in minutes local RETRY_INTERVAL="1m" - # csi components are installed after longhorn components. + # csi and engine image components are installed after longhorn components. # it's possible that all longhorn components are running but csi components aren't created yet. RETRIES=0 while [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep csi-` ]] || \ + [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep engine-image-` ]] || \ [[ -n `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $3}' | grep -v Running` ]]; do echo "Longhorn is still installing ... re-checking in 1m" sleep ${RETRY_INTERVAL} @@ -317,6 +323,8 @@ run_longhorn_upgrade_test(){ yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[4].value="'${LONGHORN_UPGRADE_TYPE}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[7].value="'${RESOURCE_SUFFIX}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + kubectl apply -f ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} # wait upgrade test pod to start running diff --git a/test_framework/scripts/rancher/webdriver/main.py b/test_framework/scripts/rancher/webdriver/main.py deleted file mode 100644 index b57fb7ed63..0000000000 --- a/test_framework/scripts/rancher/webdriver/main.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import sys - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.wait import WebDriverWait - -wait_timeout = 10 -click_retry_timeout = 20 - - -def navigate_and_wait_for(target, expect): - success = False - retry = 0 - max_retry = 10 - while success is not True and retry < max_retry: - try: - driver.get(target) - wait_for(expect) - success = True - except ElementClickInterceptedException as e: - raise e - except Exception as e: - print(f'[retry {retry}] try to navigate to {target} and wait for {expect} failed ... 
{repr(e)} {e}') - retry += 1 - - -def wait_for(target): - try: - WebDriverWait(driver, wait_timeout).until( - expected_conditions.presence_of_element_located((By.XPATH, elements[target])) - ) - return True - except Exception as e: - print(f'wait for {target} error: {e}') - return False - - -def click(target): - driver.find_element_by_xpath(elements[target]).click() - - -def click_and_wait(target, expect): - success = False - retry = 0 - max_retry = 10 - while success is not True and retry < max_retry: - try: - _target = driver.find_element_by_xpath(elements[target]) - _target.click() - WebDriverWait(driver, click_retry_timeout).until( - expected_conditions.presence_of_element_located((By.XPATH, elements[expect])) - ) - success = True - except ElementClickInterceptedException as e: - raise e - except Exception as e: - print(f'[retry {retry}] try to click {target} and wait for {expect} failed ... {repr(e)} {e}') - retry += 1 - - -def send_keys(target, keys): - _target = driver.find_element_by_xpath(elements[target]) - _target.send_keys(keys) - - -def get_element(element): - content = driver.find_element_by_xpath(elements[element]).text - return content - - -elements = { - 'username_input': '//*[@id="username"]', - 'password_input': '//*[@type="password"]', - 'login': '//button[@id="submit"]', - 'agree': '(//*[contains(@class, "checkbox-custom")])[2]', - 'continue': '//*[@type="submit" and not(@disabled)]', - 'local_cluster': '//*[contains(@href, "/local")]', - 'create_api_key': '//button[contains(text(), "Create API Key")]', - 'create_confirm': '//button//*[contains(text() ,"Create")]', - 'access_key': '//*[contains(@class, "with-copy")][1]/span', - 'secret_key': '//*[contains(@class, "with-copy")][2]/span', - 'create_done': '//button//*[contains(text() ,"Done")]' -} - -if __name__ == '__main__': - - url = 'https://' + sys.argv[1] - login_url = url + '/dashboard/auth/login' - account_url = url + '/dashboard/account' - - options = webdriver.ChromeOptions() - prefs = { - 'profile.default_content_setting_values.notifications': 2 - } - options.add_experimental_option('prefs', prefs) - options.add_argument('--headless') - options.add_argument('--ignore-certificate-errors') - options.add_argument('--no-sandbox') - options.add_argument('window-size=1920,1200') - - driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options) - - success = False - _retry = 0 - _max_retry = 3 - - while not success and _retry < _max_retry: - try: - print(login_url) - driver.get(login_url) - try: - wait_for('username_input') - send_keys('username_input', 'admin') - except Exception as e: - print(f'no username field {e}') - send_keys('password_input', sys.argv[2]) - click_and_wait('login', 'agree') - click_and_wait('agree', 'continue') - click_and_wait('continue', 'local_cluster') - navigate_and_wait_for(account_url, 'create_api_key') - click_and_wait('create_api_key', 'create_confirm') - click_and_wait('create_confirm', 'access_key') - access_key = get_element('access_key') - secret_key = get_element('secret_key') - click('create_done') - with open('access_key', 'w') as f: - f.write(str(access_key)) - with open('secret_key', 'w') as f: - f.write(str(secret_key)) - success = True - except Exception as e: - print(f'parsing error: {e}') - _retry += 1 - - driver.quit() diff --git a/test_framework/scripts/terraform-setup.sh b/test_framework/scripts/terraform-setup.sh index 6c7beee307..3d28f7c2c5 100755 --- a/test_framework/scripts/terraform-setup.sh +++ b/test_framework/scripts/terraform-setup.sh @@ 
-32,6 +32,8 @@ terraform_setup(){ if [[ "${TF_VAR_create_load_balancer}" == true ]]; then terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > ${TF_VAR_tf_workspace}/load_balancer_url fi + + export RESOURCE_SUFFIX=$(terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw resource_suffix) } diff --git a/test_framework/terraform/aws/oracle/data.tf b/test_framework/terraform/aws/oracle/data.tf index 0365770e87..1072b2ce91 100644 --- a/test_framework/terraform/aws/oracle/data.tf +++ b/test_framework/terraform/aws/oracle/data.tf @@ -27,6 +27,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -37,6 +38,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -47,6 +49,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -57,5 +60,6 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/oracle/output.tf b/test_framework/terraform/aws/oracle/output.tf index ea0c7196c7..3e0941a2c1 100644 --- a/test_framework/terraform/aws/oracle/output.tf +++ b/test_framework/terraform/aws/oracle/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl index 36fc5dc613..092350791b 100755 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl @@ -19,3 +19,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl index 9fcdcb2c59..1e5ca030c2 100755 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl @@ -21,3 +21,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl index 2ee3532529..f4f8089780 100644 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl @@ -33,4 +33,9 @@ EOF systemctl enable rke2-agent.service systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + exit $? 
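Note: the new custom_ssh_public_key variable reaches these provisioning templates through Terraform's standard TF_VAR_ environment-variable convention, which is how the Jenkinsfile change above wires CUSTOM_SSH_PUBLIC_KEY in. A minimal manual sketch, assuming the repository root as the working directory and a placeholder key value:

# supply an extra debugging key to every provisioned node (key value is a placeholder)
export TF_VAR_custom_ssh_public_key="ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAA... debug@example"
terraform -chdir=test_framework/terraform/aws/rhel init
terraform -chdir=test_framework/terraform/aws/rhel apply -auto-approve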
diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl index 742a31967d..d670a60be0 100644 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl @@ -29,3 +29,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/variables.tf b/test_framework/terraform/aws/oracle/variables.tf index 2c08e6ec0a..aa588e6f1d 100644 --- a/test_framework/terraform/aws/oracle/variables.tf +++ b/test_framework/terraform/aws/oracle/variables.tf @@ -114,3 +114,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} diff --git a/test_framework/terraform/aws/rhel/data.tf b/test_framework/terraform/aws/rhel/data.tf index a5e590adc8..c80ecb91d5 100644 --- a/test_framework/terraform/aws/rhel/data.tf +++ b/test_framework/terraform/aws/rhel/data.tf @@ -31,6 +31,7 @@ data "template_file" "provision_k3s_server" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -43,6 +44,7 @@ data "template_file" "provision_k3s_agent" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -54,6 +56,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -65,5 +68,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/rhel/output.tf b/test_framework/terraform/aws/rhel/output.tf index ea0c7196c7..cbae005ce5 100644 --- a/test_framework/terraform/aws/rhel/output.tf +++ b/test_framework/terraform/aws/rhel/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl index 879ba574d4..5e2a5f9d0d 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl @@ -25,3 +25,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl index d767f41f5b..a874b9bb2d 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl @@ -26,3 +26,7 @@ until (sudo /usr/local/bin/kubectl get pods -A | grep 'Running'); do echo 'Waiting for k3s startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl index fec3a2e169..4884b10f4e 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl @@ -39,4 +39,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl index 78392c43b2..69d800c7bb 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl @@ -35,3 +35,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml sudo /var/lib/rancher/rke2/bin/kub echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/variables.tf b/test_framework/terraform/aws/rhel/variables.tf index 3178442037..98f211e362 100644 --- a/test_framework/terraform/aws/rhel/variables.tf +++ b/test_framework/terraform/aws/rhel/variables.tf @@ -120,3 +120,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} diff --git a/test_framework/terraform/aws/rockylinux/data.tf b/test_framework/terraform/aws/rockylinux/data.tf index ead3e87e4b..5ed2a86f44 100644 --- a/test_framework/terraform/aws/rockylinux/data.tf +++ b/test_framework/terraform/aws/rockylinux/data.tf @@ -36,6 +36,7 @@ data "template_file" "provision_k3s_server" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -48,6 +49,7 @@ data "template_file" "provision_k3s_agent" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -59,6 +61,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -70,5 +73,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/rockylinux/output.tf b/test_framework/terraform/aws/rockylinux/output.tf index 68ea8dbf23..804fa2d099 100644 --- a/test_framework/terraform/aws/rockylinux/output.tf +++ b/test_framework/terraform/aws/rockylinux/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl index 29760319ea..7dab190733 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl @@ -24,3 +24,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl index 0e77a01a26..a87c270175 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl @@ -24,3 +24,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl index 04133b229f..e2a67e95f4 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl @@ -38,4 +38,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi + exit $? 
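A quick sanity check, not part of this patch, that the injected key actually landed on a provisioned Rocky Linux node (node address and private-key path are placeholders; the rocky user matches the /home/rocky/.ssh/authorized_keys path used in the templates above):

ssh -i ~/.ssh/debug_key rocky@<node-public-ip> 'tail -n 1 ~/.ssh/authorized_keys'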
diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl index 6cab617458..c7686fc9b9 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl @@ -34,3 +34,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/rockylinux/variables.tf b/test_framework/terraform/aws/rockylinux/variables.tf index 34ee882cf3..dfce28e0ae 100644 --- a/test_framework/terraform/aws/rockylinux/variables.tf +++ b/test_framework/terraform/aws/rockylinux/variables.tf @@ -119,4 +119,10 @@ variable "use_hdd" { variable "create_load_balancer" { type = bool default = false +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/data.tf b/test_framework/terraform/aws/sle-micro/data.tf index a5969891c2..472d0ca177 100644 --- a/test_framework/terraform/aws/sle-micro/data.tf +++ b/test_framework/terraform/aws/sle-micro/data.tf @@ -20,6 +20,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -30,6 +31,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -40,6 +42,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -50,5 +53,6 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/sle-micro/k3s_instaces.tf b/test_framework/terraform/aws/sle-micro/k3s_instaces.tf index 82c0bd889e..c233186a15 100644 --- a/test_framework/terraform/aws/sle-micro/k3s_instaces.tf +++ b/test_framework/terraform/aws/sle-micro/k3s_instaces.tf @@ -112,11 +112,8 @@ resource "null_resource" "registration_controlplane_k3s" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -139,11 +136,8 @@ resource "null_resource" "registration_worker_k3s" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -252,7 +246,32 @@ resource "null_resource" "cluster_setup_worker_k3s" { } -# node initialization step 4: download KUBECONFIG file for k3s +# node initialization step 4: make sure 
k8s components running +resource "null_resource" "make_sure_k8s_components_running_controlplane_k3s" { + count = var.k8s_distro_name == "k3s" ? 1 : 0 + + depends_on = [ + null_resource.cluster_setup_controlplane_k3s, + null_resource.cluster_setup_worker_k3s + ] + + provisioner "remote-exec" { + + inline = [ + "until (kubectl get pods -A | grep 'Running'); do echo 'Waiting for k3s startup'; sleep 5; done" + ] + + connection { + type = "ssh" + user = "suse" + host = aws_eip.lh_aws_eip_controlplane[0].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} + +# node initialization step 5: download KUBECONFIG file for k3s resource "null_resource" "rsync_kubeconfig_file" { count = var.k8s_distro_name == "k3s" ? 1 : 0 @@ -260,8 +279,7 @@ resource "null_resource" "rsync_kubeconfig_file" { aws_instance.lh_aws_instance_controlplane_k3s, aws_eip.lh_aws_eip_controlplane, aws_eip_association.lh_aws_eip_assoc_k3s, - null_resource.cluster_setup_controlplane_k3s, - null_resource.cluster_setup_worker_k3s + null_resource.make_sure_k8s_components_running_controlplane_k3s ] provisioner "remote-exec" { diff --git a/test_framework/terraform/aws/sle-micro/output.tf b/test_framework/terraform/aws/sle-micro/output.tf index f1fdd5e52d..1946e2eaa4 100644 --- a/test_framework/terraform/aws/sle-micro/output.tf +++ b/test_framework/terraform/aws/sle-micro/output.tf @@ -45,3 +45,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/rke2_instances.tf b/test_framework/terraform/aws/sle-micro/rke2_instances.tf index 9f8033357c..07d2957ee6 100644 --- a/test_framework/terraform/aws/sle-micro/rke2_instances.tf +++ b/test_framework/terraform/aws/sle-micro/rke2_instances.tf @@ -112,11 +112,8 @@ resource "null_resource" "registration_controlplane_rke2" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -139,11 +136,8 @@ resource "null_resource" "registration_worker_rke2" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -252,7 +246,32 @@ resource "null_resource" "cluster_setup_worker_rke2" { } -# node initialization step 3: download KUBECONFIG file for rke2 +# node initialization step 4: make sure k8s components running +resource "null_resource" "make_sure_k8s_components_running_controlplane_rke2" { + count = var.k8s_distro_name == "rke2" ? 1 : 0 + + depends_on = [ + null_resource.cluster_setup_controlplane_rke2, + null_resource.cluster_setup_worker_rke2 + ] + + provisioner "remote-exec" { + + inline = [ + "until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml kubectl get pods -A | grep 'Running'); do echo 'Waiting for rke2 startup'; sleep 5; done" + ] + + connection { + type = "ssh" + user = "suse" + host = aws_eip.lh_aws_eip_controlplane[0].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} + +# node initialization step 5: download KUBECONFIG file for rke2 resource "null_resource" "rsync_kubeconfig_file_rke2" { count = var.k8s_distro_name == "rke2" ? 
1 : 0 @@ -260,8 +279,7 @@ resource "null_resource" "rsync_kubeconfig_file_rke2" { aws_instance.lh_aws_instance_controlplane_rke2, aws_eip.lh_aws_eip_controlplane, aws_eip_association.lh_aws_eip_assoc_rke2, - null_resource.cluster_setup_controlplane_rke2, - null_resource.cluster_setup_worker_rke2 + null_resource.make_sure_k8s_components_running_controlplane_rke2 ] provisioner "remote-exec" { diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl index b9cf4373b8..fb81b0214d 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl @@ -20,3 +20,7 @@ fi curl -sfL https://get.k3s.io | sudo INSTALL_K3S_EXEC="agent --token ${k3s_cluster_secret}" K3S_URL="${k3s_server_url}" INSTALL_K3S_VERSION="${k3s_version}" sh - sudo systemctl start k3s-agent + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl index 9bcc200570..7f254cda4b 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl @@ -2,3 +2,7 @@ curl -sfL https://get.k3s.io | sudo INSTALL_K3S_EXEC="server --node-taint "node-role.kubernetes.io/master=true:NoExecute" --node-taint "node-role.kubernetes.io/master=true:NoSchedule" --tls-san ${k3s_server_public_ip} --write-kubeconfig-mode 644 --token ${k3s_cluster_secret}" INSTALL_K3S_VERSION="${k3s_version}" sh - sudo systemctl start k3s + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl index 74d2dd67e6..6efbaab929 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl @@ -32,4 +32,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl index 9fd79f05c4..1ec23d1df3 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl @@ -15,4 +15,8 @@ EOF sudo systemctl enable rke2-server.service sudo systemctl start rke2-server.service -sudo ln -s /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl \ No newline at end of file +sudo ln -s /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/variables.tf b/test_framework/terraform/aws/sle-micro/variables.tf index f1b7afd7f6..b00745d94b 100644 --- a/test_framework/terraform/aws/sle-micro/variables.tf +++ b/test_framework/terraform/aws/sle-micro/variables.tf @@ -121,4 +121,10 @@ variable "create_load_balancer" { variable "registration_code" { type = string sensitive = true +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/sles/data.tf b/test_framework/terraform/aws/sles/data.tf index a71132950b..b8964a56dd 100644 --- a/test_framework/terraform/aws/sles/data.tf +++ b/test_framework/terraform/aws/sles/data.tf @@ -17,6 +17,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -27,6 +28,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -38,6 +40,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version cis_hardening = var.cis_hardening + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -49,5 +52,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version cis_hardening = var.cis_hardening + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/sles/output.tf b/test_framework/terraform/aws/sles/output.tf index bd2b4c12f7..291f420816 100644 --- a/test_framework/terraform/aws/sles/output.tf +++ b/test_framework/terraform/aws/sles/output.tf @@ -78,3 +78,11 @@ output "controlplane_public_ip" { ] value = aws_eip.lh_aws_eip_controlplane[0].public_ip } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl index af5faa1d4c..c799b47a07 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl @@ -34,3 
+34,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl index c04dd4cab8..2a2df03018 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl @@ -25,3 +25,6 @@ until (kubectl get pods -A | grep 'Running'); do RETRY=$((RETRY+1)) done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl index e74b801290..04a9e1959c 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl @@ -60,4 +60,9 @@ EOF fi systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + exit $? diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl index 9f5add5c49..6bf855bc44 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl @@ -53,3 +53,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl fi RETRY=$((RETRY+1)) done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/sles/variables.tf b/test_framework/terraform/aws/sles/variables.tf index 6e683a7d55..1a435ac39b 100644 --- a/test_framework/terraform/aws/sles/variables.tf +++ b/test_framework/terraform/aws/sles/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "os_distro_version" { type = string - default = "15-sp4" + default = "15-sp5" } variable "aws_ami_sles_account_number" { @@ -126,4 +126,10 @@ variable "cis_hardening" { variable "resources_owner" { type = string default = "longhorn-infra" +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/ubuntu/data.tf b/test_framework/terraform/aws/ubuntu/data.tf index 374a3069b4..9edd0495f2 100644 --- a/test_framework/terraform/aws/ubuntu/data.tf +++ b/test_framework/terraform/aws/ubuntu/data.tf @@ -16,6 +16,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -26,6 +27,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = 
var.custom_ssh_public_key } } @@ -37,6 +39,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -47,6 +50,7 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/ubuntu/output.tf b/test_framework/terraform/aws/ubuntu/output.tf index a441717d06..9e865901a5 100644 --- a/test_framework/terraform/aws/ubuntu/output.tf +++ b/test_framework/terraform/aws/ubuntu/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl index 38112a8aa6..c7b825fc63 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl @@ -13,3 +13,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl index d0a2ae2fcd..2ac9c835fe 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl @@ -15,3 +15,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl index 09804ab903..b2d58b4ed9 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl @@ -28,4 +28,9 @@ EOF systemctl enable rke2-agent.service systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl index 6543e059c7..ee3358398b 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl @@ -26,3 +26,6 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/variables.tf b/test_framework/terraform/aws/ubuntu/variables.tf index b6c53dd1b8..468a06b00e 100644 --- a/test_framework/terraform/aws/ubuntu/variables.tf +++ b/test_framework/terraform/aws/ubuntu/variables.tf @@ -115,3 +115,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} \ No newline at end of file diff --git a/test_tools/gen_data/README.md b/test_tools/gen_data/README.md new file mode 100644 index 0000000000..3e73e6720a --- /dev/null +++ b/test_tools/gen_data/README.md @@ -0,0 +1,46 @@ +# Generate test data script +Generates RWO/RWX workloads, writes data to `/mnt/data/data` in each workload, and records the md5 checksum in data.output. + +# Usage +Modify config.yaml +```yaml +storage: 1Gi # Each volume size +storageClass: longhorn-test # Need to prepare your own storage class first +dataSizeInMb: 500 +namespace: default # Needs to exist before running the script +statefulSet: # Single RWO/RWX statefulset and its replica counts + rwo: + replicas: 1 + rwx: + replicas: 0 +deployment: # Number of RWO/RWX deployments; the RWO replica count is fixed to 1 + rwo: + pvCounts: 0 + rwx: + pvCounts: 1 + deploymentReplicas: 2 # Replica count of each RWX deployment +``` + +# Generate test data + `./run.sh` + +# Clean up workloads and PVCs +`./clean.sh` + +# Output (example) +`cat data.output` + +Shows each workload name and the md5sum of the file at its mount point +``` +test-data-rwx-statefulset-0 +2bccd99c8e35ccab2cd7620a200bc3e1 + +test-data-rwx-statefulset-1 +8f96c74b8b990ff11e98d478fc65f77b + +test-data-rwo-deployment-1-7f99f8bf76-cqblb +91fc370c81957d12f01581f78e4bdeba + +test-data-rwo-deployment-2-549d6cb995-gvc79 +883c98d04e2c54c89f979b20d3fa277e +``` diff --git a/test_tools/gen_data/clean.sh b/test_tools/gen_data/clean.sh new file mode 100755 index 0000000000..7c0514ef28 --- /dev/null +++ b/test_tools/gen_data/clean.sh @@ -0,0 +1,6 @@ +#!/bin/bash +NAMESPACE=$(yq eval '.namespace' config.yaml) + +kubectl get statefulset -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete statefulset +kubectl get deployment -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete deployment +kubectl get pvc -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete pvc \ No newline at end of file diff --git a/test_tools/gen_data/config.yaml b/test_tools/gen_data/config.yaml new file mode 100644 index 0000000000..965e2a2419 --- /dev/null +++ b/test_tools/gen_data/config.yaml @@ -0,0 +1,15 @@ +storage: 500Mi +storageClass: longhorn +dataSizeInMb: 50 +namespace: default +statefulSet: + rwo: + replicas: 0 + rwx: + replicas: 0 +deployment: + rwo: + pvCounts: 0 + rwx: + pvCounts: 0 + deploymentReplicas: 0 \ No newline at end of
file diff --git a/test_tools/gen_data/deployment.yaml b/test_tools/gen_data/deployment.yaml new file mode 100644 index 0000000000..97ea776424 --- /dev/null +++ b/test_tools/gen_data/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: test-deployment + namespace: default +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 1Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: test-deployment + namespace: default + labels: + name: test-deployment +spec: + replicas: 1 + selector: + matchLabels: + name: test-deployment + template: + metadata: + labels: + name: test-deployment + spec: + containers: + - name: test-deployment + image: nginx:stable-alpine + command: ["/bin/sh"] + args: [-c, 'dd if=/dev/urandom of=/mnt/data/data bs=1M count=500; trap : TERM INT; sleep infinity & wait'] + volumeMounts: + - name: vol1 + mountPath: /mnt/data + volumes: + - name: vol1 + persistentVolumeClaim: + claimName: test-deployment diff --git a/test_tools/gen_data/run.sh b/test_tools/gen_data/run.sh new file mode 100755 index 0000000000..d9b786786a --- /dev/null +++ b/test_tools/gen_data/run.sh @@ -0,0 +1,279 @@ +#!/bin/bash +ACCESS_MODE_RWO="ReadWriteOnce" +ACCESS_MODE_RWX="ReadWriteMany" +CONFIG_FILE="config.yaml" +OUTPUT_FILE="data.output" +DEPLOYMENT_TEMPLATE="deployment.yaml" +STATEFULSET_TEMPLATE="statefulset.yaml" +RWO_DEPLOYMENT_WORKLOAD_PREFIX="test-data-rwo-deployment-" +RWX_DEPLOYMENT_WORKLOAD_PREFIX="test-data-rwx-deployment-" +RWO_STATEFULSET_NAME="test-data-rwo-statefulset" +RWX_STATEFULSET_NAME="test-data-rwx-statefulset" +RETRY_COUNTS=60 +RETRY_INTERVAL=5 +RETRY_INTERVAL_LONG=10 + +###################################################### +# Log +###################################################### +export RED='\x1b[0;31m' +export GREEN='\x1b[38;5;22m' +export CYAN='\x1b[36m' +export YELLOW='\x1b[33m' +export NO_COLOR='\x1b[0m' + +if [ -z "${LOG_TITLE}" ]; then + LOG_TITLE='' +fi +if [ -z "${LOG_LEVEL}" ]; then + LOG_LEVEL="INFO" +fi + +debug() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${GREEN}[DEBUG]${log_title} ${NO_COLOR}$1" + fi +} + +info() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${CYAN}[INFO] ${log_title} ${NO_COLOR}$1" + fi +} + +warn() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]] ||\ + [[ "${LOG_LEVEL}" == "WARN" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${YELLOW}[WARN] ${log_title} ${NO_COLOR}$1" + fi +} + +error() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]] ||\ + [[ "${LOG_LEVEL}" == "WARN" ]] ||\ + [[ "${LOG_LEVEL}" == "ERROR" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${RED}[ERROR]${log_title} ${NO_COLOR}$1" + fi +} + +###################################################### +# Check Logics +###################################################### +check_local_dependencies() { + local targets=($@) + + local all_found=true + for ((i=0; i<${#targets[@]}; i++)); do + local target=${targets[$i]} + if [ "$(which $target)" = "" ]; then + all_found=false + error "Not found: $target" + 
fi + done + + if [ "$all_found" = "false" ]; then + msg="Please install missing dependencies: ${targets[@]}." + info "$msg" + exit 1 + fi + + msg="Required dependencies '${targets[@]}' are installed." + info "$msg" +} + +check_config_input() { + NAMESPACE=$(yq eval '.namespace' config.yaml) + STORAGE_SIZE=$(yq eval '.storage' config.yaml) + STORAGE_CLASS_NAME=$(yq eval '.storageClass' config.yaml) + DATA_SIZE_IN_MB=$(yq eval '.dataSizeInMb' config.yaml) + STATEFULSET_RWO_REPLICAS=$(yq eval '.statefulSet.rwo.replicas' config.yaml) + STATEFULSET_RWX_REPLICAS=$(yq eval '.statefulSet.rwx.replicas' config.yaml) + DEPLOYMENT_RWO_COUNTS=$(yq eval '.deployment.rwo.pvCounts' config.yaml) + DEPLOYMENT_RWX_COUNTS=$(yq eval '.deployment.rwx.pvCounts' config.yaml) + DEPLOYMENT_RWX_REPLICAS=$(yq eval '.deployment.rwx.deploymentReplicas' config.yaml) + + msg="$CONFIG_FILE is not correct, please check" + # variable = "null" when yq cannot find the yaml field + [ "$STORAGE_SIZE" = "null" -o ${#STORAGE_SIZE} -eq 0 ] && error "$msg" && exit 2 + [ "$NAMESPACE" = "null" -o ${#NAMESPACE} -eq 0 ] && error "$msg" && exit 2 + [ "$STORAGE_CLASS_NAME" = "null" -o ${#STORAGE_CLASS_NAME} -eq 0 ] && error "$msg" && exit 2 + [ "$DATA_SIZE_IN_MB" = "null" -o ${#DATA_SIZE_IN_MB} -eq 0 ] && error "$msg" && exit 2 + [ "$STATEFULSET_RWO_REPLICAS" = "null" -o ${#STATEFULSET_RWO_REPLICAS} -eq 0 ] && error "$msg" && exit 2 + [ "$STATEFULSET_RWX_REPLICAS" = "null" -o ${#STATEFULSET_RWX_REPLICAS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWO_COUNTS" = "null" -o ${#DEPLOYMENT_RWO_COUNTS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWX_COUNTS" = "null" -o ${#DEPLOYMENT_RWX_COUNTS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWX_REPLICAS" = "null" -o ${#DEPLOYMENT_RWX_REPLICAS} -eq 0 ] && error "$msg" && exit 2 +} + +check_kubernetes_resources() { + if ! kubectl get storageclass "$STORAGE_CLASS_NAME" &> /dev/null; then + msg="StorageClass '$STORAGE_CLASS_NAME' does not exist." + error "$msg" + exit 1 + fi + + if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then + msg="Namespace '$NAMESPACE' does not exist." + error "$msg" + exit 1 + fi +} + +wait_workload_ready() { + local workload_type=$1 + local workload_name=$2 + local workload_replicas=$3 + local retries=0 + while [[ -n `kubectl -n $NAMESPACE get $workload_type --no-headers | grep $workload_name | awk '{print $2}' | grep -v $workload_replicas/$workload_replicas` ]]; do + msg="Pod is still creating ... 
re-checking in ${RETRY_INTERVAL}s" + info "$msg" + sleep ${RETRY_INTERVAL} + retries=$((retries+1)) + + if [[ ${retries} -eq ${RETRY_COUNTS} ]]; then echo "Error: Pod create timeout"; exit 1 ; fi + done + + +} + +record_pod_data() { + local pattern="$1" + local pod_names=($(kubectl -n $NAMESPACE get pods | grep $pattern | cut -d ' ' -f1)) + # wait for the md5sum to stabilize in case the data is large + for pod_name in "${pod_names[@]}"; do + for ((i=0; i<=$RETRY_COUNTS; i++)); do + local md5_temp1=$(kubectl -n $NAMESPACE exec -it $pod_name -- /bin/sh -c "md5sum /mnt/data/data" | cut -d ' ' -f1) + sleep ${RETRY_INTERVAL_LONG} + local md5_temp2=$(kubectl -n $NAMESPACE exec -it $pod_name -- /bin/sh -c "md5sum /mnt/data/data" | cut -d ' ' -f1) + if [ "${md5_temp1}" != "${md5_temp2}" ]; then + continue + else + local md5=${md5_temp1} + break + fi + done + msg="${pod_name} data md5: ${md5}" + info "$msg" + echo $pod_name >> $OUTPUT_FILE + echo $md5 >> $OUTPUT_FILE + echo "" >> $OUTPUT_FILE + done +} + +###################################################### +# Workloads +###################################################### +create_deployments() { + local deployment_type=$1 + if [ "${deployment_type}" == "rwo" ]; then + local deployment_replica=1 + local access_mode=$ACCESS_MODE_RWO + local deployment_cnt=$DEPLOYMENT_RWO_COUNTS + local deployment_prefix=$RWO_DEPLOYMENT_WORKLOAD_PREFIX + elif [ "${deployment_type}" == "rwx" ]; then + local deployment_replica=$DEPLOYMENT_RWX_REPLICAS + local access_mode=$ACCESS_MODE_RWX + local deployment_cnt=$DEPLOYMENT_RWX_COUNTS + local deployment_prefix=$RWX_DEPLOYMENT_WORKLOAD_PREFIX + fi + + local command="[\"-c\", \"if [ ! -f /mnt/data/data ]; then dd if=/dev/urandom of=/mnt/data/data bs=1M count=${DATA_SIZE_IN_MB}; fi; trap : TERM INT; sleep infinity & wait\"]" + for (( i=1; i<=$deployment_cnt; i++)) do + local deployment_name="${deployment_prefix}$i" + local pvc_name="pvc-${deployment_name}" + + yq -i e "select(.kind == \"PersistentVolumeClaim\").metadata.name = \"${pvc_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").metadata.namespace = \"${NAMESPACE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.accessModes[0] = \"${access_mode}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.resources.requests.storage = \"${STORAGE_SIZE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.storageClassName = \"${STORAGE_CLASS_NAME}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.namespace = \"${NAMESPACE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.labels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.replicas = ${deployment_replica}" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.selector.matchLabels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.metadata.labels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.spec.containers[0].name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.spec.volumes[0].persistentVolumeClaim.claimName = \"${pvc_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind 
== \"Deployment\").spec.template.spec.containers[0].args = ${command}" "${DEPLOYMENT_TEMPLATE}" + kubectl apply -f ${DEPLOYMENT_TEMPLATE} + wait_workload_ready "deployment" $deployment_name $deployment_replica + + done + + record_pod_data $deployment_prefix +} + +create_statefulsets() { + local stateful_type=$1 + local command="[\"-c\", \"dd if=/dev/urandom of=/mnt/data/data bs=1M count=${DATA_SIZE_IN_MB}; trap : TERM INT; sleep infinity & wait\"]" + if [ "${stateful_type}" == "rwo" ]; then + local statefulset_cnt=$STATEFULSET_RWO_REPLICAS + local access_mode=$ACCESS_MODE_RWO + local statefulset_name=$RWO_STATEFULSET_NAME + elif [ "${stateful_type}" == "rwx" ]; then + local statefulset_cnt=$STATEFULSET_RWX_REPLICAS + local access_mode=$ACCESS_MODE_RWX + local statefulset_name=$RWX_STATEFULSET_NAME + fi + + if [ "$statefulset_cnt" -eq 0 ]; then + return + fi + + yq -i e "select(.kind == \"StatefulSet\").metadata.name = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").metadata.namespace = \"${NAMESPACE}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.selector.matchLabels.app = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.serviceName = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.metadata.labels.app = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.replicas = ${statefulset_cnt}" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.spec.containers[0].name = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.spec.containers[0].args = ${command}" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.accessModes[0] = \"${access_mode}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.storageClassName = \"${STORAGE_CLASS_NAME}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.resources.requests.storage = \"${STORAGE_SIZE}\"" "${STATEFULSET_TEMPLATE}" + kubectl apply -f ${STATEFULSET_TEMPLATE} + + wait_workload_ready "statefulset" $statefulset_name $statefulset_cnt + record_pod_data $statefulset_name +} + +###################################################### +# Main logics +###################################################### +echo "" > $OUTPUT_FILE +DEPENDENCIES=("kubectl" "yq") +check_local_dependencies "${DEPENDENCIES[@]}" +check_config_input +check_kubernetes_resources +create_statefulsets "rwo" +create_statefulsets "rwx" +create_deployments "rwo" +create_deployments "rwx" diff --git a/test_tools/gen_data/statefulset.yaml b/test_tools/gen_data/statefulset.yaml new file mode 100644 index 0000000000..0478b3aa21 --- /dev/null +++ b/test_tools/gen_data/statefulset.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-statefulset + namespace: default +spec: + selector: + matchLabels: + app: test-statefulset + serviceName: test-statefulset + replicas: 2 + template: + metadata: + labels: + app: test-statefulset + spec: + containers: + - name: test-statefulset + image: nginx:stable-alpine + command: ["/bin/sh"] + args: [-c, 'dd if=/dev/urandom of=/mnt/data/data bs=1M count=500; trap : TERM INT; sleep infinity & wait'] + volumeMounts: + - name: pvc + mountPath: /mnt/data + 
volumeClaimTemplates: + - metadata: + name: pvc + spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 1Gi \ No newline at end of file
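Note on the `custom_ssh_public_key` variable threaded through the Terraform modules above: the sketch below is illustrative only and not part of the patch. The variable name, the `resource_suffix` output, and the `/home/ec2-user` path match this diff; the module directory, key file name, and the assumption that the module's other required variables are already supplied (for example via `TF_VAR_*` environment variables or a tfvars file) are hypothetical.

```bash
# Minimal sketch: inject an extra SSH public key into the SLES test cluster
# and read back the per-run resource suffix exported by the new output.
# Assumes all other required module variables are already provided.
cd test_framework/terraform/aws/sles
terraform init
terraform apply -var="custom_ssh_public_key=$(cat ~/.ssh/extra_ci_key.pub)"

# The user-data scripts append the key to /home/ec2-user/.ssh/authorized_keys
# on the provisioned nodes; the suffix identifies this run's AWS resources.
terraform output -raw resource_suffix
```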