diff --git a/.drone.yml b/.drone.yml index 281a568cbe..ceb6c24470 100644 --- a/.drone.yml +++ b/.drone.yml @@ -23,7 +23,6 @@ steps: image: rancher/dapper:v0.5.3 commands: - dapper - privileged: true volumes: - name: socket path: /var/run/docker.sock @@ -92,7 +91,6 @@ steps: image: rancher/dapper:v0.5.3 commands: - dapper - privileged: true volumes: - name: socket path: /var/run/docker.sock diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..af61c5c1c6 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,11 @@ +#### Which issue(s) this PR fixes: + +Issue # + +#### What this PR does / why we need it: + +#### Special notes for your reviewer: + +#### Additional documentation or context diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7cc4476071..416ddc1047 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -7,7 +7,7 @@ on: jobs: publish: - runs-on: [self-hosted, python3.8] + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v2 diff --git a/build_engine_test_images/Dockerfile.setup b/build_engine_test_images/Dockerfile.setup index feaf3d3811..5a4bacccf5 100644 --- a/build_engine_test_images/Dockerfile.setup +++ b/build_engine_test_images/Dockerfile.setup @@ -15,7 +15,7 @@ RUN wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraf wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssh-client ca-certificates git rsync bash curl jq docker && \ + apk add openssh-client ca-certificates git rsync bash curl jq && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa COPY [".", "$WORKSPACE"] \ No newline at end of file diff --git a/build_engine_test_images/Jenkinsfile b/build_engine_test_images/Jenkinsfile index 82623329b3..d940435469 100644 --- a/build_engine_test_images/Jenkinsfile +++ b/build_engine_test_images/Jenkinsfile @@ -15,12 +15,11 @@ node { usernamePassword(credentialsId: 'DOCKER_CREDS', passwordVariable: 'DOCKER_PASSWORD', usernameVariable: 'DOCKER_USERNAME'), usernamePassword(credentialsId: 'AWS_CREDS', passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY') ]) { - stage('build') { + stage('build') { sh "build_engine_test_images/scripts/build.sh" - sh """ docker run -itd --privileged -v /var/run/docker.sock:/var/run/docker.sock \ - --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ + sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ --env TF_VAR_build_engine_aws_access_key=${AWS_ACCESS_KEY} \ --env TF_VAR_build_engine_aws_secret_key=${AWS_SECRET_KEY} \ --env TF_VAR_docker_id=${DOCKER_USERNAME} \ diff --git a/build_engine_test_images/run.sh b/build_engine_test_images/run.sh index 9377685a9d..fc831c228e 100755 --- a/build_engine_test_images/run.sh +++ b/build_engine_test_images/run.sh @@ -26,30 +26,6 @@ if [[ -z "$TF_VAR_docker_repo" ]]; then exit 1 fi -# if commit_id is empty, we can directly check longhorn-engine:master-head's api version -if [[ -z "${TF_VAR_commit_id}" ]]; then - - docker login -u="${TF_VAR_docker_id}" -p="${TF_VAR_docker_password}" - docker pull longhornio/longhorn-engine:master-head - version=`docker run longhornio/longhorn-engine:master-head longhorn version --client-only` - CLIAPIVersion=`echo $version | jq -r ".clientVersion.cliAPIVersion"` - CLIAPIMinVersion=`echo $version | jq -r ".clientVersion.cliAPIMinVersion"` - 
ControllerAPIVersion=`echo $version | jq -r ".clientVersion.controllerAPIVersion"` - ControllerAPIMinVersion=`echo $version | jq -r ".clientVersion.controllerAPIMinVersion"` - DataFormatVersion=`echo $version | jq -r ".clientVersion.dataFormatVersion"` - DataFormatMinVersion=`echo $version | jq -r ".clientVersion.dataFormatMinVersion"` - echo "latest engine version: ${version}" - - upgrade_image="${TF_VAR_docker_repo}:upgrade-test.$CLIAPIVersion-$CLIAPIMinVersion"\ -".$ControllerAPIVersion-$ControllerAPIMinVersion"\ -".$DataFormatVersion-$DataFormatMinVersion" - - if [[ $(docker manifest inspect "${upgrade_image}") != "" ]]; then - echo "latest engine test images have already published" - exit 0 - fi -fi - trap ./scripts/cleanup.sh EXIT # Build amd64 images diff --git a/docs/content/manual/pre-release/basic-operations/storage-network.md b/docs/content/manual/pre-release/basic-operations/storage-network.md index f1f59642c5..93b29145d1 100644 --- a/docs/content/manual/pre-release/basic-operations/storage-network.md +++ b/docs/content/manual/pre-release/basic-operations/storage-network.md @@ -6,13 +6,17 @@ https://github.com/longhorn/longhorn/issues/2285 ## Test Multus version below v4.0.0 **Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.3.0/test-storage-network/) + **When** Run Longhorn core tests on the environment. -**Then** All the tests should pass. + +**Then** All the tests should pass. ## Related issue: https://github.com/longhorn/longhorn/issues/6953 ## Test Multus version above v4.0.0 **Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.6.0/test-storage-network/) + **When** Run Longhorn core tests on the environment. + **Then** All the tests should pass. diff --git a/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md b/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md index d27f7ec62b..c8426cfee9 100644 --- a/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md +++ b/docs/content/manual/pre-release/environment/cluster-using-customized-kubelet-root-directory.md @@ -2,8 +2,10 @@ title: Cluster using customize kubelet root directory --- -1. Set up a cluster using a customized kubelet root directory. - e.g., launching k3s `k3s server --kubelet-arg "root-dir=/var/lib/longhorn-test" &` +1. Set up a cluster using a customized kubelet root directory. + For example, launching k3s: + - Controller: `k3s server --kubelet-arg "root-dir=/var/lib/longhorn-test"` + - Worker: `k3s agent --kubelet-arg "root-dir=/var/lib/longhorn-test"` 2. Install `Longhorn` with env `KUBELET_ROOT_DIR` in `longhorn-driver-deployer` being set to the corresponding value. 3. Launch a pod using Longhorn volumes via StorageClass. Everything should work fine. 4. Delete the pod and the PVC. Everything should be cleaned up. 
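A rough sketch of step 2, assuming the install goes through the Longhorn Helm chart: the chart's `csi.kubeletRootDir` value is expected to end up as the `KUBELET_ROOT_DIR` env of `longhorn-driver-deployer`. The release name and namespace below are illustrative, not taken from this test plan.

```bash
# Sketch only: point Longhorn's CSI components at the customized kubelet root directory.
helm repo add longhorn https://charts.longhorn.io && helm repo update
helm install longhorn longhorn/longhorn \
  --namespace longhorn-system --create-namespace \
  --set csi.kubeletRootDir=/var/lib/longhorn-test

# Confirm the deployer received the value (env name taken from step 2 above).
kubectl -n longhorn-system get deployment longhorn-driver-deployer \
  -o jsonpath='{.spec.template.spec.containers[0].env}'
```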
diff --git a/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md b/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md index e1a8079826..cd7f88356b 100644 --- a/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md +++ b/docs/content/manual/pre-release/resiliency/pvc_provisioning_with_insufficient_storage.md @@ -5,17 +5,22 @@ title: "PVC provisioning with insufficient storage" #### Related Issue: - https://github.com/longhorn/longhorn/issues/4654 - https://github.com/longhorn/longhorn/issues/3529 +- https://github.com/longhorn/longhorn/issues/6461 #### Root Cause Analysis - https://github.com/longhorn/longhorn/issues/4654#issuecomment-1264870672 This case need to be tested on both RWO/RWX volumes -1. Create a PVC with size larger than 8589934591 GiB. +1. Create a PVC with size larger than `8589934591` GiB. - Deployment keep in pending status, RWO/RWX volume will keep in a create -> delete loop. -2. Create a PVC with size <= 8589934591 GiB, but greater than the actual available space size. - - RWO/RWX volume will be created, and volume will have annotation "longhorn.io/volume-scheduling-error": "insufficient storage volume scheduling failure" in it. -3. Create a PVC with size < the actual available space size,Resize the PVC to a not schedulable size +1. Create a PVC with size <= `8589934591` GiB, but greater than the actual available space size. + - RWO/RWX volume will be created, and the associated PV for this volume will have annotation "**longhorn.io/volume-scheduling-error**": "**insufficient storage**" in it. + - We can observe "**Scheduling Failure**" and "**Replica Scheduling Failure**" error messages on the Longhorn UI with the following details + - **Scheduling Failure** + - Replica Scheduling Failure + - Error Message: insufficient storage +1. Create a PVC with size < the actual available space size,Resize the PVC to a not schedulable size - After resize PVC to a not schedulable size, both RWO/RWX were still in scheduling status. We can modify/use https://raw.githubusercontent.com/longhorn/longhorn/master/examples/rwx/rwx-nginx-deployment.yaml to deploy RWO/RWX PVC for this test \ No newline at end of file diff --git a/docs/content/manual/pre-release/ui/_index.md b/docs/content/manual/pre-release/ui/_index.md new file mode 100644 index 0000000000..9ec12c4ba1 --- /dev/null +++ b/docs/content/manual/pre-release/ui/_index.md @@ -0,0 +1,3 @@ +--- +title: UI +--- diff --git a/docs/content/manual/pre-release/ui/ui-sanity-check.md b/docs/content/manual/pre-release/ui/ui-sanity-check.md new file mode 100644 index 0000000000..0c19f20015 --- /dev/null +++ b/docs/content/manual/pre-release/ui/ui-sanity-check.md @@ -0,0 +1,14 @@ +--- +title: ui sanity check +--- + +1. Access Longhorn UI on `Chrome`, `Firefox` and `Safari` latest/stable version. +1. Check the pages. All the text, form, tables should be proper. +1. Verify all the links at the bottom, they shouldn't be broken and redirects to right pages. +1. Check the setting page, all the settings's text, values should be proper. +1. Create `Backing Image`, `volume`, `pv`, `pvc` and `recurring jobs` using UI. +1. Take `volume snapshot`, create `volume backup`, and `system backup` using UI. +1. Restore `Backup` and `system backup` using UI. +1. Check the `events` on dashboard, they should be normal. +1. Check the logs on the volume detail page, there shouldn't be any error. +1. Check the browser's console, there shouldn't be any error. 
\ No newline at end of file diff --git a/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md b/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md index a206f8dd66..e3b5c82451 100644 --- a/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md +++ b/docs/content/manual/pre-release/upgrade/upgrade-with-new-instance-manager.md @@ -4,7 +4,7 @@ title: Test System Upgrade with New Instance Manager 1. Prepare 3 sets of longhorn-manager and longhorn-instance-manager images. 2. Deploy Longhorn with the 1st set of images. -3. Set `Guaranteed Engine Manager CPU` and `Guaranteed Replica Manager CPU` to 15 and 24, respectively. +3. Set `Guaranteed Instance Manager CPU` to 40, respectively. Then wait for the instance manager recreation. 4. Create and attach a volume to a node (node1). 5. Upgrade the Longhorn system with the 2nd set of images. @@ -13,4 +13,4 @@ title: Test System Upgrade with New Instance Manager 7. Upgrade the Longhorn system with the 3rd set of images. 8. Verify the pods of the 3rd instance manager cannot be launched on node1 since there is no available CPU for the allocation. 9. Detach the volume in the 1st instance manager pod. - Verify the related instance manager pods will be cleaned up and the new instance manager pod can be launched on node1. + Verify the related instance manager pods will be cleaned up and the new instance manager pod can be launched on node1. \ No newline at end of file diff --git a/docs/content/manual/pre-release/v2-volume/_index.md b/docs/content/manual/pre-release/v2-volume/_index.md new file mode 100644 index 0000000000..73a4d68854 --- /dev/null +++ b/docs/content/manual/pre-release/v2-volume/_index.md @@ -0,0 +1,3 @@ +--- +title: v2 volume +--- diff --git a/docs/content/manual/pre-release/v2-volume/sanity-check.md b/docs/content/manual/pre-release/v2-volume/sanity-check.md new file mode 100644 index 0000000000..55041ed418 --- /dev/null +++ b/docs/content/manual/pre-release/v2-volume/sanity-check.md @@ -0,0 +1,14 @@ +--- +title: v2 volume sanity check +--- +## Related doc: +https://longhorn.io/docs/1.6.0/v2-data-engine/features/ + +- Support both amd64 and arm64 +- Volume creation, attachment, detachment and deletion +- Automatic offline replica rebuilding +- [Orphaned replica management](https://github.com/longhorn/longhorn/issues/5827) +- Snapshot creation, deletion and reversion +- Volume backup and restoration +- [Selective v2 Data Engine activation](https://github.com/longhorn/longhorn/issues/7015) +- Upgrade Longhorn from previous version with v2 volume \ No newline at end of file diff --git a/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md new file mode 100644 index 0000000000..ba4f32d956 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md @@ -0,0 +1,43 @@ +--- +title: Test engine version enforcement +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/5842 +https://github.com/longhorn/longhorn/issues/7539 + +## Test step + +**Given** Longhorn v1.4.x cluster running +And create and attach a volume (volume-1) +And upgraded Longhorn to v1.5.x +And create and attach a volume (volume-2) + +**When** upgraded Longhorn to v1.6.0 +**Then** v1.6.0 longhorn-manager Pods should be in crashloop +``` +longhorn-manager-zrf8r 0/1 CrashLoopBackOff 2 (10s ago) 52s +longhorn-manager-zsph2 0/1 CrashLoopBackOff 2 (8s ago) 52s 
+longhorn-manager-grhsf 0/1 CrashLoopBackOff 2 (8s ago) 51s
+```
+And should see incompatible version error in longhorn-manager Pod logs
+```
+time="2023-08-17T03:03:20Z" level=fatal msg="Error starting manager: failed checking Engine upgarde path: incompatible Engine ei-7fa7c208 client API version: found version 7 is below required minimal version 8"
+```
+
+**When** downgraded Longhorn to v1.5.x
+**Then** Longhorn components should be running
+
+**When** upgraded v1.4.1 volume (volume-1) engine
+And upgraded Longhorn to v1.6.0
+**Then** Longhorn components should be running
+And v1.4.x EngineImage state should be deployed and incompatible should be true.
+```
+NAME INCOMPATIBLE STATE IMAGE REFCOUNT BUILDDATE AGE
+ei-74783864 false deployed longhornio/longhorn-engine:v1.5.1 10 28d 12m
+ei-7fa7c208 true deployed longhornio/longhorn-engine:v1.4.1 0 157d 13m
+ei-ad420081 false deployed c3y1huang/research:2017-lh-ei 0 44h 24s
+```
+
+**When** updating existing volume/engine/replica custom resources `spec.image` with `longhornio/longhorn-engine:v1.4.x`
+**Then** the update should be blocked
diff --git a/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md b/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md
new file mode 100644
index 0000000000..a5a40de4d8
--- /dev/null
+++ b/docs/content/manual/release-specific/v1.6.0/test-list-backup-when-cluster-has-node-cordoned-before-longhorn-installation.md
@@ -0,0 +1,18 @@
+---
+title: Test list backup when cluster has node cordoned before Longhorn installation
+---
+
+## Related issue
+https://github.com/longhorn/longhorn/issues/7619
+
+## Test step
+
+**Given** a cluster has 3 worker nodes.
+**And** 2 worker nodes are cordoned.
+**And** Longhorn is installed.
+
+**When** Setting up a backup target.
+
+**Then** no error is observed on the UI Backup page.
+**And** Backup custom resources are created if the backup target has existing backups.
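A minimal sketch of the setup above, assuming an NFS backup target; the node names and server address are placeholders, not values from the issue.

```bash
# Cordon two of the three worker nodes before installing Longhorn.
kubectl cordon <worker-node-1>
kubectl cordon <worker-node-2>

# After installing Longhorn, point the backup-target setting at a store
# that already holds backups (URL is a placeholder).
kubectl -n longhorn-system patch settings.longhorn.io backup-target \
  --type merge -p '{"value": "nfs://<nfs-server>:/opt/backupstore"}'

# Backup custom resources should appear for the existing backups,
# and the UI Backup page should list them without errors.
kubectl -n longhorn-system get backups.longhorn.io
```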
+ diff --git a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md new file mode 100644 index 0000000000..850c8ea8db --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-and-namespace-included-in-volume-metrics.md @@ -0,0 +1,47 @@ +--- +title: Test PVC Name and Namespace included in the volume metrics +--- + +## Related issues + +- https://github.com/longhorn/longhorn/issues/5297 +- https://github.com/longhorn/longhorn-manager/pull/2284 + +## Test step + +**Given** created 2 volumes (volume-1, volume-2) + +**When** PVC created for volume (volume-1) +And attached volumes (volume-1, volume-2) + +**Then** metrics with `longhorn_volume_` prefix should include `pvc="volume-1"` + +```bash +curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-1 +longhorn_volume_actual_size_bytes{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_capacity_bytes{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1.073741824e+09 +longhorn_volume_read_iops{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_read_latency{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_read_throughput{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_robustness{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1 +longhorn_volume_state{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 2 +longhorn_volume_write_iops{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_write_latency{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +longhorn_volume_write_throughput{pvc_namespace="default",node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 +``` + +And metrics with `longhorn_volume_` prefix should include `pvc=""` for (volume-2) + +```bash +> curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-2 +longhorn_volume_actual_size_bytes{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_capacity_bytes{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 1.073741824e+09 +longhorn_volume_read_iops{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_read_latency{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_read_throughput{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_robustness{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 1 +longhorn_volume_state{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 2 +longhorn_volume_write_iops{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_write_latency{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +longhorn_volume_write_throughput{pvc_namespace="",node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 +``` diff --git a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md b/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md deleted file mode 100644 index 340474634b..0000000000 --- 
a/docs/content/manual/release-specific/v1.6.0/test-pvc-name-included-in-volume-metrics.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Test PVC Name included in the volume metrics ---- - -## Related issue -https://github.com/longhorn/longhorn/issues/5297 - -## Test step - -**Given** created 2 volumes (volume-1, volume-2) - -**When** PVC created for volume (volume-1) -And attached volumes (volume-1, volume-2) - -**Then** metrics with `longhorn_volume_` prefix should include `pvc="volume-1"` -```bash -curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-1 -longhorn_volume_actual_size_bytes{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_capacity_bytes{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1.073741824e+09 -longhorn_volume_read_iops{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_read_latency{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_read_throughput{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_robustness{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 1 -longhorn_volume_state{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 2 -longhorn_volume_write_iops{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_write_latency{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -longhorn_volume_write_throughput{node="ip-10-0-2-151",pvc="volume-1",volume="volume-1"} 0 -``` -And metrics with `longhorn_volume_` prefix should include `pvc=""` for (volume-2) -```bash -> curl -sSL http://10.0.2.212:32744/metrics | grep longhorn_volume | grep ip-10-0-2-151 | grep volume-2 -longhorn_volume_actual_size_bytes{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_capacity_bytes{node="ip-10-0-2-151",pvc="",volume="volume-2"} 1.073741824e+09 -longhorn_volume_read_iops{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_read_latency{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_read_throughput{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_robustness{node="ip-10-0-2-151",pvc="",volume="volume-2"} 1 -longhorn_volume_state{node="ip-10-0-2-151",pvc="",volume="volume-2"} 2 -longhorn_volume_write_iops{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_write_latency{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -longhorn_volume_write_throughput{node="ip-10-0-2-151",pvc="",volume="volume-2"} 0 -``` \ No newline at end of file diff --git a/docs/content/manual/release-specific/v1.6.0/test-storage-network.md b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md index 546d0c8410..1d10cf5654 100644 --- a/docs/content/manual/release-specific/v1.6.0/test-storage-network.md +++ b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md @@ -38,6 +38,8 @@ https://github.com/longhorn/longhorn/issues/6953 ### Setup instances +#### Thin Plugin + **Given** K3s K8s cluster installed on EC2 instances. *And* Deploy Multus DaemonSet on the control-plane node. @@ -177,6 +179,140 @@ kubectl apply -f nad-192-168-0-0.yaml ``` +#### Thick Plugin + +**Given** K3s K8s cluster installed on EC2 instances. + +*And* (For K3s) Establish symbolic links on all cluster nodes. + ```bash + mkdir /etc/cni + mkdir /opt/cni + ln -s /var/lib/rancher/k3s/agent/etc/cni/net.d /etc/cni/ + ln -s /var/lib/rancher/k3s/data/current/bin /opt/cni/ + ``` + +*And* Deploy Multus DaemonSet on the control-plane node. +- Download YAML. 
+ ``` + curl -O https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/v4.0.2/deployments/multus-daemonset-thick.yml + ``` +- Edit YAML. + ``` + diff --git a/deployments/multus-daemonset-thick.yml b/deployments/multus-daemonset-thick.yml + index eaa92ece..c895651b 100644 + --- a/deployments/multus-daemonset-thick.yml + +++ b/deployments/multus-daemonset-thick.yml + @@ -152,7 +152,7 @@ spec: + serviceAccountName: multus + containers: + - name: kube-multus + - image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick + + image: ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2-thick + command: [ "/usr/src/multus-cni/bin/multus-daemon" ] + resources: + requests: + @@ -183,9 +183,11 @@ spec: + - name: hostroot + mountPath: /hostroot + mountPropagation: HostToContainer + + - name: cnibin + + mountPath: /opt/cni/bin + initContainers: + - name: install-multus-binary + - image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick + + image: ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2-thick + command: + - "cp" + - "/usr/src/multus-cni/bin/multus-shim" + ``` +- Apply YAML to K8s cluster. + ``` + kubectl apply -f multus-daemonset-thick.yml + ``` + +*And* Download `ipvlan` and put to K3s binaries path to all cluster nodes. +``` +curl -OL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz +tar -zxvf cni-plugins-linux-amd64-v1.3.0.tgz +cp ipvlan /var/lib/rancher/k3s/data/current/bin/ +``` + +*And* Setup flannels on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" +NODES=(${N1} ${N2} ${N3}) + +STORAGE_NETWORK_PREFIX="192.168" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +count=1 +for n in "${NODES[@]}"; do + [[ ${ETH1_IP} != $n ]] && ((count=count+1)) && continue + + NET=$count + break +done + +cat << EOF > /run/flannel/multus-subnet-${STORAGE_NETWORK_PREFIX}.0.0.env +FLANNEL_NETWORK=${STORAGE_NETWORK_PREFIX}.0.0/16 +FLANNEL_SUBNET=${STORAGE_NETWORK_PREFIX}.${NET}.0/24 +FLANNEL_MTU=1472 +FLANNEL_IPMASQ=true +EOF +``` +*And* Setup routes on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" + +STORAGE_NETWORK_PREFIX="192.168" +ACTION="add" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +[[ ${ETH1_IP} != ${N1} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.1.0/24 via ${N1} dev eth1 +[[ ${ETH1_IP} != ${N2} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.2.0/24 via ${N2} dev eth1 +[[ ${ETH1_IP} != ${N3} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.3.0/24 via ${N3} dev eth1 +``` + +*And* Deploy `NetworkAttachmentDefinition`. +``` +cat << EOF > nad-192-168-0-0.yaml +apiVersion: "k8s.cni.cncf.io/v1" +kind: NetworkAttachmentDefinition +metadata: + name: demo-192-168-0-0 + namespace: kube-system + #namespace: longhorn-system +spec: + config: '{ + "cniVersion": "0.3.1", + "type": "flannel", + "subnetFile": "/run/flannel/multus-subnet-192.168.0.0.env", + "dataDir": "/var/lib/cni/multus-subnet-192.168.0.0", + "delegate": { + "type": "ipvlan", + "master": "eth1", + "mode": "l3", + "capabilities": { + "ips": true + } + }, + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" + } + }' +EOF +kubectl apply -f nad-192-168-0-0.yaml +``` + + ### Test storage network **Given** Longhorn deployed. 
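Before running the test, a quick sanity check of the thick-plugin setup above can help. This is a sketch that assumes the `app=multus` label from the upstream DaemonSet manifest and the `demo-192-168-0-0` NetworkAttachmentDefinition deployed earlier.

```bash
# Multus thick-plugin pods should be Running on every node
# (label selector assumed from the upstream DaemonSet manifest).
kubectl -n kube-system get pods -l app=multus -o wide

# The NetworkAttachmentDefinition applied above should exist.
kubectl -n kube-system get network-attachment-definitions demo-192-168-0-0

# On each node, the flannel subnet file referenced by the NAD config should be present.
cat /run/flannel/multus-subnet-192.168.0.0.env
```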
diff --git a/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md new file mode 100644 index 0000000000..7e6ce16a11 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-kubelet-log-for-k3s.md @@ -0,0 +1,13 @@ +--- +title: Test Support Bundle Should Include Kubelet Log When On K3s Cluster +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/7121 + +## Test + +**Given** Longhorn installed on K3s cluster +**When** generated support-bundle +**Then** should have worker node kubelet logs in `k3s-agent-service.log` +**And** should have control-plan node kubelet log in `k3s-service.log` (if Longhorn is deployed on control-plan node) diff --git a/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md new file mode 100644 index 0000000000..14c23cf5ec --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-support-bundle-metadata-file.md @@ -0,0 +1,13 @@ +--- +title: Test Support Bundle Metadata File +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/6997 + +## Test + +**Given** Longhorn installed on SUSE Linux +**When** generated support-bundle with description and issue URL +**Then** `issuedescription` has the description in the metadata.yaml +**And** `issueurl` has the issue URL in the metadata.yaml diff --git a/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md new file mode 100644 index 0000000000..0ae906383d --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collect-spdk-related-info.md @@ -0,0 +1,105 @@ +--- +title: Test upgrade responder should collect SPDK related info +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/6033 + +## Test step + +### Prerequisite + +**Given** Patch build and deploy Longhorn. +``` +diff --git a/controller/setting_controller.go b/controller/setting_controller.go +index de77b7246..ac6263ac5 100644 +--- a/controller/setting_controller.go ++++ b/controller/setting_controller.go +@@ -49,7 +49,7 @@ const ( + var ( + upgradeCheckInterval = time.Hour + settingControllerResyncPeriod = time.Hour +- checkUpgradeURL = "https://longhorn-upgrade-responder.rancher.io/v1/checkupgrade" ++ checkUpgradeURL = "http://longhorn-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade" + ) + + type SettingController struct { +``` +> Match the checkUpgradeURL with the application name: `http://-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade` + +**And** Set setting `v2-data-engine` to `true`. +**And** [Add two block-type Disks in Longhorn Nodes](https://longhorn.io/docs/1.5.3/spdk/quick-start/#add-block-type-disks-in-longhorn-nodes). + +#### Test Collecting Longhorn Disk Type + +**Given** [Prerequisite](#prerequisite). +**And** [Deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** `longhorn_disk_block_Count` should exist the influxDB database. + `longhorn_disk_filesystem_Count` should exist the influxDB database. 
+```bash +> app_name="longhorn" +> influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SHOW FIELD KEYS FROM upgrade_request' -database="${app_name}_upgrade_responder" | grep longhorn_disk +longhorn_disk_block_count float +longhorn_disk_filesystem_count float +``` + +**And** the value in `longhorn_disk_filesystem_Count` should equal to the number of volume using the V1 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_disk_filesystem_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_disk_filesystem_count +---- ------------------------------ +1702351841122419036 1 +1702351841563938125 1 +1702351842436864452 1 +``` +**And** the value in `longhorn_disk_block_Count` should equal to the number of volume using the V2 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_disk_block_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_disk_block_count +---- ------------------------- +1702351841122419036 2 +1702351841563938125 2 +1702351842436864452 2 +``` + +#### Test Collecting Volume Backend Store Driver + +**Given** [Prerequisite](#prerequisite). +**And** Create one volume using V1 engine. + Create two volume using V2 engine. +**And** [Deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** `longhorn_volume_backend_store_driver_v1_count` should exist the influxDB database. + `longhorn_volume_backend_store_driver_v2_count` should exist the influxDB database. +```bash +> app_name="longhorn" +> influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SHOW FIELD KEYS FROM upgrade_request' -database="${app_name}_upgrade_responder" | grep longhorn_volume_backend_store_driver +longhorn_volume_backend_store_driver_v1_count float +longhorn_volume_backend_store_driver_v2_count float +``` + +**And** the value in `longhorn_volume_backend_store_driver_v1_count` should equal to the number of volume using the V1 engine. +```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_backend_store_driver_v1_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_volume_backend_store_driver_v1_count +---- --------------------------------------------- +1702351841122419036 3 +``` +**And** the value in `longhorn_volume_backend_store_driver_v2_count` should equal to the number of volume using the V2 engine. 
+```bash +> kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_backend_store_driver_v2_count" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +name: upgrade_request +time longhorn_volume_backend_store_driver_v2_count +---- --------------------------------------------- +1702351841122419036 2 +``` diff --git a/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md new file mode 100644 index 0000000000..34de7afbad --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-upgrade-responder-collectiing-average-sizes-for-v1-volumes-only.md @@ -0,0 +1,81 @@ +--- +Test upgrade-responder: Collecting Average Sizes for V1 Volumes Only +--- + +## Related issues + +- https://github.com/longhorn/longhorn/issues/7380 + +## Test step + +**Given** Patch build and deploy Longhorn. +``` +diff --git a/controller/setting_controller.go b/controller/setting_controller.go +index de77b7246..ac6263ac5 100644 +--- a/controller/setting_controller.go ++++ b/controller/setting_controller.go +@@ -49,7 +49,7 @@ const ( + var ( + upgradeCheckInterval = time.Hour + settingControllerResyncPeriod = time.Hour +- checkUpgradeURL = "https://longhorn-upgrade-responder.rancher.io/v1/checkupgrade" ++ checkUpgradeURL = "http://longhorn-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade" + ) + + type SettingController struct { +``` +> Match the checkUpgradeURL with the application name: `http://-upgrade-responder.default.svc.cluster.local:8314/v1/checkupgrade` + +**And** setting `v2-data-engine` value is `true`. +**And** add a block disk to cluster nodes. +**And** [deploy upgrade responder stack](https://github.com/longhorn/longhorn/tree/master/dev/upgrade-responder). + +**When** create 50 mi volume `lhv-v1` using v1 data engine. +**And** create 50 mi volume `lhv-v2` using v2 data engine. +**And** attach volume `lhv-v1` and write some data. +**And** attach volume `lhv-v2` and write some data. +**And** Wait 1~2 hours for collection data to send to the influxDB database. + +**Then** the value of field `longhorn_volume_average_size_bytes` in the influxdb should equal to the average size of all v1 volumes (excluding v2 volumes). +**And** the value of field `longhorn_volume_average_actual_size_bytes` in the influxdb should be equal or simular to the average actual size of all v1 volumes (excluding v2 volumes). +> It's OK for the actual size to be slightly off due to ongoing workload activities, such as data writing by the upgrade-responder. +```bash +# Get the sizes in the influxdb. +# +# Sample: +# > name: upgrade_request +# time longhorn_volume_average_actual_size_bytes longhorn_volume_average_size_bytes +# ---- ----------------------------------------- ---------------------------------- +# 1703045996398941914 73269248 1449132032 +# 1703046063248379696 73284266 1449132032 +app_name="longhorn" +influxdb_pod=$(kubectl get pod | grep influxdb | awk '{print $1}' | head -n 1) +kubectl exec -it ${influxdb_pod} -- influx -execute 'SELECT "longhorn_volume_average_actual_size_bytes", "longhorn_volume_average_size_bytes" FROM "upgrade_request"' -database="${app_name}_upgrade_responder" +``` + +```bash +# Get the sizes from Longhorn volumes. 
+ +v1_volume_count=$(kubectl get volumes -n longhorn-system -o=jsonpath='{range .items[*]}{.spec.backendStoreDriver}{"\n"}{end}' | grep -c 'v1') +echo "Number of V1 volumes: $v1_volume_count" + +# Get the expected average size. +# > Total size: 4347396096 +# > Average size: 1449132032 +total_size=$(kubectl get volumes -n longhorn-system -o=json | jq -r '[.items[] | select(.spec.backendStoreDriver != "v2") | .spec.size | tonumber] | add') +echo "Total size: $total_size" + +average_size=$(echo "scale=0; $total_size / $v1_volume_count" | bc) +echo "Average size: $average_size" + +# Get the expected average actual size. +# +# Sample: +# > Total actualSize: 220368896 +# > Average actual size: 73456298 +total_actual_size=$(kubectl get volumes -n longhorn-system -o=json | jq -r '[.items[] | select(.spec.backendStoreDriver != "v2") | .status.actualSize | tonumber] | add') +echo "Total actualSize: $total_actual_size" + +average_total_actual_size=$(echo "scale=0; $total_actual_size / $v1_volume_count" | bc) +echo "Average actual size: $average_total_actual_size" +``` diff --git a/e2e/keywords/common.resource b/e2e/keywords/common.resource index 6ccb7ec41a..8a3795bea1 100644 --- a/e2e/keywords/common.resource +++ b/e2e/keywords/common.resource @@ -2,7 +2,7 @@ Documentation Common keywords Library ../libs/keywords/common_keywords.py -Library ../libs/keywords/node_keywords.py +Library ../libs/keywords/stress_keywords.py Library ../libs/keywords/volume_keywords.py Library ../libs/keywords/recurring_job_keywords.py Library ../libs/keywords/workload_keywords.py @@ -24,6 +24,8 @@ Set test environment Set Test Variable ${deployment_list} @{statefulset_list} = Create List Set Test Variable ${statefulset_list} + @{persistentvolumeclaim_list} = Create List + Set Test Variable ${persistentvolumeclaim_list} setup_control_plane_network_latency set_backupstore diff --git a/e2e/keywords/node.resource b/e2e/keywords/node.resource index 59bb4cb61f..0dc25d63b8 100644 --- a/e2e/keywords/node.resource +++ b/e2e/keywords/node.resource @@ -66,9 +66,3 @@ Restart cluster FOR ${statefulset} IN @{statefulset_list} wait_for_workload_pod_stable ${statefulset} END - -During replica rebuilding, stress volume node cpu - stress_node_cpu_by_volume ${volume_name} - -During replica rebuilding, stress volume node memory - stress_node_memory_by_volume ${volume_name} diff --git a/e2e/keywords/stress.resource b/e2e/keywords/stress.resource new file mode 100644 index 0000000000..d4a99230fc --- /dev/null +++ b/e2e/keywords/stress.resource @@ -0,0 +1,17 @@ +*** Settings *** +Documentation Stress Node Keywords + +Library ../libs/keywords/stress_keywords.py + +*** Keywords *** +Stress the CPU of all ${role} nodes + stress_node_cpu_by_role ${role} + +Stress the CPU of all volume nodes + stress_node_cpu_by_volumes ${volume_list} + +Stress the memory of all ${role} nodes + stress_node_memory_by_role ${role} + +Stress the memory of all volume nodes + stress_node_memory_by_volumes ${volume_list} diff --git a/e2e/keywords/volume.resource b/e2e/keywords/volume.resource index 47815ced3e..723132f019 100644 --- a/e2e/keywords/volume.resource +++ b/e2e/keywords/volume.resource @@ -18,6 +18,12 @@ Create volume ${idx} with ${size} GB and ${replica_count} replicas attach_volume ${volume_name} Insert Into List ${volume_list} ${idx} ${volume_name} +Attach volume to node + attach_volume ${volume_name} + +Detach volume from node + detach_volume ${volume_name} + Write data to the volume ${volume_data_checksum} = write_volume_random_data ${volume_name} 2048 Set 
Test Variable ${volume_data_checksum} @@ -48,7 +54,7 @@ Wait until replica ${replica_0} rebuilt, delete replica ${replica_2} delete_replica ${volume_name} ${replica_2} Check data is intact - check_data ${volume_name} ${volume_data_checksum} + check_data_checksum ${volume_name} ${volume_data_checksum} Check volume ${idx} works ${volume_data_checksum} = write_volume_random_data ${volume_list}[${idx}] 1024 @@ -73,4 +79,4 @@ Wait until replica on replica node rebuilt Wait for volume of statefulset ${idx} healthy ${volume_name} = get_workload_volume_name ${statefulset_list}[${idx}] - wait_for_volume_healthy ${volume_name} \ No newline at end of file + wait_for_volume_healthy ${volume_name} diff --git a/e2e/keywords/workload.resource b/e2e/keywords/workload.resource index 6d8b91d6d0..8bbba38b96 100644 --- a/e2e/keywords/workload.resource +++ b/e2e/keywords/workload.resource @@ -3,6 +3,7 @@ Documentation Workload Keywords Library Collections Library ../libs/keywords/workload_keywords.py +Library ../libs/keywords/persistent_volume_claim_keywords.py *** Keywords *** Create deployment ${idx} with ${volume_type} volume @@ -13,6 +14,25 @@ Create statefulset ${idx} with ${volume_type} volume ${statefulset_name} = create_statefulset ${volume_type} Insert Into List ${statefulset_list} ${idx} ${statefulset_name} + ${volume_name} = get_workload_volume_name ${statefulset_name} + Insert Into List ${volume_list} ${idx} ${volume_name} + + ${pvc_name} = get_workload_pvc_name ${statefulset_name} + Insert Into List ${persistentvolumeclaim_list} ${idx} ${pvc_name} + +Scale down statefulset ${idx} to detach volume + ${statefulset} = get_statefulset ${statefulset_list}[${idx}] + ${scale_up_replica_count} = Set Variable ${statefulset.spec.replicas} + Set Test Variable ${scale_up_replica_count} + + scale_statefulset ${statefulset_list}[${idx}] 0 + wait_for_volume_detached ${volume_list}[${idx}] + +Scale up statefulset ${idx} to attach volume + scale_statefulset ${statefulset_list}[${idx}] ${scale_up_replica_count} + wait_for_volume_healthy ${volume_list}[${idx}] + wait_for_statefulset_replicas_ready ${statefulset_list}[${idx}] ${scale_up_replica_count} + Create deployment ${idx} with ${volume_type} and ${option} volume ${deployment_name} = create_deployment ${volume_type} ${option} Insert Into List ${deployment_list} ${idx} ${deployment_name} @@ -21,6 +41,15 @@ Create statefulset ${idx} with ${volume_type} and ${option} volume ${statefulset_name} = create_statefulset ${volume_type} ${option} Insert Into List ${statefulset_list} ${idx} ${statefulset_name} +Expand statefulset ${idx} volume by ${size} MiB + ${expected_size} = expand_pvc_size_by_mib ${persistentvolumeclaim_list}[${idx}] ${size} + Set Test Variable ${expected_size} + +Write ${size} MB data to statefulset ${idx} + ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] + ${pod_data_checksum} = write_pod_random_data ${pod_name} ${size} + Insert Into List ${data_checksum_list} ${idx} ${pod_data_checksum} + Keep writing data to deployment ${idx} ${pod_name} = get_workload_pod_name ${deployment_list}[${idx}] keep_writing_pod_data ${pod_name} @@ -32,12 +61,24 @@ Keep writing data to statefulset ${idx} Check deployment ${idx} works ${pod_name} = get_workload_pod_name ${deployment_list}[${idx}] ${pod_data_checksum} = write_pod_random_data ${pod_name} 1024 - check_pod_data ${pod_name} ${pod_data_checksum} + check_pod_data_checksum ${pod_name} ${pod_data_checksum} Check statefulset ${idx} works ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] 
${pod_data_checksum} = write_pod_random_data ${pod_name} 1024 - check_pod_data ${pod_name} ${pod_data_checksum} + check_pod_data_checksum ${pod_name} ${pod_data_checksum} + +Check statefulset ${idx} data is intact + ${pod_name} = get_workload_pod_name ${statefulset_list}[${idx}] + ${expected_data_checksum} = Get From List ${data_checksum_list} ${idx} + check_pod_data_checksum ${pod_name} ${expected_data_checksum} + +Wait for statefulset ${idx} volume size expanded + wait_for_volume_expand_to_size ${volume_list}[${idx}] ${expected_size} + +Wait for statefulset ${idx} volume detached + wait_for_volume_detached ${volume_list}[${idx}] Wait for statefulset ${idx} stable wait_for_workload_pod_stable ${statefulset_list}[${idx}] + diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py index 5b4ddef80b..4af52eea84 100644 --- a/e2e/libs/engine/crd.py +++ b/e2e/libs/engine/crd.py @@ -1,13 +1,13 @@ import logging -from engine.base import Base +from kubernetes import client -from utils.common_utils import k8s_cr_api +from engine.base import Base class CRD(Base): def __init__(self): - self.cr_api = k8s_cr_api() + self.obj_api = client.CustomObjectsApi() def get_engine(self, volume_name, node_name): if volume_name == "" or node_name == "": @@ -22,14 +22,22 @@ def get_engine(self, volume_name, node_name): if node_name != "": label_selector.append(f"longhornnode={node_name}") - api_response = self.cr_api.list_namespaced_custom_object( + api_response = self.obj_api.list_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", plural="engines", label_selector=",".join(label_selector) ) - return api_response + + if api_response == "" or api_response is None: + raise Exception(f"failed to get the volume {volume_name} engine") + + engines = api_response["items"] + if len(engines) == 0: + logging.warning(f"cannot get the volume {volume_name} engines") + + return engines def delete_engine(self, volume_name, node_name): if volume_name == "" or node_name == "": @@ -38,17 +46,9 @@ def delete_engine(self, volume_name, node_name): logging.info( f"delete the volume {volume_name} on node {node_name} engine") - resp = self.get_engine(volume_name, node_name) - assert resp != "", "failed to get engines" - - engines = resp['items'] - if len(engines) == 0: - logging.warning("cannot find engines") - return - - for engine in engines: + for engine in self.get_engine(volume_name, node_name): engine_name = engine['metadata']['name'] - self.cr_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", diff --git a/e2e/libs/engine/engine.py b/e2e/libs/engine/engine.py index 2003712917..e0b8e9654e 100644 --- a/e2e/libs/engine/engine.py +++ b/e2e/libs/engine/engine.py @@ -3,6 +3,8 @@ from strategy import LonghornOperationStrategy +from utility.utility import logging + class Engine(Base): @@ -15,6 +17,13 @@ def __init__(self): def get_engine(self, volume_name, node_name): return self.engine.get_engine(volume_name, node_name) + def get_engine_by_volume(self, volume): + engines = self.engine.get_engine(volume["metadata"]["name"], "") + assert len(engines) == 1, \ + f"Expected exactly one engine but found {len(engines)}" + + return engines[0] + # delete engines, if input parameters are empty then will delete all def delete_engine(self, volume_name="", node_name=""): return self.engine.delete_engine(volume_name, node_name) @@ -22,17 +31,8 @@ def delete_engine(self, volume_name="", node_name=""): def 
get_engine_state(self, volume_name, node_name): logging(f"Getting the volume {volume_name} engine on the node {node_name} state") - resp = self.get_engine(volume_name, node_name) - if resp == "" or resp is None: - raise Exception(f"failed to get the volume {volume_name} engine") - - engines = resp["items"] - if len(engines) == 0: - logging.warning(f"cannot get the volume {volume_name} engines") - return - engines_states = {} - for engine in engines: + for engine in self.engine.get_engine(volume_name, node_name): engine_name = engine["metadata"]["name"] engine_state = engine['status']['currentState'] engines_states[engine_name] = engine_state diff --git a/e2e/libs/keywords/kubelet_keywords.py b/e2e/libs/keywords/kubelet_keywords.py index 55c8e6cef6..58f33dec59 100644 --- a/e2e/libs/keywords/kubelet_keywords.py +++ b/e2e/libs/keywords/kubelet_keywords.py @@ -1,5 +1,6 @@ from kubelet.kubelet import restart_kubelet + class kubelet_keywords: def restart_kubelet(self, node_name, stop_time_in_sec): diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index c41977bbeb..fc23bb45c5 100644 --- a/e2e/libs/keywords/node_keywords.py +++ b/e2e/libs/keywords/node_keywords.py @@ -1,9 +1,8 @@ from robot.libraries.BuiltIn import BuiltIn from node import Node -from node import Stress +from node.utility import get_node_by_index -from utility.utility import get_node from utility.utility import wait_for_all_instance_manager_running @@ -11,7 +10,6 @@ class node_keywords: def __init__(self): self.node = Node() - self.stress = Stress() def reboot_volume_node(self, volume_name): volume_keywords = BuiltIn().get_library_instance('volume_keywords') @@ -24,7 +22,7 @@ def reboot_replica_node(self, volume_name): self.node.reboot_node(replica_node) def reboot_node_by_index(self, idx, power_off_time_in_min=1): - node_name = get_node(idx) + node_name = get_node_by_index(idx) self.node.reboot_node(node_name, int(power_off_time_in_min) * 60) def reboot_all_worker_nodes(self, power_off_time_in_min=1): @@ -38,16 +36,3 @@ def reboot_node_by_name(self, node_name, power_off_time_in_min=1): def wait_for_all_instance_manager_running(self): wait_for_all_instance_manager_running() - - def cleanup_stress_helper(self): - self.stress.cleanup() - - def stress_node_cpu_by_volume(self, volume_name): - volume_keywords = BuiltIn().get_library_instance('volume_keywords') - volume_node = volume_keywords.get_volume_node(volume_name) - self.stress.cpu([volume_node]) - - def stress_node_memory_by_volume(self, volume_name): - volume_keywords = BuiltIn().get_library_instance('volume_keywords') - volume_node = volume_keywords.get_volume_node(volume_name) - self.stress.memory([volume_node]) diff --git a/e2e/libs/keywords/persistent_volume_claim_keywords.py b/e2e/libs/keywords/persistent_volume_claim_keywords.py new file mode 100644 index 0000000000..5bef238a64 --- /dev/null +++ b/e2e/libs/keywords/persistent_volume_claim_keywords.py @@ -0,0 +1,16 @@ +from persistent_volume_claim import PersistentVolumeClaim + +from utility.utility import logging + +from volume.constant import MEBIBYTE + + +class persistent_volume_claim_keywords: + + def __init__(self): + self.pvc = PersistentVolumeClaim() + + def expand_pvc_size_by_mib(self, claim_name, size_in_mib): + logging(f'Expanding PVC {claim_name} by {size_in_mib} MiB') + size_in_byte = int(size_in_mib) * MEBIBYTE + return self.pvc.expand(claim_name, size_in_byte) diff --git a/e2e/libs/keywords/recurring_job_keywords.py b/e2e/libs/keywords/recurring_job_keywords.py index 
999ee40794..148f92dd43 100644 --- a/e2e/libs/keywords/recurring_job_keywords.py +++ b/e2e/libs/keywords/recurring_job_keywords.py @@ -8,7 +8,6 @@ class recurring_job_keywords: def __init__(self): self.recurring_job = RecurringJob() - def create_snapshot_recurring_job_for_volume(self, volume_name): job_name = volume_name + '-snap' self.recurring_job.create(job_name, task="snapshot") @@ -16,7 +15,6 @@ def create_snapshot_recurring_job_for_volume(self, volume_name): self.recurring_job.get(job_name) logging(f'Created recurring job {job_name} for volume {volume_name}') - def create_backup_recurring_job_for_volume(self, volume_name): job_name = volume_name + '-bak' self.recurring_job.create(job_name, task="backup") @@ -24,10 +22,8 @@ def create_backup_recurring_job_for_volume(self, volume_name): self.recurring_job.get(job_name) logging(f'Created recurring job {job_name} for volume {volume_name}') - def check_recurring_jobs_work(self, volume_name): self.recurring_job.check_jobs_work(volume_name) - def cleanup_recurring_jobs(self, volume_names): self.recurring_job.cleanup(volume_names) diff --git a/e2e/libs/keywords/stress_keywords.py b/e2e/libs/keywords/stress_keywords.py new file mode 100644 index 0000000000..f9b9928d44 --- /dev/null +++ b/e2e/libs/keywords/stress_keywords.py @@ -0,0 +1,26 @@ +from robot.libraries.BuiltIn import BuiltIn + +from node import Stress +from node.utility import list_node_names_by_role +from node.utility import list_node_names_by_volumes + + +class stress_keywords: + + def __init__(self): + self.stress = Stress() + + def cleanup_stress_helper(self): + self.stress.cleanup() + + def stress_node_cpu_by_role(self, role): + self.stress.cpu(list_node_names_by_role(role)) + + def stress_node_cpu_by_volumes(self, volume_names): + self.stress.cpu(list_node_names_by_volumes(volume_names)) + + def stress_node_memory_by_role(self, role): + self.stress.memory(list_node_names_by_role(role)) + + def stress_node_memory_by_volumes(self, volume_names): + self.stress.memory(list_node_names_by_volumes(volume_names)) diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index 192b1a303f..813a0fddbd 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -1,7 +1,10 @@ -from utility.utility import logging +from node.utility import get_node_by_index +from node.utility import list_node_names_by_role + from utility.utility import generate_volume_name -from utility.utility import get_node, list_nodes -from utility.utility import get_test_pod_running_node, get_test_pod_not_running_node +from utility.utility import get_test_pod_not_running_node +from utility.utility import get_test_pod_running_node +from utility.utility import logging from volume import Volume @@ -11,68 +14,65 @@ class volume_keywords: def __init__(self): self.volume = Volume() - def create_volume(self, size, replica_count): volume_name = generate_volume_name() + logging(f'Creating volume {volume_name}') self.volume.create(volume_name, size, replica_count) - logging(f'Created volume {volume_name}') return volume_name - def attach_volume(self, volume_name): attach_node = get_test_pod_not_running_node() - logging(f'Attached volume {volume_name} to {attach_node}') + logging(f'Attaching volume {volume_name} to {attach_node}') self.volume.attach(volume_name, attach_node) + def detach_volume(self, volume_name): + logging(f'Detaching volume {volume_name}') + self.volume.detach(volume_name) + + def wait_for_volume_expand_to_size(self, volume_name, size): + 
logging(f'Waiting for volume {volume_name} expand to {size}') + return self.volume.wait_for_volume_expand_to_size(volume_name, size) def get_volume_node(self, volume_name): volume = self.volume.get(volume_name) return volume['spec']['nodeID'] - # return volume.controllers[0].hostId - def get_replica_node(self, volume_name): - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") volume_node = self.get_volume_node(volume_name) test_pod_running_node = get_test_pod_running_node() - for node in nodes: - if node != volume_node and node != test_pod_running_node: - return node - + for worker_node in worker_nodes: + if worker_node != volume_node and worker_node != test_pod_running_node: + return worker_node def write_volume_random_data(self, volume_name, size_in_mb): return self.volume.write_random_data(volume_name, size_in_mb) - def keep_writing_data(self, volume_name): self.volume.keep_writing_data(volume_name) - - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): logging(f"Checking volume {volume_name} data with checksum {checksum}") - self.volume.check_data(volume_name, checksum) - + self.volume.check_data_checksum(volume_name, checksum) def delete_replica(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Deleting volume {volume_name}'s replica on node {replica_node}") self.volume.delete_replica(volume_name, replica_node) - def wait_for_replica_rebuilding_start(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Waiting for volume {volume_name}'s replica on node {replica_node} rebuilding started") self.volume.wait_for_replica_rebuilding_start( volume_name, replica_node ) - def wait_for_replica_rebuilding_complete(self, volume_name, replica_node): if str(replica_node).isdigit(): - replica_node = get_node(replica_node) + replica_node = get_node_by_index(replica_node) logging(f"Waiting for volume {volume_name}'s replica on node {replica_node} rebuilding completed") self.volume.wait_for_replica_rebuilding_complete( volume_name, @@ -82,8 +82,11 @@ def wait_for_replica_rebuilding_complete(self, volume_name, replica_node): def wait_for_volume_attached(self, volume_name): self.volume.wait_for_volume_attached(volume_name) + def wait_for_volume_detached(self, volume_name): + self.volume.wait_for_volume_detached(volume_name) + def wait_for_volume_healthy(self, volume_name): self.volume.wait_for_volume_healthy(volume_name) def cleanup_volumes(self, volume_names): - self.volume.cleanup(volume_names) \ No newline at end of file + self.volume.cleanup(volume_names) diff --git a/e2e/libs/keywords/workload_keywords.py b/e2e/libs/keywords/workload_keywords.py index cee3fb2c94..51450df9e5 100644 --- a/e2e/libs/keywords/workload_keywords.py +++ b/e2e/libs/keywords/workload_keywords.py @@ -1,5 +1,6 @@ from workload.workload import * + class workload_keywords: def __init__(self): @@ -14,7 +15,7 @@ def cleanup_storageclasses(self): delete_storageclass('longhorn-test-strict-local') def create_deployment(self, volume_type="rwo", option=""): - pvc_name = create_pvc(volume_type, option) + create_pvc(volume_type, option) deployment_name = create_deployment(volume_type, option) return deployment_name @@ -22,9 +23,18 @@ def create_statefulset(self, volume_type="rwo", option=""): statefulset_name = create_statefulset(volume_type, 
option) return statefulset_name + def get_statefulset(self, statefulset_name): + return get_statefulset(statefulset_name) + + def scale_statefulset(self, statefulset_name, replica_count): + return scale_statefulset(statefulset_name, replica_count) + def get_workload_pod_name(self, workload_name): return get_workload_pod_names(workload_name)[0] + def get_workload_pvc_name(self, workload_name): + return get_workload_pvc_name(workload_name) + def get_workload_volume_name(self, workload_name): return get_workload_volume_name(workload_name) @@ -34,8 +44,8 @@ def keep_writing_pod_data(self, pod_name): def write_pod_random_data(self, pod, size_in_mb): return write_pod_random_data(pod, size_in_mb) - def check_pod_data(self, pod_name, checksum): - check_pod_data(pod_name, checksum) + def check_pod_data_checksum(self, pod_name, checksum): + check_pod_data_checksum(pod_name, checksum) def cleanup_deployments(self, deployment_names): for name in deployment_names: @@ -51,3 +61,6 @@ def cleanup_statefulsets(self, statefulset_names): def wait_for_workload_pod_stable(self, workload_name): return wait_for_workload_pod_stable(workload_name) + + def wait_for_statefulset_replicas_ready(self, statefulset_name, expected_ready_count): + return wait_for_statefulset_replicas_ready(statefulset_name, expected_ready_count) diff --git a/e2e/libs/kubelet/kubelet.py b/e2e/libs/kubelet/kubelet.py index c9c5180050..06beb039da 100644 --- a/e2e/libs/kubelet/kubelet.py +++ b/e2e/libs/kubelet/kubelet.py @@ -1,11 +1,11 @@ -from utility.utility import logging import time -from workload.pod import new_pod_manifest from workload.pod import create_pod -from workload.pod import wait_for_pod_status from workload.pod import delete_pod -from workload.pod import IMAGE_UBUNTU +from workload.pod import new_pod_manifest + +from workload.constant import IMAGE_UBUNTU + def restart_kubelet(node_name, stop_time_in_sec=10): manifest = new_pod_manifest( diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py index a9d81b4b3d..b775430f52 100644 --- a/e2e/libs/network/network.py +++ b/e2e/libs/network/network.py @@ -1,29 +1,35 @@ from robot.libraries.BuiltIn import BuiltIn -from utility.utility import get_control_plane_nodes + +from node.utility import list_node_names_by_role + from node_exec import NodeExec + + def get_control_plane_node_network_latency_in_ms(): - latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}")) + latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}", default="0")) return latency_in_ms + def setup_control_plane_network_latency(): latency_in_ms = get_control_plane_node_network_latency_in_ms() if latency_in_ms != 0: - nodes = get_control_plane_nodes() - for node in nodes: + control_plane_nodes = list_node_names_by_role("control-plane") + for control_plane_node in control_plane_nodes: cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) cmd = f"tc qdisc show dev eth0 | grep delay" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) assert res, "setup control plane network latency failed" + def cleanup_control_plane_network_latency(): latency_in_ms = get_control_plane_node_network_latency_in_ms() if latency_in_ms != 0: - nodes = get_control_plane_nodes() - for node in nodes: + control_plane_nodes = 
list_node_names_by_role("control-plane") + for control_plane_node in control_plane_nodes: cmd = "tc qdisc del dev eth0 root" - res = NodeExec.get_instance().issue_cmd(node, cmd) + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) cmd = f"tc qdisc show dev eth0 | grep -v delay" - res = NodeExec.get_instance().issue_cmd(node, cmd) - assert res, "cleanup control plane network failed" \ No newline at end of file + res = NodeExec.get_instance().issue_cmd(control_plane_node, cmd) + assert res, "cleanup control plane network failed" diff --git a/e2e/libs/node/constant.py b/e2e/libs/node/constant.py new file mode 100644 index 0000000000..b7dc738512 --- /dev/null +++ b/e2e/libs/node/constant.py @@ -0,0 +1,7 @@ +NODE_STRESS_CPU_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_VM_WORKERS = 1 +NODE_STRESS_TIMEOUT_SECOND = 60 * 60 # 1 hour + +STRESS_HELPER_LABEL = "longhorn-stress-helper" +STRESS_HELPER_POD_NAME_PREFIX = "longhorn-stress-helper-" diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py index 98935bffcd..4938a473ad 100644 --- a/e2e/libs/node/node.py +++ b/e2e/libs/node/node.py @@ -4,7 +4,8 @@ from kubernetes import client -from utility.utility import list_nodes +from node.utility import list_node_names_by_role + from utility.utility import logging from utility.utility import wait_for_cluster_ready @@ -19,7 +20,7 @@ def __init__(self): def reboot_all_nodes(self, shut_down_time_in_sec=60): instance_ids = [value for value in self.mapping.values()] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) @@ -37,7 +38,7 @@ def reboot_all_nodes(self, shut_down_time_in_sec=60): def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60): instance_ids = [self.mapping[reboot_node_name]] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) @@ -52,9 +53,9 @@ def reboot_node(self, reboot_node_name, shut_down_time_in_sec=60): logging(f"Started instances") def reboot_all_worker_nodes(self, shut_down_time_in_sec=60): - instance_ids = [self.mapping[value] for value in list_nodes()] + instance_ids = [self.mapping[value] for value in list_node_names_by_role("worker")] - resp = self.aws_client.stop_instances(InstanceIds=instance_ids) + resp = self.aws_client.stop_instances(InstanceIds=instance_ids, Force=True) logging(f"Stopping instances {instance_ids} response: {resp}") waiter = self.aws_client.get_waiter('instance_stopped') waiter.wait(InstanceIds=instance_ids) diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index f142fca7c5..b293103a5c 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -1,39 +1,57 @@ -from kubernetes import client +from kubernetes.client.rest import ApiException from node.utility import get_node_cpu_cores +from node.constant import NODE_STRESS_CPU_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_VM_WORKERS +from node.constant import NODE_STRESS_TIMEOUT_SECOND +from node.constant import STRESS_HELPER_LABEL +from node.constant import 
STRESS_HELPER_POD_NAME_PREFIX + from utility.utility import logging from workload.pod import create_pod from workload.pod import delete_pod +from workload.pod import get_pod from workload.pod import new_pod_manifest from workload.workload import get_workload_pods -from workload.pod import IMAGE_LITMUX - -NODE_CPU_LOAD_PERCENTAGE = 100 -NODE_MEM_LOAD_PERCENTAGE = 100 -NODE_MEM_VM_WORKERS = 1 -NODE_STRESS_TIMEOUT_SECOND = 300 +from workload.constant import IMAGE_LITMUX -LABEL_STRESS_HELPER = "longhorn-stress-helper" class Stress: def cleanup(self): - for pod in get_workload_pods(LABEL_STRESS_HELPER): + for pod in get_workload_pods(STRESS_HELPER_LABEL): logging(f"Cleaning up stress pod {pod.metadata.name}") delete_pod(pod.metadata.name, pod.metadata.namespace) def cpu(self, node_names): for node_name in node_names: + pod_name = f"{STRESS_HELPER_POD_NAME_PREFIX}{node_name}" + + # If the helper pod creation is called inside of a test case loop, + # we need to check if the pod already running. + try: + pod = get_pod(pod_name) + if pod and pod.status.phase != "Running": + logging(f"Deleting stress pod {pod_name} in phase {pod.status.phase}") + delete_pod(pod_name) + elif pod: + logging(f"Stress pod {pod_name} already running") + continue + except ApiException as e: + assert e.status == 404 + manifest = new_pod_manifest( + pod_name=pod_name, image=IMAGE_LITMUX, command=["stress-ng"], args=['--cpu', str(get_node_cpu_cores(node_name)), - '--cpu-load', str(NODE_CPU_LOAD_PERCENTAGE), + '--cpu-load', str(NODE_STRESS_CPU_LOAD_PERCENTAGE), '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, - labels={'app': LABEL_STRESS_HELPER} + labels={'app': STRESS_HELPER_LABEL} ) pod_name = manifest['metadata']['name'] @@ -42,14 +60,30 @@ def cpu(self, node_names): def memory(self, node_names): for node_name in node_names: + pod_name = f"{STRESS_HELPER_POD_NAME_PREFIX}{node_name}" + + # If the helper pod creation is called inside of a test case loop, + # we need to check if the pod already running. 
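# Illustrative sketch (not lines of this patch): with the node/constant.py values above, the cpu() helper pod effectively runs
#     stress-ng --cpu <node CPU cores> --cpu-load 100 --timeout 3600
# and the memory() helper pod below effectively runs
#     stress-ng --vm 1 --vm-bytes 100% --timeout 3600
# i.e. full CPU or memory pressure on the target node for up to one hour.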
+ try: + pod = get_pod(pod_name) + if pod and pod.status.phase != "Running": + logging(f"Deleting stress pod {pod_name} in phase {pod.status.phase}") + delete_pod(pod_name) + elif pod: + logging(f"Stress pod {pod_name} already running") + continue + except ApiException as e: + assert e.status == 404 + manifest = new_pod_manifest( + pod_name=pod_name, image=IMAGE_LITMUX, command=["stress-ng"], - args=['--vm', str(NODE_MEM_VM_WORKERS), - '--vm-bytes', f"{NODE_MEM_LOAD_PERCENTAGE}%", + args=['--vm', str(NODE_STRESS_MEM_VM_WORKERS), + '--vm-bytes', f"{NODE_STRESS_MEM_LOAD_PERCENTAGE}%", '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, - labels={'app': LABEL_STRESS_HELPER} + labels={'app': STRESS_HELPER_LABEL} ) pod_name = manifest['metadata']['name'] diff --git a/e2e/libs/node/utility.py b/e2e/libs/node/utility.py index 571b983b6f..d89c44f1d0 100644 --- a/e2e/libs/node/utility.py +++ b/e2e/libs/node/utility.py @@ -1,9 +1,53 @@ +from robot.libraries.BuiltIn import BuiltIn + from kubernetes import client + +def get_node_by_index(index, role="worker"): + nodes = list_node_names_by_role(role) + return nodes[int(index)] + + def get_node_by_name(node_name): core_api = client.CoreV1Api() return core_api.read_node(node_name) + def get_node_cpu_cores(node_name): node = get_node_by_name(node_name) return node.status.capacity['cpu'] + + +def list_node_names_by_role(role="all"): + if role not in ["all", "control-plane", "worker"]: + raise ValueError("Role must be one of 'all', 'control-plane' or 'worker'") + + def filter_nodes(nodes, condition): + return [node.metadata.name for node in nodes if condition(node)] + + core_api = client.CoreV1Api() + nodes = core_api.list_node().items + + control_plane_labels = ['node-role.kubernetes.io/master', 'node-role.kubernetes.io/control-plane'] + + if role == "all": + return sorted(filter_nodes(nodes, lambda node: True)) + + if role == "control-plane": + condition = lambda node: all(label in node.metadata.labels for label in control_plane_labels) + return sorted(filter_nodes(nodes, condition)) + + if role == "worker": + condition = lambda node: not any(label in node.metadata.labels for label in control_plane_labels) + return sorted(filter_nodes(nodes, condition)) + + +def list_node_names_by_volumes(volume_names): + volume_nodes = {} + volume_keywords = BuiltIn().get_library_instance('volume_keywords') + + for volume_name in volume_names: + volume_node = volume_keywords.get_volume_node(volume_name) + if volume_node not in volume_nodes: + volume_nodes[volume_node] = True + return list(volume_nodes.keys()) diff --git a/e2e/libs/node_exec/constant.py b/e2e/libs/node_exec/constant.py new file mode 100644 index 0000000000..255c49afc4 --- /dev/null +++ b/e2e/libs/node_exec/constant.py @@ -0,0 +1,2 @@ +DEFAULT_POD_TIMEOUT = 180 +DEFAULT_POD_INTERVAL = 1 diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index d01f39988f..dd6aee3c1d 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -7,9 +7,9 @@ from workload.pod import wait_delete_pod from utility.utility import wait_delete_ns +from node_exec.constant import DEFAULT_POD_INTERVAL +from node_exec.constant import DEFAULT_POD_TIMEOUT -DEFAULT_POD_TIMEOUT = 180 -DEFAULT_POD_INTERVAL = 1 class NodeExec: @@ -198,4 +198,4 @@ def launch_pod(self, node_name): break time.sleep(DEFAULT_POD_INTERVAL) self.node_exec_pod[node_name] = pod - return pod \ No newline at end of file + return pod diff --git a/e2e/libs/persistent_volume_claim/__init__.py
b/e2e/libs/persistent_volume_claim/__init__.py new file mode 100644 index 0000000000..75f06a434b --- /dev/null +++ b/e2e/libs/persistent_volume_claim/__init__.py @@ -0,0 +1 @@ +from persistent_volume_claim.persistent_volume_claim import PersistentVolumeClaim diff --git a/e2e/libs/persistent_volume_claim/base.py b/e2e/libs/persistent_volume_claim/base.py new file mode 100644 index 0000000000..43d45fba0c --- /dev/null +++ b/e2e/libs/persistent_volume_claim/base.py @@ -0,0 +1,12 @@ +from abc import ABC, abstractmethod + + +class Base(ABC): + + @abstractmethod + def get(self, volume_name): + return NotImplemented + + @abstractmethod + def expand(self, claim_name, size, claim_namespace="default"): + return NotImplemented diff --git a/e2e/libs/persistent_volume_claim/constant.py b/e2e/libs/persistent_volume_claim/constant.py new file mode 100644 index 0000000000..82e875b169 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/persistent_volume_claim/crd.py b/e2e/libs/persistent_volume_claim/crd.py new file mode 100644 index 0000000000..bd3f80be07 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/crd.py @@ -0,0 +1,40 @@ +from kubernetes import client + +from persistent_volume_claim.base import Base + +from utility.utility import get_retry_count_and_interval +from utility.utility import logging + + +class CRD(Base): + + def __init__(self): + self.core_v1_api = client.CoreV1Api() + self.retry_count, self.retry_interval = get_retry_count_and_interval() + + def get(self, claim_name, claim_namespace="default"): + return self.core_v1_api.read_namespaced_persistent_volume_claim( + name=claim_name, + namespace=claim_namespace, + ) + + def expand(self, claim_name, size, namespace="default"): + try: + self.core_v1_api.patch_namespaced_persistent_volume_claim( + name=claim_name, + namespace=namespace, + body={ + 'spec': { + 'resources': { + 'requests': { + 'storage': str(size) + } + } + } + } + ) + return size + except client.exceptions.ApiException as e: + logging(f"Exception when expanding PVC: {e}") + + return size diff --git a/e2e/libs/persistent_volume_claim/persistent_volume_claim.py b/e2e/libs/persistent_volume_claim/persistent_volume_claim.py new file mode 100644 index 0000000000..7fce6e1ad2 --- /dev/null +++ b/e2e/libs/persistent_volume_claim/persistent_volume_claim.py @@ -0,0 +1,26 @@ +from strategy import LonghornOperationStrategy + +from persistent_volume_claim.base import Base +from persistent_volume_claim.crd import CRD + +from utility.utility import logging + + +class PersistentVolumeClaim(Base): + + _strategy = LonghornOperationStrategy.CRD + + def __init__(self): + if self._strategy == LonghornOperationStrategy.CRD: + self.pvc = CRD() + + def get(self, claim_name): + return self.pvc.get(claim_name) + + def expand(self, claim_name, size_in_byte): + pvc = self.pvc.get(claim_name) + current_size = int(pvc.spec.resources.requests['storage']) + + target_size = current_size + size_in_byte + logging(f"Expanding PVC {claim_name} from {current_size} to {target_size}") + return self.pvc.expand(claim_name, target_size) diff --git a/e2e/libs/recurring_job/base.py b/e2e/libs/recurring_job/base.py index e74e536745..5e4897fbdf 100644 --- a/e2e/libs/recurring_job/base.py +++ b/e2e/libs/recurring_job/base.py @@ -25,4 +25,4 @@ def check_jobs_work(self, volume_name): @abstractmethod def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git 
a/e2e/libs/recurring_job/constant.py b/e2e/libs/recurring_job/constant.py new file mode 100644 index 0000000000..bb5017e701 --- /dev/null +++ b/e2e/libs/recurring_job/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 180 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/recurring_job/crd.py b/e2e/libs/recurring_job/crd.py index 1b10fc8ce5..43ed450e31 100644 --- a/e2e/libs/recurring_job/crd.py +++ b/e2e/libs/recurring_job/crd.py @@ -30,4 +30,4 @@ def check_jobs_work(self, volume_name): def cleanup(self, volume_names): logging("Delegating the cleanup call to API because there is no CRD implementation") - return self.rest.cleanup(volume_names) \ No newline at end of file + return self.rest.cleanup(volume_names) diff --git a/e2e/libs/recurring_job/recurring_job.py b/e2e/libs/recurring_job/recurring_job.py index 9ee52f2347..3616619d73 100644 --- a/e2e/libs/recurring_job/recurring_job.py +++ b/e2e/libs/recurring_job/recurring_job.py @@ -38,4 +38,4 @@ def check_jobs_work(self, volume_name): return self.recurring_job.check_jobs_work(volume_name) def cleanup(self, volume_names): - return self.recurring_job.cleanup(volume_names) \ No newline at end of file + return self.recurring_job.cleanup(volume_names) diff --git a/e2e/libs/recurring_job/rest.py b/e2e/libs/recurring_job/rest.py index d988783f71..b973c6aae2 100644 --- a/e2e/libs/recurring_job/rest.py +++ b/e2e/libs/recurring_job/rest.py @@ -10,9 +10,9 @@ from utility.utility import get_longhorn_client from utility.utility import logging +from recurring_job.constant import RETRY_COUNTS +from recurring_job.constant import RETRY_INTERVAL -RETRY_COUNTS = 180 -RETRY_INTERVAL = 1 class Rest(Base): @@ -65,16 +65,21 @@ def _wait_for_volume_recurring_job_delete(self, job_name, volume_name): assert deleted def get_volume_recurring_jobs_and_groups(self, volume_name): - volume = self.client.by_id_volume(volume_name) - list = volume.recurringJobList() - jobs = [] - groups = [] - for item in list: - if item['isGroup']: - groups.append(item['name']) - else: - jobs.append(item['name']) - return jobs, groups + for _ in range(RETRY_COUNTS): + try: + volume = self.client.by_id_volume(volume_name) + list = volume.recurringJobList() + jobs = [] + groups = [] + for item in list: + if item['isGroup']: + groups.append(item['name']) + else: + jobs.append(item['name']) + return jobs, groups + except Exception as e: + logging(f"Getting volume {volume} recurring job list error: {e}") + time.sleep(RETRY_INTERVAL) def _wait_for_cron_job_create(self, job_name): created = False @@ -125,23 +130,26 @@ def _check_snapshot_created_in_time(self, volume_name, job_name, period_in_sec): snapshot_timestamp = 0 for _ in range(period_in_sec * 2): snapshot_list = filter_cr("longhorn.io", "v1beta2", "longhorn-system", "snapshots", label_selector=label_selector) - if len(snapshot_list['items']) > 0: - for item in snapshot_list['items']: - # this snapshot can be created by snapshot or backup recurring job - # but job_name is in spec.labels.RecurringJob - # and crd doesn't support field selector - # so need to filter by ourselves - if 'RecurringJob' in item['status']['labels'] and \ - item['status']['labels']['RecurringJob'] == job_name and \ - item['status']['readyToUse'] == True: - logging(f"Got snapshot {item}") - snapshot_time = item['metadata']['creationTimestamp'] - snapshot_time = datetime.strptime(snapshot_time, '%Y-%m-%dT%H:%M:%SZ') - snapshot_timestamp = snapshot_time.timestamp() - logging(f"Got snapshot time = {snapshot_time}, timestamp = {snapshot_timestamp}") - break - if snapshot_timestamp > 
current_timestamp: - return + try: + if len(snapshot_list['items']) > 0: + for item in snapshot_list['items']: + # this snapshot can be created by snapshot or backup recurring job + # but job_name is in spec.labels.RecurringJob + # and crd doesn't support field selector + # so need to filter by ourselves + if 'RecurringJob' in item['status']['labels'] and \ + item['status']['labels']['RecurringJob'] == job_name and \ + item['status']['readyToUse'] == True: + logging(f"Got snapshot {item}") + snapshot_time = item['metadata']['creationTimestamp'] + snapshot_time = datetime.strptime(snapshot_time, '%Y-%m-%dT%H:%M:%SZ') + snapshot_timestamp = snapshot_time.timestamp() + logging(f"Got snapshot time = {snapshot_time}, timestamp = {snapshot_timestamp}") + break + if snapshot_timestamp > current_timestamp: + return + except Exception as e: + logging(f"Iterating snapshot list error: {e}") time.sleep(1) assert False, f"since {current_time},\ there's no new snapshot created by recurring job \ @@ -156,17 +164,23 @@ def _check_backup_created_in_time(self, volume_name, period_in_sec): backup_timestamp = 0 for _ in range(period_in_sec * 2): backup_list = filter_cr("longhorn.io", "v1beta2", "longhorn-system", "backups", label_selector=label_selector) - if len(backup_list['items']) > 0: - state = backup_list['items'][0]['status']['state'] - if state != "InProgress" and state != "Completed": - continue - backup_time = backup_list['items'][0]['metadata']['creationTimestamp'] - backup_time = datetime.strptime(backup_time, '%Y-%m-%dT%H:%M:%SZ') - backup_timestamp = backup_time.timestamp() - logging(f"Got backup time = {backup_time}, timestamp = {backup_timestamp}") - if backup_timestamp > current_timestamp: - return + try: + if len(backup_list['items']) > 0: + state = backup_list['items'][0]['status']['state'] + if state != "InProgress" and state != "Completed": + continue + backup_time = backup_list['items'][0]['metadata']['creationTimestamp'] + backup_time = datetime.strptime(backup_time, '%Y-%m-%dT%H:%M:%SZ') + backup_timestamp = backup_time.timestamp() + logging(f"Got backup time = {backup_time}, timestamp = {backup_timestamp}") + if backup_timestamp > current_timestamp: + return + except Exception as e: + logging(f"Iterating backup list error: {e}") time.sleep(1) + logging(f"since {current_time},\ + there's no new backup created by recurring job \ + {backup_list}") assert False, f"since {current_time},\ there's no new backup created by recurring job \ {backup_list}" @@ -177,4 +192,4 @@ def cleanup(self, volume_names): jobs, _ = self.get_volume_recurring_jobs_and_groups(volume_name) for job in jobs: logging(f"Deleting recurring job {job}") - self.delete(job, volume_name) \ No newline at end of file + self.delete(job, volume_name) diff --git a/e2e/libs/replica/base.py b/e2e/libs/replica/base.py index 1fdef38f35..2abca02ca5 100644 --- a/e2e/libs/replica/base.py +++ b/e2e/libs/replica/base.py @@ -17,4 +17,4 @@ def wait_for_replica_rebuilding_start(self, volume_name, node_name): @abstractmethod def wait_for_replica_rebuilding_complete(self, volume_name, node_name): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/replica/constant.py b/e2e/libs/replica/constant.py new file mode 100644 index 0000000000..82e875b169 --- /dev/null +++ b/e2e/libs/replica/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/replica/rest.py b/e2e/libs/replica/rest.py index 6ace292c03..8c492c0b56 100644 ---
a/e2e/libs/replica/rest.py +++ b/e2e/libs/replica/rest.py @@ -3,10 +3,11 @@ from replica.base import Base from utils import common_utils +from utility.utility import logging +from replica.constant import RETRY_COUNTS +from replica.constant import RETRY_INTERVAL -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 class Rest(Base): def __init__(self, node_exec): @@ -22,46 +23,55 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None, f'failed to get rebuilding replica name' started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for status in v.rebuildStatus: - if status.replica == rebuilding_replica_name and\ - status.state == "in_progress": - started = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + for status in v.rebuildStatus: + if status.replica == rebuilding_replica_name and\ + status.state == "in_progress": + started = True + break + if started: break - if started: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f'replica {rebuilding_replica_name} rebuilding starting failed' def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - completed = True - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + completed = True + break + except Exception as e: + logging(f"Failed to get volume {e}") if completed: break time.sleep(RETRY_INTERVAL) - assert completed, f'failed rebuilding replicas' \ No newline at end of file + assert completed, f'failed rebuilding replicas' diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 7b04d260c0..83b07ab944 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -1,17 +1,23 @@ -from kubernetes import config, client, dynamic -from kubernetes.client.rest import 
ApiException -from kubernetes.stream import stream -from longhorn import from_env -import string -import random import os import socket +import string import time +import random import yaml +from longhorn import from_env + +from kubernetes import client +from kubernetes import config +from kubernetes import dynamic +from kubernetes.client.rest import ApiException + from robot.api import logger from robot.libraries.BuiltIn import BuiltIn +from node.utility import get_node_by_index +from node.utility import list_node_names_by_role + def logging(msg, also_report=False): if also_report: @@ -19,16 +25,19 @@ def logging(msg, also_report=False): else: logger.console(msg) + def get_retry_count_and_interval(): retry_count = int(BuiltIn().get_variable_value("${RETRY_COUNT}")) retry_interval = int(BuiltIn().get_variable_value("${RETRY_INTERVAL}")) return retry_count, retry_interval + def generate_name(name_prefix="test-"): return name_prefix + \ ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) + def generate_volume_name(): return generate_name("vol-") @@ -43,25 +52,6 @@ def init_k8s_api_client(): config.load_incluster_config() logging("Initialized in-cluster k8s api client") -def list_nodes(): - core_api = client.CoreV1Api() - obj = core_api.list_node() - nodes = [] - for item in obj.items: - if 'node-role.kubernetes.io/control-plane' not in item.metadata.labels and \ - 'node-role.kubernetes.io/master' not in item.metadata.labels: - nodes.append(item.metadata.name) - return sorted(nodes) - -def get_control_plane_nodes(): - core_api = client.CoreV1Api() - obj = core_api.list_node() - nodes = [] - for item in obj.items: - if 'node-role.kubernetes.io/control-plane' in item.metadata.labels or \ - 'node-role.kubernetes.io/master' in item.metadata.labels: - nodes.append(item.metadata.name) - return sorted(nodes) def wait_for_cluster_ready(): core_api = client.CoreV1Api() @@ -83,30 +73,27 @@ def wait_for_cluster_ready(): time.sleep(retry_interval) assert ready, f"expect cluster's ready but it isn't {resp}" + def wait_for_all_instance_manager_running(): - core_api = client.CoreV1Api() longhorn_client = get_longhorn_client() - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") retry_count, retry_interval = get_retry_count_and_interval() for _ in range(retry_count): logging(f"Waiting for all instance manager running ({_}) ...") - instance_managers = longhorn_client.list_instance_manager() - instance_manager_map = {} try: + instance_managers = longhorn_client.list_instance_manager() + instance_manager_map = {} for im in instance_managers: if im.currentState == "running": instance_manager_map[im.nodeID] = im - if len(instance_manager_map) == len(nodes): + if len(instance_manager_map) == len(worker_nodes): break time.sleep(retry_interval) except Exception as e: logging(f"Getting instance manager state error: {e}") - assert len(instance_manager_map) == len(nodes), f"expect all instance managers running, instance_managers = {instance_managers}, instance_manager_map = {instance_manager_map}" + assert len(instance_manager_map) == len(worker_nodes), f"expect all instance managers running, instance_managers = {instance_managers}, instance_manager_map = {instance_manager_map}" -def get_node(index): - nodes = list_nodes() - return nodes[int(index)] def apply_cr(manifest_dict): dynamic_client = dynamic.DynamicClient(client.api_client.ApiClient()) @@ -125,11 +112,13 @@ def apply_cr(manifest_dict): crd_api.create(body=manifest_dict, namespace=namespace) 
logging.info(f"{namespace}/{resource_name} created") + def apply_cr_from_yaml(filepath): with open(filepath, 'r') as f: manifest_dict = yaml.safe_load(f) apply_cr(manifest_dict) + def get_cr(group, version, namespace, plural, name): api = client.CustomObjectsApi() try: @@ -138,6 +127,7 @@ def get_cr(group, version, namespace, plural, name): except ApiException as e: logging(f"Getting namespaced custom object error: {e}") + def filter_cr(group, version, namespace, plural, field_selector="", label_selector=""): api = client.CustomObjectsApi() try: @@ -146,6 +136,7 @@ def filter_cr(group, version, namespace, plural, field_selector="", label_select except ApiException as e: logging(f"Listing namespaced custom object: {e}") + def wait_delete_ns(name): api = client.CoreV1Api() retry_count, retry_interval = get_retry_count_and_interval() @@ -161,6 +152,7 @@ def wait_delete_ns(name): time.sleep(retry_interval) assert not found + def get_mgr_ips(): ret = client.CoreV1Api().list_pod_for_all_namespaces( label_selector="app=longhorn-manager", @@ -170,6 +162,7 @@ def get_mgr_ips(): mgr_ips.append(i.status.pod_ip) return mgr_ips + def get_longhorn_client(): retry_count, retry_interval = get_retry_count_and_interval() if os.getenv('LONGHORN_CLIENT_URL'): @@ -204,18 +197,21 @@ def get_longhorn_client(): logging(f"Getting longhorn client error: {e}") time.sleep(retry_interval) + def get_test_pod_running_node(): if "NODE_NAME" in os.environ: return os.environ["NODE_NAME"] else: - return get_node(0) + return get_node_by_index(0) + def get_test_pod_not_running_node(): - nodes = list_nodes() + worker_nodes = list_node_names_by_role("worker") test_pod_running_node = get_test_pod_running_node() - for node in nodes: - if node != test_pod_running_node: - return node + for worker_node in worker_nodes: + if worker_node != test_pod_running_node: + return worker_node + def get_test_case_namespace(test_name): return test_name.lower().replace(' ', '-') diff --git a/e2e/libs/volume/base.py b/e2e/libs/volume/base.py index c9435828c5..7684928073 100644 --- a/e2e/libs/volume/base.py +++ b/e2e/libs/volume/base.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod + class Base(ABC): @abstractmethod @@ -43,9 +44,9 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): return NotImplemented @abstractmethod - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): return NotImplemented @abstractmethod def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/volume/constant.py b/e2e/libs/volume/constant.py new file mode 100644 index 0000000000..c9d6e4a990 --- /dev/null +++ b/e2e/libs/volume/constant.py @@ -0,0 +1,11 @@ +KIBIBYTE = 1024 +MEBIBYTE = (KIBIBYTE * KIBIBYTE) +GIBIBYTE = (MEBIBYTE * KIBIBYTE) + +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 + +VOLUME_FRONTEND_BLOCKDEV = "blockdev" +VOLUME_FRONTEND_ISCSI = "iscsi" + +DEV_PATH = "/dev/longhorn/" diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 1c9e4a155d..dc635ad74b 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -5,13 +5,13 @@ from utility.utility import get_retry_count_and_interval from utility.utility import logging +from engine.engine import Engine + from volume.base import Base from volume.rest import Rest +from volume.constant import GIBIBYTE -Ki = 2**10 -Mi = 2**20 -Gi = 2**30 class CRD(Base): @@ -38,7 +38,7 @@ def create(self, volume_name, size, replica_count): "spec": { "frontend": "blockdev", 
"replicaAutoBalance": "ignored", - "size": str(int(size) * Gi), + "size": str(int(size) * GIBIBYTE), "numberOfReplicas": int(replica_count) } } @@ -96,9 +96,44 @@ def attach(self, volume_name, node_name): Exception(f'exception for creating volumeattachments:', e) self.wait_for_volume_state(volume_name, "attached") + def detach(self, volume_name): + try: + self.obj_api.patch_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="volumeattachments", + name=volume_name, + body={ + "spec": { + "attachmentTickets": None, + } + } + ) + except Exception as e: + # new CRD: volumeattachments was added since from 1.5.0 + # https://github.com/longhorn/longhorn/issues/3715 + if e.reason != "Not Found": + Exception(f'exception for patching volumeattachments:', e) + + self.obj_api.patch_namespaced_custom_object( + group="longhorn.io", + version="v1beta2", + namespace="longhorn-system", + plural="volumes", + name=volume_name, + body={ + "spec": { + "nodeID": "" + } + } + ) + + self.wait_for_volume_state(volume_name, "detached") + def delete(self, volume_name): try: - resp = self.obj_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", @@ -112,7 +147,7 @@ def delete(self, volume_name): def wait_for_volume_delete(self, volume_name): for i in range(self.retry_count): try: - resp = self.obj_api.get_namespaced_custom_object( + self.obj_api.get_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", @@ -152,7 +187,7 @@ def wait_for_volume_robustness(self, volume_name, desired_state): def wait_for_volume_robustness_not(self, volume_name, not_desired_state): for i in range(self.retry_count): - logging(f"Waiting for {volume_name} not {not_desired_state} ({i}) ...") + logging(f"Waiting for {volume_name} robustness not {not_desired_state} ({i}) ...") try: if self.get(volume_name)["status"]["robustness"] != not_desired_state: break @@ -161,6 +196,21 @@ def wait_for_volume_robustness_not(self, volume_name, not_desired_state): time.sleep(self.retry_interval) assert self.get(volume_name)["status"]["robustness"] != not_desired_state + def wait_for_volume_expand_to_size(self, volume_name, expected_size): + engine = None + engine_operation = Engine() + for i in range(self.retry_count): + logging(f"Waiting for {volume_name} expand to {expected_size} ({i}) ...") + + engine = engine_operation.get_engine_by_volume(self.get(volume_name)) + if int(engine['status']['currentSize']) == expected_size: + break + + time.sleep(self.retry_interval) + + assert engine is not None + assert int(engine['status']['currentSize']) == expected_size + def get_endpoint(self, volume_name): logging("Delegating the get_endpoint call to API because there is no CRD implementation") return Rest(self.node_exec).get_endpoint(volume_name) @@ -212,7 +262,7 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): node_name ) - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): node_name = self.get(volume_name)["spec"]["nodeID"] endpoint = self.get_endpoint(volume_name) _checksum = self.node_exec.issue_cmd( @@ -225,4 +275,4 @@ def check_data(self, volume_name, checksum): def cleanup(self, volume_names): for volume_name in volume_names: logging(f"Deleting volume {volume_name}") - self.delete(volume_name) \ No newline at end of file + self.delete(volume_name) diff --git a/e2e/libs/volume/rest.py 
b/e2e/libs/volume/rest.py index 9443bf9961..f626714cc3 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -6,13 +6,12 @@ from volume.base import Base +from volume.constant import DEV_PATH +from volume.constant import RETRY_COUNTS +from volume.constant import RETRY_INTERVAL +from volume.constant import VOLUME_FRONTEND_BLOCKDEV +from volume.constant import VOLUME_FRONTEND_ISCSI -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 - -VOLUME_FRONTEND_BLOCKDEV = "blockdev" -VOLUME_FRONTEND_ISCSI = "iscsi" -DEV_PATH = "/dev/longhorn/" class Rest(Base): @@ -21,7 +20,12 @@ def __init__(self, node_exec): self.node_exec = node_exec def get(self, volume_name): - return self.longhorn_client.by_id_volume(volume_name) + for i in range(RETRY_COUNTS): + try: + return self.longhorn_client.by_id_volume(volume_name) + except Exception as e: + logging(f"Failed to get volume {e}") + time.sleep(RETRY_INTERVAL) def create(self, volume_name, size, replica_count): return NotImplemented @@ -37,7 +41,7 @@ def wait_for_volume_state(self, volume_name, desired_state): def get_endpoint(self, volume_name): endpoint = "" - v = self.longhorn_client.by_id_volume(volume_name) + v = self.get(volume_name) if v.disableFrontend: assert endpoint == "" return endpoint @@ -45,12 +49,15 @@ def get_endpoint(self, volume_name): assert v.frontend == VOLUME_FRONTEND_BLOCKDEV or\ v.frontend == VOLUME_FRONTEND_ISCSI for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - engines = v.controllers - assert len(engines) != 0 - endpoint = engines[0].endpoint - if endpoint != "": - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + engines = v.controllers + assert len(engines) != 0 + endpoint = engines[0].endpoint + if endpoint != "": + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) logging(f"Got volume {volume_name} endpoint = {endpoint}") @@ -73,60 +80,69 @@ def delete_replica(self, volume_name, node_name): def wait_for_replica_rebuilding_start(self, volume_name, node_name): rebuilding_replica_name = None for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - if replica.hostId == node_name: - rebuilding_replica_name = replica.name + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + if replica.hostId == node_name: + rebuilding_replica_name = replica.name + break + if rebuilding_replica_name: break - if rebuilding_replica_name: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert rebuilding_replica_name != None logging(f"Got rebuilding replica = {rebuilding_replica_name}") started = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume rebuild status = {v.rebuildStatus}") - for status in v.rebuildStatus: - for replica in v.replicas: - if status.replica == replica.name and \ - replica.hostId == node_name and \ - status.state == "in_progress": - logging(f"Started {node_name}'s replica {replica.name} rebuilding") - started = True - break - if started: - break + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume rebuild status = {v.rebuildStatus}") + for status in v.rebuildStatus: + for replica in v.replicas: + if status.replica == replica.name and \ + replica.hostId == node_name and \ + 
status.state == "in_progress": + logging(f"Started {node_name}'s replica {replica.name} rebuilding") + started = True + break + if started: + break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert started, f"wait for replica on node {node_name} rebuilding timeout: {v}" def wait_for_replica_rebuilding_complete(self, volume_name, node_name): completed = False for i in range(RETRY_COUNTS): - v = self.longhorn_client.by_id_volume(volume_name) - logging(f"Got volume {volume_name} replicas = {v.replicas}") - for replica in v.replicas: - # use replica.mode is RW or RO to check if this replica - # has been rebuilt or not - # because rebuildStatus is not reliable - # when the rebuild progress reaches 100% - # it will be removed from rebuildStatus immediately - # and you will just get an empty rebuildStatus [] - # so it's no way to distinguish "rebuilding not started yet" - # or "rebuilding already completed" using rebuildStatus - if replica.hostId == node_name and replica.mode == "RW": - logging(f"Completed {node_name}'s replica {replica.name} rebuilding") - completed = True + try: + v = self.longhorn_client.by_id_volume(volume_name) + logging(f"Got volume {volume_name} replicas = {v.replicas}") + for replica in v.replicas: + # use replica.mode is RW or RO to check if this replica + # has been rebuilt or not + # because rebuildStatus is not reliable + # when the rebuild progress reaches 100% + # it will be removed from rebuildStatus immediately + # and you will just get an empty rebuildStatus [] + # so it's no way to distinguish "rebuilding not started yet" + # or "rebuilding already completed" using rebuildStatus + if replica.hostId == node_name and replica.mode == "RW": + logging(f"Completed {node_name}'s replica {replica.name} rebuilding") + completed = True + break + if completed: break - if completed: - break + except Exception as e: + logging(f"Failed to get volume {e}") time.sleep(RETRY_INTERVAL) assert completed - def check_data(self, volume_name, checksum): + def check_data_checksum(self, volume_name, checksum): return NotImplemented def cleanup(self, volume_names): - return NotImplemented \ No newline at end of file + return NotImplemented diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py index bf88009cd6..d8d81c50aa 100644 --- a/e2e/libs/volume/volume.py +++ b/e2e/libs/volume/volume.py @@ -27,6 +27,9 @@ def create(self, volume_name, size, replica_count): def attach(self, volume_name, node_name): return self.volume.attach(volume_name, node_name) + def detach(self, volume_name): + return self.volume.detach(volume_name) + def delete(self, volume_name): return self.volume.delete(volume_name) @@ -37,10 +40,16 @@ def wait_for_volume_attached(self, volume_name): self.volume.wait_for_volume_state(volume_name, "attached") self.volume.wait_for_volume_robustness_not(volume_name, "unknown") + def wait_for_volume_detached(self, volume_name): + self.volume.wait_for_volume_state(volume_name, "detached") + def wait_for_volume_healthy(self, volume_name): self.volume.wait_for_volume_state(volume_name, "attached") self.volume.wait_for_volume_robustness(volume_name, "healthy") + def wait_for_volume_expand_to_size(self, volume_name, size): + return self.volume.wait_for_volume_expand_to_size(volume_name, size) + def get_endpoint(self, volume_name): return self.volume.get_endpoint(volume_name) @@ -65,8 +74,8 @@ def wait_for_replica_rebuilding_complete(self, volume_name, node_name): node_name ) - def check_data(self, volume_name, checksum): - 
return self.volume.check_data(volume_name, checksum) + def check_data_checksum(self, volume_name, checksum): + return self.volume.check_data_checksum(volume_name, checksum) def cleanup(self, volume_names): - return self.volume.cleanup(volume_names) \ No newline at end of file + return self.volume.cleanup(volume_names) diff --git a/e2e/libs/workload/constant.py b/e2e/libs/workload/constant.py new file mode 100644 index 0000000000..cd7aa90153 --- /dev/null +++ b/e2e/libs/workload/constant.py @@ -0,0 +1,3 @@ +IMAGE_BUSYBOX = 'busybox:1.34.0' +IMAGE_LITMUX = 'litmuschaos/go-runner:latest' +IMAGE_UBUNTU = 'ubuntu:16.04' diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index 84c2d278cf..30f56932ea 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -1,18 +1,20 @@ import time from kubernetes import client +from kubernetes.client import rest from utility.utility import logging from utility.utility import generate_name from utility.utility import get_retry_count_and_interval +from workload.constant import IMAGE_BUSYBOX -IMAGE_BUSYBOX = 'busybox:1.34.0' -IMAGE_LITMUX = 'litmuschaos/go-runner:latest' -IMAGE_UBUNTU = 'ubuntu:16.04' -def new_pod_manifest(image="", command=[], args=[], +def new_pod_manifest(pod_name="", image="", command=[], args=[], claim_name="", node_name="", labels={}): + if pod_name == "": + pod_name = generate_name() + # Set default image and args if image is None: image = IMAGE_BUSYBOX @@ -25,7 +27,7 @@ def new_pod_manifest(image="", command=[], args=[], 'apiVersion': 'v1', 'kind': 'Pod', 'metadata': { - 'name': generate_name(), + 'name': pod_name, 'namespace': 'default', 'labels': labels }, @@ -78,6 +80,7 @@ def new_pod_manifest(image="", command=[], args=[], return manifest + def create_pod(manifest, is_wait_for_pod_running=False): core_api = client.CoreV1Api() @@ -91,12 +94,13 @@ def create_pod(manifest, is_wait_for_pod_running=False): return get_pod(name, namespace=namespace) + def delete_pod(name, namespace='default'): core_api = client.CoreV1Api() try: core_api.delete_namespaced_pod(name=name, namespace=namespace) wait_delete_pod(name) - except ApiException as e: + except rest.ApiException as e: assert e.status == 404 def wait_delete_pod(name, namespace='default'): @@ -114,9 +118,16 @@ def wait_delete_pod(name, namespace='default'): time.sleep(retry_interval) assert not found + def get_pod(name, namespace='default'): - core_api = client.CoreV1Api() - return core_api.read_namespaced_pod(name=name, namespace=namespace) + try: + core_api = client.CoreV1Api() + return core_api.read_namespaced_pod(name=name, namespace=namespace) + except Exception as e: + if e.reason == 'Not Found': + return None + raise e + def wait_for_pod_status(name, status, namespace='default'): retry_count, retry_interval = get_retry_count_and_interval() diff --git a/e2e/libs/workload/workload.py b/e2e/libs/workload/workload.py index 232aa961fd..f484077e45 100644 --- a/e2e/libs/workload/workload.py +++ b/e2e/libs/workload/workload.py @@ -11,6 +11,7 @@ WAIT_FOR_POD_STABLE_MAX_RETRY = 90 + def get_name_suffix(*args): suffix = "" for arg in args: @@ -18,6 +19,7 @@ def get_name_suffix(*args): suffix += f"-{arg}" return suffix + def create_storageclass(name): if name == 'longhorn-test-strict-local': filepath = "./templates/workload/strict_local_storageclass.yaml" @@ -30,6 +32,7 @@ def create_storageclass(name): api = client.StorageV1Api() api.create_storage_class(body=manifest_dict) + def delete_storageclass(name): api = client.StorageV1Api() try: @@ -37,6 +40,7 @@ def 
delete_storageclass(name): except ApiException as e: assert e.status == 404 + def create_deployment(volume_type, option): filepath = f"./templates/workload/deployment.yaml" with open(filepath, 'r') as f: @@ -74,6 +78,7 @@ def create_deployment(volume_type, option): return deployment_name + def delete_deployment(name, namespace='default'): api = client.AppsV1Api() @@ -98,6 +103,7 @@ def delete_deployment(name, namespace='default'): time.sleep(retry_interval) assert deleted + def create_statefulset(volume_type, option): filepath = "./templates/workload/statefulset.yaml" with open(filepath, 'r') as f: @@ -124,21 +130,30 @@ def create_statefulset(volume_type, option): statefulset_name = statefulset.metadata.name replicas = statefulset.spec.replicas - retry_count, retry_interval = get_retry_count_and_interval() - for i in range(retry_count): - statefulset = api.read_namespaced_stateful_set( - name=statefulset_name, - namespace=namespace) - # statefulset is none if statefulset is not yet created - if statefulset is not None and \ - statefulset.status.ready_replicas == replicas: - break - time.sleep(retry_interval) - - assert statefulset.status.ready_replicas == replicas + wait_for_statefulset_replicas_ready(statefulset_name, replicas) return statefulset_name + +def wait_for_statefulset_replicas_ready(statefulset_name, expected_ready_count, namespace='default'): + apps_v1_api = client.AppsV1Api() + + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + logging(f"Waiting for statefulset {statefulset_name} replica ready ({i}) ...") + + statefulset = apps_v1_api.read_namespaced_stateful_set( + name=statefulset_name, + namespace=namespace) + # statefulset is none if statefulset is not yet created + if statefulset is not None and \ + statefulset.status.ready_replicas == expected_ready_count: + break + time.sleep(retry_interval) + + assert statefulset.status.ready_replicas == expected_ready_count + + def delete_statefulset(name, namespace='default'): api = client.AppsV1Api() @@ -163,6 +178,27 @@ def delete_statefulset(name, namespace='default'): time.sleep(retry_interval) assert deleted + +def get_statefulset(name, namespace='default'): + api = client.AppsV1Api() + return api.read_namespaced_stateful_set(name=name, namespace=namespace) + + +def scale_statefulset(name, replica_count, namespace='default'): + logging(f"Scaling statefulset {name} to {replica_count}") + + apps_v1_api = client.AppsV1Api() + + scale = client.V1Scale( + metadata=client.V1ObjectMeta(name=name, namespace=namespace), + spec=client.V1ScaleSpec(replicas=int(replica_count)) + ) + apps_v1_api.patch_namespaced_stateful_set_scale(name=name, namespace=namespace, body=scale) + + statefulset = get_statefulset(name, namespace) + assert statefulset.spec.replicas == int(replica_count) + + def create_pvc(volume_type, option): filepath = "./templates/workload/pvc.yaml" with open(filepath, 'r') as f: @@ -185,6 +221,7 @@ def create_pvc(volume_type, option): return pvc.metadata.name + def delete_pvc(name, namespace='default'): api = client.CoreV1Api() try: @@ -208,6 +245,7 @@ def delete_pvc(name, namespace='default'): time.sleep(retry_interval) assert deleted + def get_workload_pod_names(workload_name): api = client.CoreV1Api() label_selector = f"app={workload_name}" @@ -219,6 +257,7 @@ def get_workload_pod_names(workload_name): pod_names.append(pod.metadata.name) return pod_names + def get_workload_pods(workload_name): api = client.CoreV1Api() label_selector = f"app={workload_name}" @@ -227,6 +266,7 @@ 
def get_workload_pods(workload_name): label_selector=label_selector) return resp.items + def get_workload_volume_name(workload_name): api = client.CoreV1Api() pvc_name = get_workload_pvc_name(workload_name) @@ -234,6 +274,7 @@ def get_workload_volume_name(workload_name): name=pvc_name, namespace='default') return pvc.spec.volume_name + def get_workload_pvc_name(workload_name): api = client.CoreV1Api() pod = get_workload_pods(workload_name)[0] @@ -245,6 +286,7 @@ def get_workload_pvc_name(workload_name): assert pvc_name return pvc_name + def write_pod_random_data(pod_name, size_in_mb, path="/data/random-data"): api = client.CoreV1Api() write_cmd = [ @@ -258,6 +300,7 @@ def write_pod_random_data(pod_name, size_in_mb, path="/data/random-data"): command=write_cmd, stderr=True, stdin=False, stdout=True, tty=False) + def keep_writing_pod_data(pod_name, size_in_mb=256, path="/data/overwritten-data"): api = client.CoreV1Api() write_cmd = [ @@ -273,7 +316,9 @@ def keep_writing_pod_data(pod_name, size_in_mb=256, path="/data/overwritten-data logging(f"Created process to keep writing pod {pod_name}") return res -def check_pod_data(pod_name, checksum, path="/data/random-data"): + +def check_pod_data_checksum(pod_name, checksum, path="/data/random-data"): + logging(f"Checking pod {pod_name} data checksum") api = client.CoreV1Api() cmd = [ '/bin/sh', @@ -284,9 +329,10 @@ def check_pod_data(pod_name, checksum, path="/data/random-data"): api.connect_get_namespaced_pod_exec, pod_name, 'default', command=cmd, stderr=True, stdin=False, stdout=True, tty=False) - logging(f"Got {path} checksum = {_checksum},\ - expected checksum = {checksum}") - assert _checksum == checksum + assert _checksum == checksum, \ + f"Got {path} checksum = {_checksum}\n" \ + f"Expected checksum = {checksum}" + def wait_for_workload_pod_stable(workload_name): stable_pod = None @@ -307,4 +353,4 @@ def wait_for_workload_pod_stable(workload_name): if wait_for_stable_retry == WAIT_FOR_POD_STABLE_MAX_RETRY: return stable_pod time.sleep(retry_interval) - assert False \ No newline at end of file + assert False diff --git a/e2e/templates/workload/pvc.yaml b/e2e/templates/workload/pvc.yaml index 8671b755b1..f42eef2b2c 100644 --- a/e2e/templates/workload/pvc.yaml +++ b/e2e/templates/workload/pvc.yaml @@ -9,4 +9,4 @@ spec: storageClassName: longhorn-test resources: requests: - storage: 3Gi \ No newline at end of file + storage: 3221225472 #3Gi diff --git a/e2e/templates/workload/statefulset.yaml b/e2e/templates/workload/statefulset.yaml index 4688d76f48..71d36a5aad 100644 --- a/e2e/templates/workload/statefulset.yaml +++ b/e2e/templates/workload/statefulset.yaml @@ -36,4 +36,4 @@ spec: storageClassName: longhorn-test resources: requests: - storage: 3Gi + storage: 3221225472 #3Gi diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index 6664b4d6eb..830b674d5b 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -36,27 +36,3 @@ Reboot Replica Node While Replica Rebuilding Then Wait until replica on replica node rebuilt And Check data is intact END - -Stress Volume Node CPU While Replica Rebuilding - Given Create a volume with 5 GB and 3 replicas - And Write data to the volume - - FOR ${i} IN RANGE ${LOOP_COUNT} - When Delete replica on volume node to trigger replica rebuilding - And During replica rebuilding, stress volume node cpu - - Then Wait until replica on volume node rebuilt - And Check data is intact - END - -Stress Volume Node Memory While Replica Rebuilding - Given 
Create a volume with 5 GB and 3 replicas - And Write data to the volume - - FOR ${i} IN RANGE ${LOOP_COUNT} - When Delete replica on volume node to trigger replica rebuilding - And During replica rebuilding, stress volume node memory - - Then Wait until replica on volume node rebuilt - And Check data is intact - END diff --git a/e2e/tests/stress_cpu.robot b/e2e/tests/stress_cpu.robot new file mode 100644 index 0000000000..b7f7998693 --- /dev/null +++ b/e2e/tests/stress_cpu.robot @@ -0,0 +1,76 @@ +*** Settings *** +Documentation Negative Test Cases +Resource ../keywords/stress.resource +Resource ../keywords/volume.resource +Resource ../keywords/workload.resource +Resource ../keywords/common.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** + +Stress Volume Node CPU When Replica Is Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And Stress the CPU of all volume nodes + + Then Wait until replica on volume node rebuilt + And Check data is intact + END + +Stress Volume Node CPU When Volume Is Detaching and Attaching + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Stress the CPU of all volume nodes + + And Detach volume from node + And Attach volume to node + + And Check data is intact + END + +Stress Volume Node CPU When Volume Is Online Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Stress the CPU of all volume nodes + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Check statefulset 0 data is intact + END + +Stress Volume Node CPU When Volume Is Offline Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Scale down statefulset 0 to detach volume + And Stress the CPU of all worker nodes + + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Wait for statefulset 0 volume detached + + And Scale up statefulset 0 to attach volume + And Check statefulset 0 data is intact + END diff --git a/e2e/tests/stress_memory.robot b/e2e/tests/stress_memory.robot new file mode 100644 index 0000000000..8054b967a1 --- /dev/null +++ b/e2e/tests/stress_memory.robot @@ -0,0 +1,76 @@ +*** Settings *** +Documentation Negative Test Cases +Resource ../keywords/stress.resource +Resource ../keywords/volume.resource +Resource ../keywords/workload.resource +Resource ../keywords/common.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** + +Stress Volume Node Memory When Replica Is Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And Stress the memory of all volume nodes + + Then Wait until replica on volume node rebuilt + And Check data is intact + END + +Stress 
Volume Node Memory When Volume Is Detaching and Attaching + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Stress the memory of all volume nodes + + And Detach volume from node + And Attach volume to node + + And Check data is intact + END + +Stress Volume Node Memory When Volume Is Online Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Stress the memory of all volume nodes + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Check statefulset 0 data is intact + END + +Stress Volume Node Memory When Volume Is Offline Expanding + @{data_checksum_list} = Create List + Set Test Variable ${data_checksum_list} + + Given Create statefulset 0 with rwo volume + And Write 1024 MB data to statefulset 0 + + FOR ${i} IN RANGE ${LOOP_COUNT} + And Scale down statefulset 0 to detach volume + And Stress the memory of all worker nodes + + When Expand statefulset 0 volume by 100 MiB + + Then Wait for statefulset 0 volume size expanded + And Wait for statefulset 0 volume detached + + And Scale up statefulset 0 to attach volume + And Check statefulset 0 data is intact + END diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 6f3c45b7f3..8541f5edd9 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -42,5 +42,6 @@ ADD pipelines/helm/scripts/upgrade-longhorn.sh ./pipelines/helm/scripts/upgrade- ADD pipelines/rancher/scripts/upgrade-longhorn.sh ./pipelines/rancher/scripts/upgrade-longhorn.sh ADD pipelines/flux/scripts/upgrade-longhorn.sh ./pipelines/flux/scripts/upgrade-longhorn.sh ADD pipelines/argocd/scripts/upgrade-longhorn.sh ./pipelines/argocd/scripts/upgrade-longhorn.sh +ADD pipelines/fleet/scripts/upgrade-longhorn.sh ./pipelines/fleet/scripts/upgrade-longhorn.sh ENTRYPOINT ["./run.sh"] diff --git a/manager/integration/deploy/test.yaml b/manager/integration/deploy/test.yaml index 7ab9b22f73..eb3a44c8f5 100644 --- a/manager/integration/deploy/test.yaml +++ b/manager/integration/deploy/test.yaml @@ -55,6 +55,8 @@ spec: fieldPath: spec.nodeName - name: MANAGED_K8S_CLUSTER value: "false" + - name: RESOURCE_SUFFIX + value: "" volumeMounts: - name: dev mountPath: /dev diff --git a/manager/integration/pytest.ini b/manager/integration/pytest.ini index a19f7de1a8..d7622d68c0 100644 --- a/manager/integration/pytest.ini +++ b/manager/integration/pytest.ini @@ -17,3 +17,4 @@ markers = system_backup_restore cluster_autoscaler long_running + volume_backup_restore diff --git a/manager/integration/tests/aws.py b/manager/integration/tests/aws.py index 7614ad0c49..0328520567 100644 --- a/manager/integration/tests/aws.py +++ b/manager/integration/tests/aws.py @@ -32,12 +32,17 @@ def __init__(self): region_name=default_region) def instance_id_by_ip(self, instance_ip): + resource_suffix = os.getenv("RESOURCE_SUFFIX") response = aws.ec2_client.describe_instances( Filters=[ { 'Name': 'private-ip-address', 'Values': [instance_ip] }, + { + 'Name': 'tag:Name', + 'Values': [f"*{resource_suffix}*"] + } ], ) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index a5a463331e..28503ae13e 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -66,6 +66,7 @@ RETRY_COUNTS_SHORT = 30 RETRY_COUNTS_LONG = 360 
RETRY_INTERVAL = 1 +RETRY_INTERVAL_SHORT = 0.5 RETRY_INTERVAL_LONG = 2 RETRY_BACKUP_COUNTS = 300 RETRY_BACKUP_INTERVAL = 1 @@ -81,6 +82,7 @@ UPGRADE_TEST_IMAGE_PREFIX = "longhornio/longhorn-test:upgrade-test" ISCSI_DEV_PATH = "/dev/disk/by-path" +ISCSI_PROCESS = "iscsid" VOLUME_FIELD_STATE = "state" VOLUME_STATE_ATTACHED = "attached" @@ -112,6 +114,7 @@ DEFAULT_POD_INTERVAL = 1 DEFAULT_POD_TIMEOUT = 180 +POD_DELETION_TIMEOUT = 600 DEFAULT_STATEFULSET_INTERVAL = 1 DEFAULT_STATEFULSET_TIMEOUT = 180 @@ -120,11 +123,10 @@ DEFAULT_DEPLOYMENT_TIMEOUT = 240 WAIT_FOR_POD_STABLE_MAX_RETRY = 90 - DEFAULT_VOLUME_SIZE = 3 # In Gi EXPANDED_VOLUME_SIZE = 4 # In Gi -DIRECTORY_PATH = '/tmp/longhorn-test/' +DIRECTORY_PATH = '/var/lib/longhorn/longhorn-test/' VOLUME_CONDITION_SCHEDULED = "Scheduled" VOLUME_CONDITION_RESTORE = "Restore" @@ -167,6 +169,8 @@ "allow-volume-creation-with-degraded-availability" SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE = \ "disable-scheduling-on-cordoned-node" +SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED = \ + "detach-manually-attached-volumes-when-cordoned" SETTING_GUARANTEED_INSTANCE_MANAGER_CPU = "guaranteed-instance-manager-cpu" SETTING_PRIORITY_CLASS = "priority-class" SETTING_RECURRING_JOB_WHILE_VOLUME_DETACHED = \ @@ -207,6 +211,11 @@ SETTING_BACKUP_COMPRESSION_METHOD = "backup-compression-method" SETTING_BACKUP_CONCURRENT_LIMIT = "backup-concurrent-limit" SETTING_RESTORE_CONCURRENT_LIMIT = "restore-concurrent-limit" +SETTING_V1_DATA_ENGINE = "v1-data-engine" +SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME = \ + "allow-empty-node-selector-volume" +SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY = "replica-disk-soft-anti-affinity" +SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME = "allow-empty-disk-selector-volume" DEFAULT_BACKUP_COMPRESSION_METHOD = "lz4" BACKUP_COMPRESSION_METHOD_LZ4 = "lz4" @@ -282,10 +291,15 @@ FS_TYPE_EXT4 = "ext4" FS_TYPE_XFS = "xfs" +ACCESS_MODE_RWO = "rwo" +ACCESS_MODE_RWX = "rwx" + ATTACHER_TYPE_CSI_ATTACHER = "csi-attacher" ATTACHER_TYPE_LONGHORN_API = "longhorn-api" ATTACHER_TYPE_LONGHORN_UPGRADER = "longhorn-upgrader" +HOST_PROC_DIR = "/host/proc" + # customize the timeout for HDD disktype = os.environ.get('LONGHORN_DISK_TYPE') if disktype == "hdd": @@ -395,9 +409,9 @@ def create_volume_and_backup(client, vol_name, vol_size, backup_data_size): client.create_volume(name=vol_name, numberOfReplicas=1, size=str(vol_size)) - backup_volume = wait_for_volume_detached(client, vol_name) - backup_volume.attach(hostId=get_self_host_id()) - backup_volume = wait_for_volume_healthy(client, vol_name) + volume = wait_for_volume_detached(client, vol_name) + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, vol_name) data = {'pos': 0, 'len': backup_data_size, @@ -405,7 +419,7 @@ def create_volume_and_backup(client, vol_name, vol_size, backup_data_size): _, backup, _, _ = create_backup(client, vol_name, data) - return backup_volume, backup + return volume, backup def create_backup(client, volname, data={}, labels={}): @@ -486,7 +500,8 @@ def delete_backup_volume(client, volume_name): def create_and_check_volume(client, volume_name, num_of_replicas=3, size=SIZE, backing_image="", frontend=VOLUME_FRONTEND_BLOCKDEV, - snapshot_data_integrity=SNAPSHOT_DATA_INTEGRITY_IGNORED): # NOQA + snapshot_data_integrity=SNAPSHOT_DATA_INTEGRITY_IGNORED, # NOQA + access_mode=ACCESS_MODE_RWO): """ Create a new volume with the specified parameters. Assert that the new volume is detached and that all of the requested parameters match. 
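The signature change above gives create_and_check_volume an access_mode keyword that defaults to ACCESS_MODE_RWO, so existing call sites keep their current behavior while RWX tests can opt in explicitly. The following is a minimal usage sketch under that assumption; the wrapper function and volume name are placeholders for illustration and are not part of this patch.

```python
# Illustrative sketch only: create a ReadWriteMany test volume through the
# updated helper. Assumes it runs in the integration-test environment where
# the `client` fixture and common.py are available.
from common import create_and_check_volume, ACCESS_MODE_RWX, Gi


def make_shared_volume(client, name="demo-rwx-vol"):
    # access_mode is forwarded to client.create_volume(accessMode=...) in the
    # next hunk, so the volume is created as RWX instead of the RWO default.
    return create_and_check_volume(client, name,
                                   num_of_replicas=3,
                                   size=str(1 * Gi),
                                   access_mode=ACCESS_MODE_RWX)
```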
@@ -505,7 +520,8 @@ def create_and_check_volume(client, volume_name, client.create_volume(name=volume_name, size=size, numberOfReplicas=num_of_replicas, backingImage=backing_image, frontend=frontend, - snapshotDataIntegrity=snapshot_data_integrity) + snapshotDataIntegrity=snapshot_data_integrity, + accessMode=access_mode) volume = wait_for_volume_detached(client, volume_name) assert volume.name == volume_name assert volume.size == size @@ -523,11 +539,14 @@ def wait_pod(pod_name): pod = None for i in range(DEFAULT_POD_TIMEOUT): - pod = api.read_namespaced_pod( - name=pod_name, - namespace='default') - if pod is not None and pod.status.phase != 'Pending': - break + try: + pod = api.read_namespaced_pod( + name=pod_name, + namespace='default') + if pod is not None and pod.status.phase != 'Pending': + break + except Exception as e: + print(f"Waiting for pod {pod_name} failed: {e}") time.sleep(DEFAULT_POD_INTERVAL) assert pod is not None and pod.status.phase == 'Running' @@ -927,7 +946,7 @@ def size_to_string(volume_size): def wait_delete_pod(api, pod_uid, namespace='default'): - for i in range(DEFAULT_POD_TIMEOUT): + for i in range(POD_DELETION_TIMEOUT): ret = api.list_namespaced_pod(namespace=namespace) found = False for item in ret.items: @@ -1536,31 +1555,33 @@ def finalizer(): @pytest.fixture def crypto_secret(request): - manifest = { - 'apiVersion': 'v1', - 'kind': 'Secret', - 'metadata': { - 'name': 'longhorn-crypto', - 'namespace': 'longhorn-system', - }, - 'stringData': { - 'CRYPTO_KEY_VALUE': 'simple', - 'CRYPTO_KEY_PROVIDER': 'secret' + def get_crypto_secret(namespace=LONGHORN_NAMESPACE): + crypto_secret.manifest = { + 'apiVersion': 'v1', + 'kind': 'Secret', + 'metadata': { + 'name': 'longhorn-crypto', + 'namespace': namespace, + }, + 'stringData': { + 'CRYPTO_KEY_VALUE': 'simple', + 'CRYPTO_KEY_PROVIDER': 'secret' + } } - } + return crypto_secret.manifest def finalizer(): api = get_core_api_client() try: api.delete_namespaced_secret( - name=manifest['metadata']['name'], - namespace=manifest['metadata']['namespace']) + name=crypto_secret.manifest['metadata']['name'], + namespace=crypto_secret.manifest['metadata']['namespace']) except ApiException as e: assert e.status == 404 request.addfinalizer(finalizer) - return manifest + return get_crypto_secret @pytest.fixture @@ -1677,6 +1698,14 @@ def client(request): request.addfinalizer(lambda: cleanup_client()) + if not os.path.exists(DIRECTORY_PATH): + try: + os.makedirs(DIRECTORY_PATH) + except OSError as e: + raise Exception( + f"Failed to create directory {DIRECTORY_PATH}: {e}" + ) + cleanup_client() return client @@ -1766,12 +1795,11 @@ def get_mgr_ips(): def get_self_host_id(): - envs = os.environ - return envs["NODE_NAME"] + return os.environ.get("NODE_NAME") def get_backupstore_url(): - backupstore = os.environ['LONGHORN_BACKUPSTORES'] + backupstore = os.environ.get("LONGHORN_BACKUPSTORES", "") backupstore = backupstore.replace(" ", "") backupstores = backupstore.split(",") @@ -1780,18 +1808,13 @@ def get_backupstore_url(): def get_backupstore_poll_interval(): - poll_interval = os.environ['LONGHORN_BACKUPSTORE_POLL_INTERVAL'] + poll_interval = os.environ.get("LONGHORN_BACKUPSTORE_POLL_INTERVAL", "") assert len(poll_interval) != 0 return poll_interval def get_backupstores(): - # The try is added to avoid the pdoc3 error while publishing this on - # https://longhorn.github.io/longhorn-tests - try: - backupstore = os.environ['LONGHORN_BACKUPSTORES'] - except KeyError: - return [] + backupstore = os.environ.get("LONGHORN_BACKUPSTORES", 
"") try: backupstore = backupstore.replace(" ", "") @@ -2163,7 +2186,7 @@ def wait_for_engine_image_creation(client, image_name): break if found: break - time.sleep(RETRY_INTERVAL) + time.sleep(RETRY_INTERVAL_SHORT) assert found @@ -2178,16 +2201,39 @@ def wait_for_engine_image_state(client, image_name, state): return image +def wait_for_engine_image_incompatible(client, image_name): + wait_for_engine_image_creation(client, image_name) + for i in range(RETRY_COUNTS): + image = client.by_id_engine_image(image_name) + if image.incompatible: + break + time.sleep(RETRY_INTERVAL) + assert image.incompatible + return image + + def wait_for_engine_image_condition(client, image_name, state): """ state: "True", "False" """ + # Indicate many times we want to see the ENGINE_NAME in the STATE. + # This helps to prevent the flaky test case in which the ENGINE_NAME + # is flapping between ready and not ready a few times before settling + # down to the ready state + # https://github.com/longhorn/longhorn-tests/pull/1638 + state_count = 1 + if state == "True": + state_count = 5 + + c = 0 for i in range(RETRY_COUNTS): wait_for_engine_image_creation(client, image_name) image = client.by_id_engine_image(image_name) if image['conditions'][0]['status'] == state: - break - time.sleep(RETRY_INTERVAL_LONG) + c += 1 + if c >= state_count: + break + time.sleep(RETRY_INTERVAL_SHORT) assert image['conditions'][0]['status'] == state return image @@ -2307,13 +2353,18 @@ def crash_replica_processes(client, api, volname, replicas=None, for r in replicas: assert r.instanceManagerName != "" - kill_command = "kill `pgrep -f " + r['dataPath'] + "`" + + pgrep_command = f"pgrep -f {r['dataPath']}" + pid = exec_instance_manager(api, r.instanceManagerName, pgrep_command) + assert pid != "" + + kill_command = f"kill {pid}" exec_instance_manager(api, r.instanceManagerName, kill_command) if wait_to_fail is True: thread = create_assert_error_check_thread( wait_for_replica_failed, - client, volname, r['name'], RETRY_COUNTS*2, RETRY_INTERVAL/2 + client, volname, r['name'], RETRY_COUNTS, RETRY_INTERVAL_SHORT ) threads.append(thread) @@ -2326,10 +2377,11 @@ def exec_instance_manager(api, im_name, cmd): with timeout(seconds=STREAM_EXEC_TIMEOUT, error_message='Timeout on executing stream read'): - stream(api.connect_get_namespaced_pod_exec, - im_name, - LONGHORN_NAMESPACE, command=exec_cmd, - stderr=True, stdin=False, stdout=True, tty=False) + output = stream(api.connect_get_namespaced_pod_exec, + im_name, + LONGHORN_NAMESPACE, command=exec_cmd, + stderr=True, stdin=False, stdout=True, tty=False) + return output def wait_for_replica_failed(client, volname, replica_name, @@ -2639,11 +2691,22 @@ def get_iscsi_lun(iscsi): return iscsi_endpoint[2] -def exec_nsenter(cmd): - dockerd_pid = find_dockerd_pid() or "1" - exec_cmd = ["nsenter", "--mount=/host/proc/{}/ns/mnt".format(dockerd_pid), - "--net=/host/proc/{}/ns/net".format(dockerd_pid), - "bash", "-c", cmd] +def exec_nsenter(cmd, process_name=None): + if process_name: + proc_pid = find_process_pid(process_name) + cmd_parts = cmd.split() + else: + proc_pid = find_dockerd_pid() or "1" + cmd_parts = ["bash", "-c", cmd] + + exec_cmd = ["nsenter", "--mount=/host/proc/{}/ns/mnt".format(proc_pid), + "--net=/host/proc/{}/ns/net".format(proc_pid)] + exec_cmd.extend(cmd_parts) + return subprocess.check_output(exec_cmd) + + +def exec_local(cmd): + exec_cmd = cmd.split() return subprocess.check_output(exec_cmd) @@ -2654,10 +2717,10 @@ def iscsi_login(iscsi_ep): lun = get_iscsi_lun(iscsi_ep) # discovery 
cmd_discovery = "iscsiadm -m discovery -t st -p " + ip - exec_nsenter(cmd_discovery) + exec_nsenter(cmd_discovery, ISCSI_PROCESS) # login cmd_login = "iscsiadm -m node -T " + target + " -p " + ip + " --login" - exec_nsenter(cmd_login) + exec_nsenter(cmd_login, ISCSI_PROCESS) blk_name = "ip-%s:%s-iscsi-%s-lun-%s" % (ip, port, target, lun) wait_for_device_login(ISCSI_DEV_PATH, blk_name) dev = os.path.realpath(ISCSI_DEV_PATH + "/" + blk_name) @@ -2668,9 +2731,9 @@ def iscsi_logout(iscsi_ep): ip = get_iscsi_ip(iscsi_ep) target = get_iscsi_target(iscsi_ep) cmd_logout = "iscsiadm -m node -T " + target + " -p " + ip + " --logout" - exec_nsenter(cmd_logout) + exec_nsenter(cmd_logout, ISCSI_PROCESS) cmd_rm_discovery = "iscsiadm -m discovery -p " + ip + " -o delete" - exec_nsenter(cmd_rm_discovery) + exec_nsenter(cmd_rm_discovery, ISCSI_PROCESS) def get_process_info(p_path): @@ -2707,6 +2770,40 @@ def find_dockerd_pid(): return find_ancestor_process_by_name("dockerd") +def find_process_pid(process_name): + for file in os.listdir(HOST_PROC_DIR): + if not os.path.isdir(os.path.join(HOST_PROC_DIR, file)): + continue + + # Check if file name is an integer + if not file.isdigit(): + continue + + with open(os.path.join(HOST_PROC_DIR, file, 'status'), 'r') as file: + status_content = file.readlines() + + proc_status_content = None + name_pattern = re.compile(r'^Name:\s+(.+)$') + + for line in status_content: + name_match = name_pattern.match(line) + if name_match and name_match.group(1) == process_name: + proc_status_content = status_content + break + + if proc_status_content is None: + continue + + pid_pattern = re.compile(r'^Pid:\s+(\d+)$') + + for line in proc_status_content: + pid_match = pid_pattern.match(line) + if pid_match: + return int(pid_match.group(1)) + + raise Exception(f"Failed to find the {process_name} PID") + + def generate_random_pos(size, used={}): for i in range(RETRY_COUNTS): pos = 0 @@ -3271,7 +3368,11 @@ def get_k8s_zone_label(): def cleanup_test_disks(client): - del_dirs = os.listdir(DIRECTORY_PATH) + try: + del_dirs = os.listdir(DIRECTORY_PATH) + except FileNotFoundError: + del_dirs = [] + host_id = get_self_host_id() node = client.by_id_node(host_id) disks = node.disks @@ -3399,6 +3500,13 @@ def reset_settings(client): # resetting this to an empty default value. if setting_name == "storage-network": continue + # The test CI deploys Longhorn with the setting value longhorn-critical + # for the setting priority-class. Don't reset it to empty (which is + # the default value defined in longhorn-manager code) because this will + # restart Longhorn managed components and fail the test cases. + # https://github.com/longhorn/longhorn/issues/7413#issuecomment-1881707958 + if setting.name == SETTING_PRIORITY_CLASS: + continue # The version of the support bundle kit will be specified by a command # option when starting the manager. And setting requires a value. 
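For context on the namespace-entry changes in this file: exec_nsenter can now target the mount and network namespaces of an arbitrary host process located by find_process_pid, instead of always using dockerd. The sketch below is illustrative only and not part of the patch; it assumes it runs inside the test pod, where /host/proc is mounted (the pre-existing dockerd-based path already relies on that mount).

```python
# Illustrative sketch: query iSCSI sessions from inside the host iscsid
# process's namespaces, mirroring how iscsi_login/iscsi_logout now call
# exec_nsenter with a process name.
from common import exec_nsenter, find_process_pid, ISCSI_PROCESS

# find_process_pid scans /host/proc/<pid>/status for a matching "Name:" field
# and returns the host PID of iscsid.
iscsid_pid = find_process_pid(ISCSI_PROCESS)
print(f"host iscsid PID: {iscsid_pid}")

# exec_nsenter builds roughly:
#   nsenter --mount=/host/proc/<pid>/ns/mnt --net=/host/proc/<pid>/ns/net \
#           iscsiadm -m session
sessions = exec_nsenter("iscsiadm -m session", ISCSI_PROCESS)
print(sessions.decode())
```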
@@ -3533,7 +3641,7 @@ def wait_for_all_instance_manager_running(client): node_to_instance_manager_map = {} try: for im in instance_managers: - if im.managerType == "aio" and im.currentState == "running": + if im.managerType == "aio": node_to_instance_manager_map[im.nodeID] = im else: print("\nFound unknown instance manager:", im) @@ -3651,7 +3759,7 @@ def find_backup(client, vol_name, snap_name): def find_backup_volume(): bvs = client.list_backupVolume() for bv in bvs: - if bv.name == vol_name: + if bv.name == vol_name and bv.created != "": return bv return None @@ -3828,9 +3936,9 @@ def wait_statefulset(statefulset_manifest): assert s_set.status.ready_replicas == replicas -def create_crypto_secret(secret_manifest): +def create_crypto_secret(secret_manifest, namespace=LONGHORN_NAMESPACE): api = get_core_api_client() - api.create_namespaced_secret(namespace=LONGHORN_NAMESPACE, + api.create_namespaced_secret(namespace, body=secret_manifest) @@ -4063,17 +4171,17 @@ def create_pv_for_volume(client, core_api, volume, pv_name, fs_type="ext4"): wait_volume_kubernetes_status(client, volume.name, ks) -def create_pvc_for_volume(client, core_api, volume, pvc_name): - volume.pvcCreate(namespace="default", pvcName=pvc_name) +def create_pvc_for_volume(client, core_api, volume, pvc_name, pvc_namespace="default"): # NOQA + volume.pvcCreate(namespace=pvc_namespace, pvcName=pvc_name) for i in range(RETRY_COUNTS): - if check_pvc_existence(core_api, pvc_name): + if check_pvc_existence(core_api, pvc_name, pvc_namespace): break time.sleep(RETRY_INTERVAL) - assert check_pvc_existence(core_api, pvc_name) + assert check_pvc_existence(core_api, pvc_name, pvc_namespace) ks = { 'pvStatus': 'Bound', - 'namespace': 'default', + 'namespace': pvc_namespace, 'lastPVCRefAt': '', } wait_volume_kubernetes_status(client, volume.name, ks) @@ -4888,7 +4996,8 @@ def prepare_statefulset_with_data_in_mb( def prepare_pod_with_data_in_mb( client, core_api, csi_pv, pvc, pod_make, volume_name, volume_size=str(1*Gi), num_of_replicas=3, data_path="/data/test", - data_size_in_mb=DATA_SIZE_IN_MB_1, add_liveness_probe=True):# NOQA: + data_size_in_mb=DATA_SIZE_IN_MB_1, add_liveness_probe=True, + access_mode=ACCESS_MODE_RWO):# NOQA: pod_name = volume_name + "-pod" pv_name = volume_name @@ -4913,7 +5022,8 @@ def prepare_pod_with_data_in_mb( create_and_check_volume(client, volume_name, num_of_replicas=num_of_replicas, - size=volume_size) + size=volume_size, + access_mode=access_mode) core_api.create_persistent_volume(csi_pv) core_api.create_namespaced_persistent_volume_claim( body=pvc, namespace='default') @@ -4976,11 +5086,14 @@ def wait_for_pod_restart(core_api, pod_name, namespace="default"): def wait_for_pod_phase(core_api, pod_name, pod_phase, namespace="default"): is_phase = False for _ in range(RETRY_COUNTS): - pod = core_api.read_namespaced_pod(name=pod_name, - namespace=namespace) - if pod.status.phase == pod_phase: - is_phase = True - break + try: + pod = core_api.read_namespaced_pod(name=pod_name, + namespace=namespace) + if pod.status.phase == pod_phase: + is_phase = True + break + except Exception as e: + print(f"Waiting for pod {pod_name} {pod_phase} failed: {e}") time.sleep(RETRY_INTERVAL_LONG) assert is_phase @@ -5999,3 +6112,13 @@ def create_volume_and_write_data(client, volume_name, volume_size=SIZE): volume_data = write_volume_random_data(volume) return volume, volume_data + + +def wait_for_instance_manager_count(client, number, retry_counts=120): + for _ in range(retry_counts): + ims = client.list_instance_manager() + if 
len(ims) == number: + break + time.sleep(RETRY_INTERVAL_LONG) + + return len(ims) diff --git a/manager/integration/tests/requirements.txt b/manager/integration/tests/requirements.txt index e7e5e58a6f..51fdbdb828 100644 --- a/manager/integration/tests/requirements.txt +++ b/manager/integration/tests/requirements.txt @@ -11,3 +11,4 @@ six==1.12.0 minio==5.0.10 pyyaml==5.4.1 pandas +prometheus_client diff --git a/manager/integration/tests/run.sh b/manager/integration/tests/run.sh index e37745dd24..8eb3b99115 100755 --- a/manager/integration/tests/run.sh +++ b/manager/integration/tests/run.sh @@ -1,3 +1,4 @@ #!/bin/bash +export PYTHONUNBUFFERED=1 pytest -v "$@" diff --git a/manager/integration/tests/test_backing_image.py b/manager/integration/tests/test_backing_image.py index 2c2285d253..5fac7c272b 100644 --- a/manager/integration/tests/test_backing_image.py +++ b/manager/integration/tests/test_backing_image.py @@ -38,10 +38,14 @@ from common import wait_for_backing_image_status from common import wait_for_backing_image_in_disk_fail from common import get_disk_uuid +from common import write_volume_dev_random_mb_data, get_device_checksum +from common import check_backing_image_disk_map_status from common import LONGHORN_NAMESPACE, RETRY_EXEC_COUNTS, RETRY_INTERVAL from common import BACKING_IMAGE_QCOW2_CHECKSUM from common import BACKING_IMAGE_STATE_READY from common import BACKING_IMAGE_STATE_FAILED_AND_CLEANUP +from common import BACKING_IMAGE_STATE_IN_PROGRESS +from common import RETRY_COUNTS_LONG import time @@ -73,9 +77,10 @@ def backing_image_basic_operation_test(client, volume_name, bi_name, bi_url): # 8. Delete the backing image. """ - volume = create_and_check_volume( - client, volume_name, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) lht_host_id = get_self_host_id() volume.attach(hostId=lht_host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -140,9 +145,10 @@ def backing_image_content_test(client, volume_name_prefix, bi_name, bi_url): # lht_host_id = get_self_host_id() volume_name1 = volume_name_prefix + "-1" - volume1 = create_and_check_volume( - client, volume_name1, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume1 = create_and_check_volume(client, volume_name1, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) volume1.attach(hostId=lht_host_id) volume1 = wait_for_volume_healthy(client, volume_name1) assert volume1.backingImage == bi_name @@ -172,9 +178,10 @@ def backing_image_content_test(client, volume_name_prefix, bi_name, bi_url): # check_volume_data(volume1, data) volume_name2 = volume_name_prefix + "-2" - volume2 = create_and_check_volume( - client, volume_name2, 3, - str(BACKING_IMAGE_EXT4_SIZE), bi_name) + volume2 = create_and_check_volume(client, volume_name2, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=bi_name) volume2.attach(hostId=lht_host_id) volume2 = wait_for_volume_healthy(client, volume_name2) assert volume1.backingImage == bi_name @@ -524,10 +531,10 @@ def test_backing_image_auto_resync(bi_url, client, volume_name): # NOQA client, BACKING_IMAGE_NAME, bi_url) # Step 2 - volume = create_and_check_volume( - client, volume_name, 3, - str(BACKING_IMAGE_EXT4_SIZE), - BACKING_IMAGE_NAME) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=BACKING_IMAGE_NAME) # Step 3 lht_host_id 
= get_self_host_id() @@ -581,13 +588,13 @@ def backing_image_cleanup(core_api, client): # NOQA # Step 2 lht_host_id = get_self_host_id() - volume1 = create_and_check_volume( - client, volume_name="vol-1", size=str(1 * Gi), - backing_image=backing_img1_name) + volume1 = create_and_check_volume(client, "vol-1", + size=str(1 * Gi), + backing_image=backing_img1_name) - volume2 = create_and_check_volume( - client, volume_name="vol-2", size=str(1 * Gi), - backing_image=backing_img2_name) + volume2 = create_and_check_volume(client, "vol-2", + size=str(1 * Gi), + backing_image=backing_img2_name) # Step 3 volume1.attach(hostId=lht_host_id) @@ -630,3 +637,63 @@ def test_backing_image_with_wrong_md5sum(bi_url, client): # NOQA wait_for_backing_image_status(client, BACKING_IMAGE_NAME, BACKING_IMAGE_STATE_FAILED_AND_CLEANUP) + + +def test_volume_wait_for_backing_image_condition(client): # NOQA + """ + Test the volume condition "WaitForBackingImage" + + Given + - Create a BackingImage + + When + - Creating the Volume with the BackingImage while it is still in progress + + Then + - The condition "WaitForBackingImage" of the Volume + would be first True and then change to False when + the BackingImage is ready and all the replicas are in running state. + """ + # Create a large volume and export as backingimage + lht_host_id = get_self_host_id() + + volume1_name = "vol1" + volume1 = create_and_check_volume(client, volume1_name, + num_of_replicas=3, + size=str(1 * Gi)) + volume1.attach(hostId=lht_host_id) + volume1 = wait_for_volume_healthy(client, volume1_name) + volume_endpoint = get_volume_endpoint(volume1) + write_volume_dev_random_mb_data(volume_endpoint, 1, 500) + vol1_cksum = get_device_checksum(volume_endpoint) + + backing_img_name = 'bi-test' + backing_img = client.create_backing_image( + name=backing_img_name, + sourceType=BACKING_IMAGE_SOURCE_TYPE_FROM_VOLUME, + parameters={"export-type": "qcow2", "volume-name": volume1_name}, + expectedChecksum="") + + # Create volume with that backing image + volume2_name = "vol2" + volume2 = create_and_check_volume(client, volume2_name, + size=str(1 * Gi), + backing_image=backing_img["name"]) + + volume2.attach(hostId=lht_host_id) + + if check_backing_image_disk_map_status(client, + backing_img_name, + 1, + BACKING_IMAGE_STATE_IN_PROGRESS): + volume2 = client.by_id_volume(volume2_name) + assert volume2.conditions.WaitForBackingImage.status == "True" + + # Check volume healthy, and backing image ready + volume2 = wait_for_volume_healthy(client, volume2_name, RETRY_COUNTS_LONG) + assert volume2.conditions.WaitForBackingImage.status == "False" + check_backing_image_disk_map_status(client, backing_img_name, 3, "ready") + + volume_endpoint = get_volume_endpoint(volume2) + vol2_cksum = get_device_checksum(volume_endpoint) + assert vol1_cksum == vol2_cksum diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index 4088e9fe8d..bedd863e03 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -100,6 +100,7 @@ from common import BACKUP_COMPRESSION_METHOD_NONE from common import create_and_wait_deployment from common import get_custom_object_api_client +from common import RETRY_COUNTS_SHORT from backupstore import backupstore_delete_volume_cfg_file from backupstore import backupstore_cleanup @@ -270,8 +271,10 @@ def volume_basic_test(client, volume_name, backing_image=""): # NOQA numberOfReplicas=2, frontend="invalid_frontend") - volume = create_and_check_volume(client, volume_name, 
num_replicas, SIZE, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=num_replicas, + size=SIZE, + backing_image=backing_image) assert volume.restoreRequired is False def validate_volume_basic(expected, actual): @@ -348,8 +351,11 @@ def test_volume_iscsi_basic(client, volume_name): # NOQA def volume_iscsi_basic_test(client, volume_name, backing_image=""): # NOQA host_id = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 3, SIZE, - backing_image, VOLUME_FRONTEND_ISCSI) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=SIZE, + backing_image=backing_image, + frontend=VOLUME_FRONTEND_ISCSI) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -558,8 +564,10 @@ def test_backup_status_for_unavailable_replicas(set_random_backupstore, client, def backup_status_for_unavailable_replicas_test(client, volume_name, # NOQA size, backing_image=""): # NOQA - volume = create_and_check_volume(client, volume_name, 2, str(size), - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(size), + backing_image=backing_image) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -745,7 +753,9 @@ def test_dr_volume_activated_with_failed_replica(set_random_backupstore, client, backupstore_cleanup(client) host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -820,7 +830,9 @@ def test_dr_volume_with_backup_block_deletion(set_random_backupstore, client, co host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -917,7 +929,9 @@ def test_dr_volume_with_backup_block_deletion_abort_during_backup_in_progress(se host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -996,7 +1010,9 @@ def test_dr_volume_with_all_backup_blocks_deleted(set_random_backupstore, client host_id = get_self_host_id() - vol = create_and_check_volume(client, volume_name, 2, SIZE) + vol = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) vol.attach(hostId=host_id) vol = common.wait_for_volume_healthy(client, volume_name) @@ -1321,8 +1337,10 @@ def test_backup(set_random_backupstore, client, volume_name): # NOQA def backup_test(client, volume_name, size, backing_image="", compression_method=DEFAULT_BACKUP_COMPRESSION_METHOD): # NOQA - volume = create_and_check_volume(client, volume_name, 2, size, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -1381,8 +1399,10 @@ def test_backup_labels(set_random_backupstore, client, random_labels, volume_nam def backup_labels_test(client, random_labels, volume_name, size=SIZE, backing_image=""): # NOQA host_id = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 2, size, - 
backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -1432,7 +1452,9 @@ def test_restore_inc(set_random_backupstore, client, core_api, volume_name, pod) def restore_inc_test(client, core_api, volume_name, pod): # NOQA - std_volume = create_and_check_volume(client, volume_name, 2, SIZE) + std_volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) lht_host_id = get_self_host_id() std_volume.attach(hostId=lht_host_id) std_volume = common.wait_for_volume_healthy(client, volume_name) @@ -1848,8 +1870,10 @@ def test_volume_multinode(client, volume_name): # NOQA assert len(volumes) == 0 -@pytest.mark.skip(reason="TODO") -def test_pvc_storage_class_name_from_backup_volume(): # NOQA +def test_pvc_storage_class_name_from_backup_volume(set_random_backupstore, # NOQA + core_api, client, volume_name, # NOQA + pvc_name, pvc, pod_make, # NOQA + storage_class): # NOQA """ Test that the storageClassName of the restored volume's PV/PVC comes from the backup volume @@ -1860,14 +1884,13 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA kind: StorageClass apiVersion: storage.k8s.io/v1 metadata: - name: longhorn-sc-name-recorded + name: longhorn-test provisioner: driver.longhorn.io allowVolumeExpansion: true reclaimPolicy: Delete volumeBindingMode: Immediate parameters: numberOfReplicas: "3" - staleReplicaTimeout: "2880" ``` - Create a PVC to use this SC ``` @@ -1878,10 +1901,10 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA spec: accessModes: - ReadWriteOnce - storageClassName: longhorn-sc-name-recorded + storageClassName: longhorn-test resources: requests: - storage: 5Gi + storage: 300Mi ``` - Attach the Volume and write some data @@ -1890,17 +1913,89 @@ def test_pvc_storage_class_name_from_backup_volume(): # NOQA Then - the backup volume's status.storageClassName should be - longhorn-sc-name-recorded + longhorn-test When - Restore the backup to a new volume - Create PV/PVC from the new volume with create new PVC option Then - - The new PVC's storageClassName should still be longhorn-sc-name-recorded + - The new PVC's storageClassName should still be longhorn-test - Verify the restored data is the same as original one """ - pass + volume_size = str(300 * Mi) + create_storage_class(storage_class) + + pod_name = "pod-" + pvc_name + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['resources']['requests']['storage'] = volume_size + common.create_pvc(pvc) + + pv = common.wait_and_get_pv_for_pvc(core_api, pvc_name) + assert pv.status.phase == "Bound" + + test_pod = pod_make(pod_name) + test_pod['metadata']['name'] = pod_name + test_pod['spec']['volumes'] = [{ + 'name': test_pod['spec']['containers'][0]['volumeMounts'][0]['name'], + 'persistentVolumeClaim': {'claimName': pvc_name}, + }] + create_and_wait_pod(core_api, test_pod) + + test_data = generate_random_data(VOLUME_RWTEST_SIZE) + write_pod_volume_data(core_api, pod_name, test_data) + + volume_name = pv.spec.csi.volume_handle + volume_id = client.by_id_volume(volume_name) + snapshot = volume_id.snapshotCreate() + + volume_id.snapshotBackup(name=snapshot.name) + wait_for_backup_completion(client, volume_name, snapshot.name) + + # in the NFS backupstore, bv.storageClassName is sometimes empty + # due to a timing issue + for i in range(RETRY_COMMAND_COUNT): + bv, b 
= find_backup(client, volume_name, snapshot.name) + if bv.storageClassName != "": + break + time.sleep(RETRY_INTERVAL) + assert bv.storageClassName == storage_class['metadata']['name'] + + restore_name = generate_volume_name() + volume = client.create_volume(name=restore_name, size=volume_size, + numberOfReplicas=3, + fromBackup=b.url) + + volume = common.wait_for_volume_restoration_completed(client, restore_name) + volume = common.wait_for_volume_detached(client, restore_name) + assert volume.name == restore_name + assert volume.size == volume_size + assert volume.numberOfReplicas == 3 + assert volume.state == "detached" + + create_pv_for_volume(client, core_api, volume, restore_name) + create_pvc_for_volume(client, core_api, volume, restore_name) + + claim = core_api.\ + read_namespaced_persistent_volume_claim(name=restore_name, + namespace='default') + + assert claim.spec.storage_class_name == storage_class['metadata']['name'] + + backup_pod = pod_make(name="backup-pod") + restore_volume_pod_name = "pod-" + restore_name + backup_pod['metadata']['name'] = restore_volume_pod_name + backup_pod['spec']['volumes'] = [{ + 'name': backup_pod['spec']['containers'][0]['volumeMounts'][0]['name'], # NOQA + 'persistentVolumeClaim': { + 'claimName': restore_name, + }, + }] + create_and_wait_pod(core_api, backup_pod) + + resp = read_volume_data(core_api, restore_volume_pod_name) + assert resp == test_data @pytest.mark.coretest # NOQA @@ -2003,7 +2098,8 @@ def test_volume_update_replica_count(client, volume_name): # NOQA host_id = get_self_host_id() replica_count = 2 - volume = create_and_check_volume(client, volume_name, replica_count) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=replica_count) volume.attach(hostId=host_id) volume = common.wait_for_volume_healthy(client, volume_name) @@ -2108,11 +2204,8 @@ def test_storage_class_from_backup(set_random_backupstore, volume_name, pvc_name pv_name = pvc_name - volume = create_and_check_volume( - client, - volume_name, - size=VOLUME_SIZE - ) + volume = create_and_check_volume(client, volume_name, + size=VOLUME_SIZE) wait_for_volume_detached(client, volume_name) @@ -2330,7 +2423,9 @@ def test_expansion_with_size_round_up(client, core_api, volume_name): # NOQA 5. Check if size round up '2147483648' and the written data. 
""" - volume = create_and_check_volume(client, volume_name, 2, str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(1 * Gi)) self_hostId = get_self_host_id() volume.attach(hostId=self_hostId, disableFrontend=False) @@ -2392,7 +2487,9 @@ def test_restore_inc_with_offline_expansion(set_random_backupstore, client, core """ lht_host_id = get_self_host_id() - std_volume = create_and_check_volume(client, volume_name, 2, SIZE) + std_volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=SIZE) std_volume.attach(hostId=lht_host_id) std_volume = common.wait_for_volume_healthy(client, volume_name) @@ -3077,7 +3174,8 @@ def test_backup_lock_deletion_during_restoration(set_random_backupstore, client, wait_for_backup_completion(client, std_volume_name, snap1.name) _, b = common.find_backup(client, std_volume_name, snap1.name) - client.create_volume(name=restore_volume_name, fromBackup=b.url) + client.create_volume(name=restore_volume_name, fromBackup=b.url, + numberOfReplicas=3) wait_for_volume_restoration_start(client, restore_volume_name, b.name) backup_volume = client.by_id_backupVolume(std_volume_name) @@ -3172,7 +3270,8 @@ def test_backup_lock_deletion_during_backup(set_random_backupstore, client, core b1 = None assert b1 is None - client.create_volume(name=restore_volume_name_1, fromBackup=b2.url) + client.create_volume(name=restore_volume_name_1, fromBackup=b2.url, + numberOfReplicas=3) wait_for_volume_restoration_completed(client, restore_volume_name_1) restore_volume_1 = wait_for_volume_detached(client, restore_volume_name_1) @@ -4118,7 +4217,9 @@ def test_expand_pvc_with_size_round_up(client, core_api, volume_name): # NOQA setting = client.update(setting, value=static_sc_name) assert setting.value == static_sc_name - volume = create_and_check_volume(client, volume_name, 2, str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=str(1 * Gi)) create_pv_for_volume(client, core_api, volume, volume_name) create_pvc_for_volume(client, core_api, volume, volume_name) @@ -5547,7 +5648,7 @@ def test_backuptarget_invalid(apps_api, # NOQA snap = create_snapshot(client, volume_name) volume.snapshotBackup(name=snap.name) - for i in range(RETRY_COMMAND_COUNT): + for i in range(RETRY_COUNTS_SHORT): api = get_custom_object_api_client() backups = api.list_namespaced_custom_object("longhorn.io", "v1beta2", diff --git a/manager/integration/tests/test_cluster_autoscaler.py b/manager/integration/tests/test_cluster_autoscaler.py index 1404719701..6a103674c0 100644 --- a/manager/integration/tests/test_cluster_autoscaler.py +++ b/manager/integration/tests/test_cluster_autoscaler.py @@ -147,9 +147,8 @@ def finalizer(): nodes = client.list_node() scale_size = len(nodes)-1 - volume = create_and_check_volume( - client, volume_name, num_of_replicas=scale_size - ) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=scale_size) create_pv_for_volume(client, core_api, volume, volume.name) create_pvc_for_volume(client, core_api, volume, volume.name) diff --git a/manager/integration/tests/test_csi.py b/manager/integration/tests/test_csi.py index d9e9e16c00..84930ffbb0 100644 --- a/manager/integration/tests/test_csi.py +++ b/manager/integration/tests/test_csi.py @@ -280,7 +280,8 @@ def test_csi_encrypted_block_volume(client, core_api, storage_class, crypto_secr 7. 
Validate the data in `pod2` is consistent with `test_data` """ - create_crypto_secret(crypto_secret) + secret = crypto_secret(LONGHORN_NAMESPACE) + create_crypto_secret(secret) storage_class['reclaimPolicy'] = 'Retain' storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-name'] = 'longhorn-crypto' # NOQA @@ -891,6 +892,7 @@ def test_csi_minimal_volume_size( csi_pv['metadata']['name'] = pv_name csi_pv['spec']['csi']['volumeHandle'] = vol_name csi_pv['spec']['capacity']['storage'] = min_storage + csi_pv['spec']['persistentVolumeReclaimPolicy'] = 'Retain' core_api.create_persistent_volume(csi_pv) pvc_name = vol_name + "-pvc" diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py index 073f69dac9..2c16d6a8c1 100644 --- a/manager/integration/tests/test_engine_upgrade.py +++ b/manager/integration/tests/test_engine_upgrade.py @@ -9,6 +9,7 @@ from common import wait_for_volume_detached from common import wait_for_engine_image_deletion from common import wait_for_engine_image_ref_count, wait_for_engine_image_state +from common import wait_for_engine_image_incompatible from common import get_volume_engine, write_volume_random_data from common import check_volume_endpoint from common import wait_for_volume_replicas_mode @@ -450,8 +451,8 @@ def test_engine_image_incompatible(client, core_api, volume_name): # NOQA ctl_v, ctl_minv, data_v, data_minv) img = client.create_engine_image(image=fail_cli_v_image) - img = wait_for_engine_image_state(client, img.name, "incompatible") - assert img.state == "incompatible" + img = wait_for_engine_image_incompatible(client, img.name) + assert img.incompatible assert img.cliAPIVersion == cli_minv - 1 assert img.cliAPIMinVersion == cli_minv - 1 client.delete(img) @@ -462,8 +463,8 @@ def test_engine_image_incompatible(client, core_api, volume_name): # NOQA ctl_v, ctl_minv, data_v, data_minv) img = client.create_engine_image(image=fail_cli_minv_image) - img = wait_for_engine_image_state(client, img.name, "incompatible") - assert img.state == "incompatible" + img = wait_for_engine_image_incompatible(client, img.name) + assert img.incompatible assert img.cliAPIVersion == cli_v + 1 assert img.cliAPIMinVersion == cli_v + 1 client.delete(img) diff --git a/manager/integration/tests/test_ha.py b/manager/integration/tests/test_ha.py index 0137916556..6c1a794515 100644 --- a/manager/integration/tests/test_ha.py +++ b/manager/integration/tests/test_ha.py @@ -95,6 +95,8 @@ SMALL_RETRY_COUNTS = 30 BACKUPSTORE = get_backupstores() +REPLICA_FAILURE_MODE_CRASH = "replica_failure_mode_crash" +REPLICA_FAILURE_MODE_DELETE = "replica_failure_mode_delete" @pytest.mark.coretest # NOQA def test_ha_simple_recovery(client, volume_name): # NOQA @@ -111,8 +113,10 @@ def test_ha_simple_recovery(client, volume_name): # NOQA def ha_simple_recovery_test(client, volume_name, size, backing_image=""): # NOQA - volume = create_and_check_volume(client, volume_name, 2, size, - backing_image) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=size, + backing_image=backing_image) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) @@ -246,7 +250,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_AUTO_SALVAGE assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 2, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, backing_image=backing_image) host_id = get_self_host_id() @@ -289,7 +294,8 @@ def 
ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_AUTO_SALVAGE assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 2, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, backing_image=backing_image) volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -337,7 +343,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == SETTING_DISABLE_REVISION_COUNTER assert setting.value == "true" - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, backing_image=backing_image) host_id = get_self_host_id() @@ -381,7 +388,8 @@ def ha_salvage_test(client, core_api, # NOQA assert setting.name == "disable-revision-counter" assert setting.value == "false" - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, backing_image=backing_image) host_id = get_self_host_id() @@ -505,7 +513,8 @@ def test_ha_prohibit_deleting_last_replica(client, volume_name): # NOQA FIXME: Move out of test_ha.py """ - volume = create_and_check_volume(client, volume_name, 1) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=1) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) @@ -537,7 +546,9 @@ def test_ha_recovery_with_expansion(client, volume_name, request): # NOQA """ original_size = str(3 * Gi) expand_size = str(4 * Gi) - volume = create_and_check_volume(client, volume_name, 2, original_size) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2, + size=original_size) host_id = get_self_host_id() volume.attach(hostId=host_id) @@ -800,92 +811,32 @@ def test_rebuild_replica_and_from_replica_on_the_same_node(client, core_api, vol def test_rebuild_with_restoration(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA """ - [HA] Test if the rebuild is disabled for the restoring volume + [HA] Test if the rebuild is disabled for the restoring volume. + + This is similar to test_single_replica_restore_failure and + test_single_replica_unschedulable_restore_failure. In this version, a + replica is deleted. We expect a new replica to be rebuilt in its place and + the restore to complete. + 1. Setup a random backupstore. - 2. Create a pod with a volume and wait for pod to start. - 3. Write data to the volume and get the md5sum. - 4. Create a backup for the volume. - 5. Restore a volume from the backup. - 6. Delete one replica during the restoration. - 7. Wait for the restoration complete and the volume detached. - 8. Check if the replica is rebuilt for the auto detachment. - 9. Create PV/PVC/Pod for the restored volume and wait for the pod start. - 10. Check if the restored volume is state `Healthy` + 2. Do cleanup for the backupstore. + 3. Create a pod with a volume and wait for pod to start. + 4. Write data to the pod volume and get the md5sum. + 5. Create a backup for the volume. + 6. Restore a volume from the backup. + 7. Wait for the volume restore start. + 8. Delete one replica during the restoration. + 9. Wait for the restoration complete and the volume detached. + 10. Check if the replica is rebuilt. + 11. Create PV/PVC/Pod for the restored volume and wait for the pod start. + 12. Check if the restored volume is state `Healthy` after the attachment. - 11. Check md5sum of the data in the restored volume. - 12. Do cleanup. + 13. 
Check md5sum of the data in the restored volume. + 14. Do cleanup. """ - update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false") - - original_volume_name = volume_name + "-origin" - data_path = "/data/test" - original_pod_name, original_pv_name, original_pvc_name, original_md5sum = \ - prepare_pod_with_data_in_mb( - client, core_api, csi_pv, pvc, pod_make, original_volume_name, - volume_size=str(2*Gi), data_path=data_path, - data_size_in_mb=3*DATA_SIZE_IN_MB_3) - - original_volume = client.by_id_volume(original_volume_name) - snap = create_snapshot(client, original_volume_name) - original_volume.snapshotBackup(name=snap.name) - wait_for_backup_completion(client, - original_volume_name, - snap.name, - retry_count=600) - bv, b = find_backup(client, original_volume_name, snap.name) - - restore_volume_name = volume_name + "-restore" - client.create_volume(name=restore_volume_name, size=str(2 * Gi), - numberOfReplicas=3, fromBackup=b.url) - wait_for_volume_creation(client, restore_volume_name) - - restoring_replica = wait_for_volume_restoration_start( - client, restore_volume_name, b.name) - restore_volume = client.by_id_volume(restore_volume_name) - restore_volume.replicaRemove(name=restoring_replica) - client.list_backupVolume() - - # Wait for the rebuild start - running_replica_count = 0 - for i in range(RETRY_COUNTS): - running_replica_count = 0 - restore_volume = client.by_id_volume(restore_volume_name) - for r in restore_volume.replicas: - if r['running'] and not r['failedAt']: - running_replica_count += 1 - if running_replica_count == 3: - break - time.sleep(RETRY_INTERVAL) - assert running_replica_count == 3 - - wait_for_volume_restoration_completed(client, restore_volume_name) - restore_volume = wait_for_volume_detached(client, restore_volume_name) - assert len(restore_volume.replicas) == 3 - for r in restore_volume.replicas: - assert restoring_replica != r.name - assert r['failedAt'] == "" - - restore_pod_name = restore_volume_name + "-pod" - restore_pv_name = restore_volume_name + "-pv" - restore_pvc_name = restore_volume_name + "-pvc" - restore_pod = pod_make(name=restore_pod_name) - create_pv_for_volume(client, core_api, restore_volume, restore_pv_name) - create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name) - restore_pod['spec']['volumes'] = [create_pvc_spec(restore_pvc_name)] - create_and_wait_pod(core_api, restore_pod) - - restore_volume = client.by_id_volume(restore_volume_name) - assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY - - md5sum = get_pod_data_md5sum(core_api, restore_pod_name, data_path) - assert original_md5sum == md5sum - - # cleanup the backupstore so we don't impact other tests - # since we crashed the replica that initiated the restore - # it's backupstore lock will still be present, so we need to - # wait till the lock is expired, before we can delete the backups - backupstore_wait_for_lock_expiration() - backupstore_cleanup(client) + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, False, False, + REPLICA_FAILURE_MODE_DELETE) def test_rebuild_with_inc_restoration(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA @@ -1504,121 +1455,65 @@ def test_single_replica_restore_failure(set_random_backupstore, client, core_api becoming Degraded, and if the restore volume is still usable after the failure. - Notice that this case is similar to test_rebuild_with_restoration(). - But the way to fail the replica is different. 
- test_rebuild_with_restoration() directly crash the replica process - hence there is no error in the restore status. + This is similar to test_rebuild_with_restoration and + test_single_replica_unschedulable_restore_failure. In this version, a + replica is crashed. We expect the crashed replica to be rebuilt and the + restore to complete. - 1. Enable auto-salvage. - 2. Set the a random backupstore. - 3. Do cleanup for the backupstore. + 1. Setup a random backupstore. + 2. Do cleanup for the backupstore. + 3. Create a pod with a volume and wait for pod to start. + 4. Write data to the pod volume and get the md5sum. + 5. Create a backup for the volume. + 6. Restore a volume from the backup. + 7. Wait for the volume restore start. + 8. Crash one replica during the restoration. + 9. Wait for the restoration complete and the volume detached. + 10. Check if the replica is rebuilt. + 11. Create PV/PVC/Pod for the restored volume and wait for the pod start. + 12. Check if the restored volume is state `Healthy` + after the attachment. + 13. Check md5sum of the data in the restored volume. + 14. Do cleanup. + """ + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, False, False, + REPLICA_FAILURE_MODE_CRASH) + + +def test_single_replica_unschedulable_restore_failure(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA + """ + [HA] Test if the restore can complete if a restoring replica is killed + while it is ongoing and cannot be recovered. + + This is similar to test_rebuild_with_restoration and + test_single_replica_restore_failure. In this version, a replica is crashed + and not allowed to recover. However, we enable + allow-volume-creation-with-degraded-availability, so we expect the restore + to complete anyway. + + 1. Setup a random backupstore. + 2. Do cleanup for the backupstore. + 3. Enable allow-volume-creation-with-degraded-availability (to allow + restoration to complete without all replicas). 4. Create a pod with a volume and wait for pod to start. 5. Write data to the pod volume and get the md5sum. 6. Create a backup for the volume. 7. Restore a volume from the backup. - 8. Wait for the volume restore start by checking if: - 8.1. `volume.restoreStatus` shows the related restore info. - 8.2. `volume.conditions[Restore].status == True && - volume.conditions[Restore].reason == "RestoreInProgress"`. - 8.3. `volume.ready == false`. - 9. Find a way to fail just one replica restore. - e.g. Use iptable to block the restore. - 10. Wait for the restore volume Degraded. - 11. Wait for the volume restore & rebuild complete and check if: - 11.1. `volume.ready == true` - 11.2. `volume.conditions[Restore].status == False && - volume.conditions[Restore].reason == ""`. + 8. Wait for the volume restore start. + 9. Disable replica rebuilding (to ensure the killed replica cannot + recover). + 10. Crash one replica during the restoration. + 11. Wait for the restoration complete and the volume detached. 12. Create PV/PVC/Pod for the restored volume and wait for the pod start. 13. Check if the restored volume is state `Healthy` after the attachment. 14. Check md5sum of the data in the restored volume. 15. Do cleanup. 
""" - auto_salvage_setting = client.by_id_setting(SETTING_AUTO_SALVAGE) - assert auto_salvage_setting.name == SETTING_AUTO_SALVAGE - assert auto_salvage_setting.value == "true" - - update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false") - - backupstore_cleanup(client) - - data_path = "/data/test" - - pod_name, pv_name, pvc_name, md5sum = \ - prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, - pod_make, - volume_name, - data_size_in_mb=DATA_SIZE_IN_MB_2, - data_path=data_path) - - volume = client.by_id_volume(volume_name) - snap = create_snapshot(client, volume_name) - volume.snapshotBackup(name=snap.name) - wait_for_backup_completion(client, volume_name, snap.name) - bv, b = find_backup(client, volume_name, snap.name) - - res_name = "res-" + volume_name - - client.create_volume(name=res_name, fromBackup=b.url) - wait_for_volume_condition_restore(client, res_name, - "status", "True") - wait_for_volume_condition_restore(client, res_name, - "reason", "RestoreInProgress") - - res_volume = client.by_id_volume(res_name) - assert res_volume.ready is False - - res_volume = wait_for_volume_healthy_no_frontend(client, res_name) - - failed_replica = res_volume.replicas[0] - crash_replica_processes(client, core_api, res_name, - replicas=[failed_replica], - wait_to_fail=False) - wait_for_volume_degraded(client, res_name) - - # Wait for the rebuild start - running_replica_count = 0 - for i in range(RETRY_COUNTS): - running_replica_count = 0 - res_volume = client.by_id_volume(res_name) - for r in res_volume.replicas: - if r['running'] and not r['failedAt']: - running_replica_count += 1 - if running_replica_count == 3: - break - time.sleep(RETRY_INTERVAL) - assert running_replica_count == 3 - - wait_for_volume_restoration_completed(client, res_name) - wait_for_volume_condition_restore(client, res_name, - "status", "False") - res_volume = wait_for_volume_detached(client, res_name) - assert res_volume.ready is True - - res_pod_name = res_name + "-pod" - pv_name = res_name + "-pv" - pvc_name = res_name + "-pvc" - - create_pv_for_volume(client, core_api, res_volume, pv_name) - create_pvc_for_volume(client, core_api, res_volume, pvc_name) - - res_pod = pod_make(name=res_pod_name) - res_pod['spec']['volumes'] = [create_pvc_spec(pvc_name)] - create_and_wait_pod(core_api, res_pod) - - res_volume = client.by_id_volume(res_name) - assert res_volume[VOLUME_FIELD_ROBUSTNESS] == VOLUME_ROBUSTNESS_HEALTHY - - res_md5sum = get_pod_data_md5sum(core_api, res_pod_name, data_path) - assert md5sum == res_md5sum - - # cleanup the backupstore so we don't impact other tests - # since we crashed the replica that initiated the restore - # it's backupstore lock will still be present, so we need to - # wait till the lock is expired, before we can delete the backups - backupstore_wait_for_lock_expiration() - backupstore_cleanup(client) - + restore_with_replica_failure(client, core_api, volume_name, csi_pv, pvc, + pod_make, True, True, + REPLICA_FAILURE_MODE_CRASH) def test_dr_volume_with_restore_command_error(set_random_backupstore, client, core_api, volume_name, csi_pv, pvc, pod_make): # NOQA """ @@ -1995,7 +1890,8 @@ def test_rebuild_after_replica_file_crash(client, volume_name): # NOQA 6. Read the data from the volume and verify the md5sum. 
""" replica_count = 3 - volume = create_and_check_volume(client, volume_name, replica_count) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=replica_count) host_id = get_self_host_id() volume = volume.attach(hostId=host_id) volume = wait_for_volume_healthy(client, volume_name) @@ -2602,7 +2498,9 @@ def test_replica_failure_during_attaching(settings_reset, client, core_api, volu node = common.wait_for_disk_update(client, node.name, len(update_disks)) volume_name_2 = volume_name + '-2' - volume_2 = create_and_check_volume(client, volume_name_2, 3, str(1 * Gi)) + volume_2 = create_and_check_volume(client, volume_name_2, + num_of_replicas=3, + size=str(1 * Gi)) volume_2.attach(hostId=host_id) volume_2 = wait_for_volume_healthy(client, volume_name_2) write_volume_random_data(volume_2) @@ -2961,7 +2859,8 @@ def test_engine_image_not_fully_deployed_perform_replica_scheduling(client, core node2 = common.wait_for_node_update(client, node2.id, "allowScheduling", False) - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(3 * Gi)) volume1.attach(hostId=node3.id) @@ -3008,10 +2907,12 @@ def test_engine_image_not_fully_deployed_perform_auto_upgrade_engine(client, cor """ prepare_engine_not_fully_deployed_environment(client, core_api) - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(3 * Gi)) - volume2 = create_and_check_volume(client, "vol-2", num_of_replicas=2, + volume2 = create_and_check_volume(client, "vol-2", + num_of_replicas=2, size=str(3 * Gi)) default_img = common.get_default_engine_image(client) @@ -3085,7 +2986,8 @@ def test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume(c prepare_engine_not_fully_deployed_environment(client, core_api) # step 1 - volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2, + volume1 = create_and_check_volume(client, "vol-1", + num_of_replicas=2, size=str(1 * Gi)) # node1: tainted node, node2: self host node, node3: the last one @@ -3402,3 +3304,142 @@ def test_recovery_from_im_deletion(client, core_api, volume_name, make_deploymen # Step8 assert test_data == to_be_verified_data + + +@pytest.mark.skip(reason="TODO") # NOQA +def test_retain_potentially_useful_replicas_in_autosalvage_loop(): + """ + Related issue: + https://github.com/longhorn/longhorn/issues/7425 + + Related manual test steps: + https://github.com/longhorn/longhorn-manager/pull/2432#issuecomment-1894675916 + + Steps: + 1. Create a volume with numberOfReplicas=2 and staleReplicaTimeout=1. + Consider its two replicas ReplicaA and ReplicaB. + 2. Attach the volume to a node. + 3. Write data to the volume. + 4. Exec into the instance-manager for ReplicaB and delete all .img.meta + files. This makes it impossible to restart ReplicaB successfully. + 5. Cordon the node for Replica A. This makes it unavailable for + autosalvage. + 6. Crash the instance-managers for both ReplicaA and ReplicaB. + 7. Wait one minute and fifteen seconds. This is longer than + staleReplicaTimeout. + 8. Confirm the volume is not healthy. + 9. Confirm ReplicaA was not deleted. + 10. Delete ReplicaB. + 11. Wait for the volume to become healthy. + 12. Verify the data. 
+ """ + +def restore_with_replica_failure(client, core_api, volume_name, csi_pv, # NOQA + pvc, pod_make, # NOQA + allow_degraded_availability, + disable_rebuild, replica_failure_mode): + """ + restore_with_replica_failure is reusable by a number of similar tests. + In general, it attempts a volume restore, kills one of the restoring + replicas, and verifies the restore can still complete. The manner in which + a replica is killed and the settings enabled at the time vary with the + parameters. + """ + + backupstore_cleanup(client) + + update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, + str(allow_degraded_availability).lower()) + + data_path = "/data/test" + _, _, _, md5sum = \ + prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, + pod_make, + volume_name, + volume_size=str(2 * Gi), + data_size_in_mb=DATA_SIZE_IN_MB_4, + data_path=data_path) + + volume = client.by_id_volume(volume_name) + snap = create_snapshot(client, volume_name) + volume.snapshotBackup(name=snap.name) + wait_for_backup_completion(client, volume_name, snap.name, retry_count=600) + _, b = find_backup(client, volume_name, snap.name) + + restore_volume_name = volume_name + "-restore" + client.create_volume(name=restore_volume_name, size=str(2 * Gi), + fromBackup=b.url) + + _ = wait_for_volume_restoration_start(client, restore_volume_name, b.name) + restore_volume = client.by_id_volume(restore_volume_name) + failed_replica = restore_volume.replicas[0] + + if disable_rebuild: + common.update_setting( + client, + common.SETTING_CONCURRENT_REPLICA_REBUILD_PER_NODE_LIMIT, "0") + + if replica_failure_mode == REPLICA_FAILURE_MODE_CRASH: + crash_replica_processes(client, core_api, restore_volume_name, + replicas=[failed_replica], + wait_to_fail=False) + if replica_failure_mode == REPLICA_FAILURE_MODE_DELETE: + restore_volume.replicaRemove(name=failed_replica.name) + + if not disable_rebuild: + # If disable_rebuild then we expect the volume to quickly finish + # restoration and detach. We MIGHT be able to catch it degraded before, + # but trying can lead to flakes. Check degraded at the end of test, + # since no rebuilds are allowed. 
+ wait_for_volume_degraded(client, restore_volume_name) + running_replica_count = 0 + for i in range(RETRY_COUNTS): + running_replica_count = 0 + for r in restore_volume.replicas: + if r['running'] and not r['failedAt']: + running_replica_count += 1 + if running_replica_count == 3: + break + time.sleep(RETRY_INTERVAL) + assert running_replica_count == 3 + + wait_for_volume_restoration_completed(client, restore_volume_name) + wait_for_volume_condition_restore(client, restore_volume_name, + "status", "False") + restore_volume = wait_for_volume_detached(client, restore_volume_name) + assert restore_volume.ready + + if disable_rebuild and replica_failure_mode == REPLICA_FAILURE_MODE_DELETE: + assert len(restore_volume.replicas) == 3 + for r in restore_volume.replicas: + assert r['failedAt'] == "" + assert failed_replica.name != r.name + + restore_pod_name = restore_volume_name + "-pod" + restore_pv_name = restore_volume_name + "-pv" + restore_pvc_name = restore_volume_name + "-pvc" + create_pv_for_volume(client, core_api, restore_volume, restore_pv_name) + create_pvc_for_volume(client, core_api, restore_volume, restore_pvc_name) + + restore_pod = pod_make(name=restore_pod_name) + restore_pod['spec']['volumes'] = [create_pvc_spec(restore_pvc_name)] + create_and_wait_pod(core_api, restore_pod) + + restore_volume = client.by_id_volume(restore_volume_name) + if disable_rebuild: + # Restoration should be complete, but without one replica. + assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == \ + VOLUME_ROBUSTNESS_DEGRADED + else: + assert restore_volume[VOLUME_FIELD_ROBUSTNESS] == \ + VOLUME_ROBUSTNESS_HEALTHY + + restore_md5sum = get_pod_data_md5sum(core_api, restore_pod_name, data_path) + assert restore_md5sum == md5sum + + # cleanup the backupstore so we don't impact other tests + # since we crashed the replica that initiated the restore + # it's backupstore lock will still be present, so we need to + # wait till the lock is expired, before we can delete the backups + backupstore_wait_for_lock_expiration() + backupstore_cleanup(client) diff --git a/manager/integration/tests/test_infra.py b/manager/integration/tests/test_infra.py index 87cf0f5d1f..6842db3090 100644 --- a/manager/integration/tests/test_infra.py +++ b/manager/integration/tests/test_infra.py @@ -141,7 +141,6 @@ def is_infra_k3s(): @pytest.fixture def reset_cluster_ready_status(request): yield - node_worker_label = 'node-role.kubernetes.io/worker' node_controlplane_label = 'node-role.kubernetes.io/control-plane' node_ip_annotation = "flannel.alpha.coreos.com/public-ip" @@ -149,27 +148,16 @@ def reset_cluster_ready_status(request): longhorn_api_client = get_longhorn_api_client() cloudprovider = detect_cloudprovider() - k3s = is_infra_k3s() - print('==> test completed! 
reset cluster ready status ...') for node_item in k8s_api_client.list_node().items: - if k3s is True: - if node_controlplane_label not in node_item.metadata.labels: - node_name = node_item.metadata.name - node_ip = node_item.metadata.annotations[node_ip_annotation] - node = cloudprovider.instance_id_by_ip(node_ip) - else: - continue - + if node_controlplane_label not in node_item.metadata.labels: + node_name = node_item.metadata.name + node_ip = node_item.metadata.annotations[node_ip_annotation] + node = cloudprovider.instance_id_by_ip(node_ip) else: - if node_worker_label in node_item.metadata.labels and \ - node_item.metadata.labels[node_worker_label] == 'true': - node_name = node_item.metadata.name - node = cloudprovider.instance_id(node_name) - else: - continue + continue if is_node_ready_k8s(node_name, k8s_api_client) is False: @@ -199,7 +187,6 @@ def test_offline_node(reset_cluster_ready_status): 1. Bring down one of the nodes in Kuberntes cluster (avoid current node) 2. Make sure the Longhorn node state become `down` """ - node_worker_label = 'node-role.kubernetes.io/worker' pod_lable_selector = "longhorn-test=test-job" node_controlplane_label = 'node-role.kubernetes.io/control-plane' node_ip_annotation = "flannel.alpha.coreos.com/public-ip" @@ -214,27 +201,15 @@ def test_offline_node(reset_cluster_ready_status): if pod.metadata.name == "longhorn-test": longhorn_test_node_name = pod.spec.node_name - k3s = is_infra_k3s() - for node_item in k8s_api_client.list_node().items: - if k3s is True: - if node_controlplane_label not in node_item.metadata.labels: - node_name = node_item.metadata.name - node_ip = node_item.metadata.annotations[node_ip_annotation] - if node_name == longhorn_test_node_name: - continue - else: - node = cloudprovider.instance_id_by_ip(node_ip) - break - else: - if node_worker_label in node_item.metadata.labels and \ - node_item.metadata.labels[node_worker_label] == 'true': - node_name = node_item.metadata.name - if node_name == longhorn_test_node_name: - continue - else: - node = cloudprovider.instance_id(node_name) - break + if node_controlplane_label not in node_item.metadata.labels: + node_name = node_item.metadata.name + node_ip = node_item.metadata.annotations[node_ip_annotation] + if node_name == longhorn_test_node_name: + continue + else: + node = cloudprovider.instance_id_by_ip(node_ip) + break print(f'==> stop node: {node_name}') diff --git a/manager/integration/tests/test_metric.py b/manager/integration/tests/test_metric.py new file mode 100644 index 0000000000..3210cf1f00 --- /dev/null +++ b/manager/integration/tests/test_metric.py @@ -0,0 +1,523 @@ +import pytest +import requests +import time + +from collections import defaultdict +from prometheus_client.parser import text_string_to_metric_families + +from common import client, core_api, volume_name # NOQA + +from common import delete_replica_processes +from common import create_pv_for_volume +from common import create_pvc_for_volume +from common import create_snapshot +from common import create_and_check_volume +from common import generate_random_data +from common import get_self_host_id +from common import wait_for_volume_degraded +from common import wait_for_volume_detached +from common import wait_for_volume_detached_unknown +from common import wait_for_volume_expansion +from common import wait_for_volume_faulted +from common import wait_for_volume_healthy +from common import write_volume_data +from common import write_volume_random_data +from common import set_node_scheduling +from common import 
set_node_cordon +from common import Mi +from common import LONGHORN_NAMESPACE +from common import RETRY_COUNTS +from common import RETRY_INTERVAL +from common import DEFAULT_DISK_PATH + +# The dictionaries use float type of value because the value obtained from +# prometheus_client is in float type. +# https://github.com/longhorn/longhorn-tests/pull/1531#issuecomment-1833349994 +longhorn_volume_state = { + "creating": 1.0, + "attached": 2.0, + "detached": 3.0, + "attaching": 4.0, + "detaching": 5.0, + "deleting": 6.0, + } + +longhorn_volume_robustness = { + "unknown": 0.0, + "healthy": 1.0, + "degraded": 2.0, + "faulted": 3.0, +} + + +def get_metrics(core_api, metric_node_id): # NOQA + pods = core_api.list_namespaced_pod(namespace=LONGHORN_NAMESPACE, + label_selector="app=longhorn-manager") + for pod in pods.items: + if pod.spec.node_name == metric_node_id: + manager_ip = pod.status.pod_ip + break + + metrics = requests.get("http://{}:9500/metrics".format(manager_ip)).content + string_data = metrics.decode('utf-8') + result = text_string_to_metric_families(string_data) + return result + + +def find_metric(metric_data, metric_name): + return find_metrics(metric_data, metric_name)[0] + + +def find_metrics(metric_data, metric_name): + metrics = [] + + # Find the metric with the given name in the provided metric data + for family in metric_data: + for sample in family.samples: + if sample.name == metric_name: + metrics.append(sample) + + return metrics + + +def check_metric_with_condition(core_api, metric_name, metric_labels, expected_value=None, metric_node_id=get_self_host_id()): # NOQA) + """ + Some metric have multiple conditions, for exameple metric + longhorn_node_status have condition + - allowScheduling + - mountpropagation + - ready + - schedulable + metric longhorn_disk_status have conditions + - ready + - schedulable + Use this function to get specific condition of a mertic + """ + metric_data = get_metrics(core_api, metric_node_id) + + found_metric = next( + (sample for family in metric_data for sample in family.samples + if sample.name == metric_name and + sample.labels.get("condition") == metric_labels.get("condition")), + None + ) + + assert found_metric is not None + + examine_metric_value(found_metric, metric_labels, expected_value) + + +def check_metric(core_api, metric_name, metric_labels, expected_value=None, metric_node_id=get_self_host_id()): # NOQA + metric_data = get_metrics(core_api, metric_node_id) + + found_metric = None + for family in metric_data: + found_metric = next((sample for sample in family.samples if sample.name == metric_name), None) # NOQA + if found_metric: + break + + assert found_metric is not None + + examine_metric_value(found_metric, metric_labels, expected_value) + + +def examine_metric_value(found_metric, metric_labels, expected_value=None): + for key, value in metric_labels.items(): + assert found_metric.labels[key] == value + + assert isinstance(found_metric.value, float) + + if expected_value is not None: + assert found_metric.value == expected_value + else: + assert found_metric.value >= 0.0 + + +def check_metric_sum_on_all_nodes(client, core_api, metric_name, expected_labels, expected_value=None): # NOQA + # Initialize total_metrics to store the sum of the metric values. + total_metrics = {"labels": defaultdict(None), "value": 0.0} + + # Initialize the total_metric_values to store the sum of the + # metric label values. + total_metric_values = total_metrics["labels"] + + # Find the metric based on the given labels. 
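+    # Note: numeric values in expected_labels are not matched by the filter
+    # below; they are accumulated per node and compared against the expected
+    # totals at the end of this function.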
+ def filter_metric_by_labels(metrics, labels): + for metric in metrics: + is_matched = True + for key, value in labels.items(): + if type(value) in (float, int): + continue + + if metric.labels[key] != value: + is_matched = False + break + + if is_matched: + return metric + + raise AssertionError("Cannot find the metric matching the labels") + + for node in client.list_node(): + metric_data = get_metrics(core_api, node.name) + + metrics = find_metrics(metric_data, metric_name) + if len(metrics) == 0: + continue + + filtered_metric = filter_metric_by_labels(metrics, expected_labels) + + assert isinstance(filtered_metric.value, float) + + for key, value in expected_labels.items(): + value_type = type(value) + + if key not in total_metric_values: + total_metric_values[key] = value_type( + filtered_metric.labels[key] + ) + # Accumulate the metric label values. + elif isinstance(value, (float, int)): + total_metric_values[key] += value_type( + filtered_metric.labels[key] + ) + + # Accumulate the metric values. + total_metrics["value"] += filtered_metric.value + + for key, value in expected_labels.items(): + assert total_metric_values[key] == value + + if expected_value is not None: + assert total_metrics["value"] == expected_value + else: + assert total_metrics["value"] >= 0.0 + + +def wait_for_metric_count_all_nodes(client, core_api, metric_name, metric_labels, expected_count): # NOQA + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + try: + check_metric_count_all_nodes(client, core_api, metric_name, + metric_labels, expected_count) + return + except AssertionError: + continue + + check_metric_count_all_nodes(client, core_api, metric_name, + metric_labels, expected_count) + + +def check_metric_count_all_nodes(client, core_api, metric_name, metric_labels, expected_count): # NOQA + # Find the metrics based on the given labels. + def filter_metrics_by_labels(metrics, labels): + filtered_metrics = [] + for metric in metrics: + is_matched = True + for key, value in labels.items(): + if type(value) in (float, int): + continue + + if metric.labels[key] != value: + is_matched = False + break + + if is_matched: + filtered_metrics.append(metric) + + print(filtered_metrics) + return filtered_metrics + + filtered_metrics = [] + for node in client.list_node(): + metric_data = get_metrics(core_api, node.name) + + metrics = find_metrics(metric_data, metric_name) + if len(metrics) == 0: + continue + + filtered_metrics.extend( + filter_metrics_by_labels(metrics, metric_labels) + ) + + assert len(filtered_metrics) == expected_count + + + +@pytest.mark.parametrize("pvc_namespace", [LONGHORN_NAMESPACE, "default"]) # NOQA +def test_volume_metrics(client, core_api, volume_name, pvc_namespace): # NOQA + """ + https://longhorn.io/docs/master/monitoring/metrics/#volume + + The goal of this test case is to verify that the accuracy + of volume metrics by sending HTTP requests to + http://{longhorn-manager IP}:9500/metrics and use + prometheus_client to validate the return value. 
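+
+    At its core the check is a plain scrape of the Prometheus text format,
+    roughly as in the sketch below (manager_ip being the pod IP of a
+    longhorn-manager pod, as resolved by get_metrics above):
+
+        import requests
+        from prometheus_client.parser import text_string_to_metric_families
+
+        text = requests.get(
+            "http://{}:9500/metrics".format(manager_ip)).content.decode()
+        for family in text_string_to_metric_families(text):
+            for sample in family.samples:
+                print(sample.name, sample.labels, sample.value)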
+ """ + lht_hostId = get_self_host_id() + pv_name = volume_name + "-pv" + pvc_name = volume_name + "-pvc" + volume_size = str(500 * Mi) + volume = create_and_check_volume(client, + volume_name, + num_of_replicas=3, + size=volume_size) + + volume = client.by_id_volume(volume_name) + create_pv_for_volume(client, core_api, volume, pv_name) + create_pvc_for_volume(client, core_api, volume, pvc_name, pvc_namespace) + + volume = client.by_id_volume(volume_name) + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_healthy(client, volume_name) + write_volume_random_data(volume) + volume = client.by_id_volume(volume_name) + actual_size = float(volume.controllers[0].actualSize) + capacity_size = float(volume.size) + + metric_labels = { + "node": lht_hostId, + "pvc": pvc_name, + "volume": volume_name, + "pvc_namespace": pvc_namespace + } + + # check volume metric basic + check_metric(core_api, "longhorn_volume_actual_size_bytes", + metric_labels, actual_size) + check_metric(core_api, "longhorn_volume_capacity_bytes", + metric_labels, capacity_size) + check_metric(core_api, "longhorn_volume_read_throughput", + metric_labels) + check_metric(core_api, "longhorn_volume_write_throughput", + metric_labels) + check_metric(core_api, "longhorn_volume_read_iops", + metric_labels) + check_metric(core_api, "longhorn_volume_write_iops", + metric_labels) + check_metric(core_api, "longhorn_volume_read_latency", + metric_labels) + check_metric(core_api, "longhorn_volume_write_latency", + metric_labels) + + # verify longhorn_volume_robustness when volume is healthy, + # degraded, faulted or unknown + volume.detach() + volume = wait_for_volume_detached_unknown(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["unknown"]) + + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_healthy(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["healthy"]) + + volume.updateReplicaCount(replicaCount=4) + volume = wait_for_volume_degraded(client, volume_name) + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["degraded"]) + + volume.updateReplicaCount(replicaCount=3) + volume = wait_for_volume_healthy(client, volume_name) + delete_replica_processes(client, core_api, volume_name) + volume = wait_for_volume_faulted(client, volume_name) + + check_metric(core_api, "longhorn_volume_robustness", + metric_labels, longhorn_volume_robustness["faulted"]) + + # verify longhorn_volume_state when volume is attached or detached + volume = wait_for_volume_healthy(client, volume_name) + check_metric(core_api, "longhorn_volume_state", + metric_labels, longhorn_volume_state["attached"]) + + volume.detach() + volume = wait_for_volume_detached(client, volume_name) + check_metric(core_api, "longhorn_volume_state", + metric_labels, longhorn_volume_state["detached"]) + + +def test_metric_longhorn_snapshot_actual_size_bytes(client, core_api, volume_name): # NOQA + """ + Scenario: test metric longhorn_snapshot_actual_size_bytes + + Issue: https://github.com/longhorn/longhorn/issues/5869 + + Given a volume + + When 1 snapshot is created by user + And 1 snapshot is created by system + Then has a metric longhorn_snapshot_actual_size_bytes value equals to the + size of the user created snapshot, + and volume label is the volume name + and user_created label is true + And has a metric longhorn_snapshot_actual_size_bytes value equals to the + size of the system 
created snapshot, + and volume label is the volume name + and user_created label is false + + When 3 snapshot is created by user + Then has 4 metrics longhorn_snapshot_actual_size_bytes with + volume label is the volume name + and user_created label is true + And has 1 metrics longhorn_snapshot_actual_size_bytes with + volume label is the volume name + and user_created label is false + """ + self_hostId = get_self_host_id() + + # create a volume and attach it to a node. + volume_size = 50 * Mi + client.create_volume(name=volume_name, + numberOfReplicas=1, + size=str(volume_size)) + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=self_hostId) + volume = wait_for_volume_healthy(client, volume_name) + + # create the user snapshot. + data_size = 10 * Mi + user_snapshot_data_0 = {'pos': 0, + 'len': data_size, + 'content': generate_random_data(data_size)} + write_volume_data(volume, user_snapshot_data_0) + + create_snapshot(client, volume_name) + + # create the system snapshot by expanding the volume. + system_snapshot_data_0 = {'pos': 0, + 'len': data_size, + 'content': generate_random_data(data_size)} + write_volume_data(volume, system_snapshot_data_0) + + volume_size_expanded_0 = str(volume_size * 2) + volume.expand(size=volume_size_expanded_0) + wait_for_volume_expansion(client, volume_name) + volume = client.by_id_volume(volume_name) + assert volume.size == volume_size_expanded_0 + + # get the snapshot sizes. + user_snapshot_size = 0 + system_snapshot_size = 0 + snapshots = volume.snapshotList() + for snapshot in snapshots: + if snapshot.name == "volume-head": + continue + + if snapshot.usercreated: + user_snapshot_size = int(snapshot.size) + else: + system_snapshot_size = int(snapshot.size) + assert user_snapshot_size > 0 + assert system_snapshot_size > 0 + + # assert the metric values for the user snapshot. + user_snapshot_metric_labels = { + "volume": volume_name, + "user_created": "true", + } + check_metric_sum_on_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + user_snapshot_metric_labels, + user_snapshot_size) + + # assert the metric values for the system snapshot. + system_snapshot_metric_labels = { + "volume": volume_name, + "user_created": "false", + } + check_metric_sum_on_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + system_snapshot_metric_labels, + system_snapshot_size) + + # create 3 more user snapshots. 
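+    # Together with the earlier user snapshot this yields 4 user-created
+    # samples, while the system-created count stays at 1.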
+ create_snapshot(client, volume_name) + create_snapshot(client, volume_name) + create_snapshot(client, volume_name) + + wait_for_metric_count_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + user_snapshot_metric_labels, 4) + wait_for_metric_count_all_nodes(client, core_api, + "longhorn_snapshot_actual_size_bytes", + system_snapshot_metric_labels, 1) + + +def test_node_metrics(client, core_api): # NOQA + lht_hostId = get_self_host_id() + node = client.by_id_node(lht_hostId) + disks = node.disks + for _, disk in iter(disks.items()): + if disk.path == DEFAULT_DISK_PATH: + default_disk = disk + break + assert default_disk is not None + + metric_labels = {} + check_metric(core_api, "longhorn_node_count_total", + metric_labels, expected_value=3.0) + + metric_labels = { + "node": lht_hostId, + } + check_metric(core_api, "longhorn_node_cpu_capacity_millicpu", + metric_labels) + check_metric(core_api, "longhorn_node_cpu_usage_millicpu", + metric_labels) + check_metric(core_api, "longhorn_node_memory_capacity_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_memory_usage_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_storage_capacity_bytes", + metric_labels, default_disk.storageMaximum) + check_metric(core_api, "longhorn_node_storage_usage_bytes", + metric_labels) + check_metric(core_api, "longhorn_node_storage_reservation_bytes", + metric_labels, default_disk.storageReserved) + + # check longhorn_node_status by 4 different conditions + metric_labels = { + "condition": "mountpropagation", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "ready", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "allowScheduling", + "condition_reason": "", + "node": lht_hostId, + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + node = client.by_id_node(lht_hostId) + set_node_scheduling(client, node, allowScheduling=False, retry=True) + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 0.0) + + metric_labels = { + "condition": "schedulable", + "condition_reason": "", + "node": lht_hostId + } + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 1.0) + + metric_labels = { + "condition": "schedulable", + "condition_reason": "KubernetesNodeCordoned", + "node": lht_hostId + } + set_node_cordon(core_api, lht_hostId, True) + check_metric_with_condition(core_api, "longhorn_node_status", + metric_labels, 0.0) diff --git a/manager/integration/tests/test_migration.py b/manager/integration/tests/test_migration.py index d8dced2a0e..cea8ab15e9 100644 --- a/manager/integration/tests/test_migration.py +++ b/manager/integration/tests/test_migration.py @@ -404,10 +404,9 @@ def test_migration_with_restore_volume(core_api, # NOQA """ # Step 1 lht_host_id = get_self_host_id() - volume = create_and_check_volume(client, - volume_name, - REPLICA_COUNT, - SIZE) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=REPLICA_COUNT, + size=SIZE) attachment_id = common.generate_attachment_ticket_id() volume.attach(attachmentID=attachment_id, hostId=lht_host_id) diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 2e6af2282d..1e4ad5dd32 100644 --- a/manager/integration/tests/test_node.py 
+++ b/manager/integration/tests/test_node.py @@ -19,14 +19,14 @@ SETTING_DEFAULT_DATA_PATH, \ SETTING_CREATE_DEFAULT_DISK_LABELED_NODES, \ DEFAULT_STORAGE_OVER_PROVISIONING_PERCENTAGE, \ - SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE + SETTING_DISABLE_SCHEDULING_ON_CORDONED_NODE, \ + SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED from common import get_volume_endpoint from common import get_update_disks from common import wait_for_disk_status, wait_for_disk_update, \ wait_for_disk_conditions, wait_for_node_tag_update, \ cleanup_node_disks, wait_for_disk_storage_available, \ wait_for_disk_uuid, wait_for_node_schedulable_condition -from common import exec_nsenter from common import SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY from common import volume_name # NOQA @@ -1425,14 +1425,17 @@ def test_replica_datapath_cleanup(client): # NOQA # data path should exist now for data_path in data_paths: - assert exec_nsenter("ls {}".format(data_path)) + assert os.listdir(data_path) cleanup_volume_by_name(client, vol_name) # data path should be gone due to the cleanup of replica for data_path in data_paths: - with pytest.raises(subprocess.CalledProcessError): - exec_nsenter("ls {}".format(data_path)) + try: + os.listdir(data_path) + raise AssertionError(f"data path {data_path} should be gone") + except FileNotFoundError: + pass node = client.by_id_node(lht_hostId) disks = node.disks @@ -2626,6 +2629,14 @@ def test_disk_eviction_with_node_level_soft_anti_affinity_disabled(client, # NOQ replica_path = test_disk_path + '/replicas' assert os.path.isdir(replica_path) + # Since https://github.com/longhorn/longhorn-manager/pull/2138, the node + # controller is responsible for triggering replica eviction. If the timing + # of the node controller and node monitor are off, the node controller + # may take extra time to do so. Wait for evidence eviction is in progress + # before proceeding. + wait_for_volume_replica_count(client, volume.name, + volume.numberOfReplicas + 1) + for i in range(common.RETRY_COMMAND_COUNT): if len(os.listdir(replica_path)) > 0: break @@ -2668,3 +2679,146 @@ def finalizer(): common.cleanup_all_volumes(client) request.addfinalizer(finalizer) + +@pytest.mark.skip(reason="TODO") # NOQA +def test_drain_with_block_for_eviction_success(): + """ + Test drain completes after evicting replica with node-drain-policy + block-for-eviction + + 1. Set `node-drain-policy` to `block-for-eviction`. + 2. Create a volume. + 3. Ensure (through soft anti-affinity, low replica count, and/or enough + disks) that an evicted replica of the volume can be scheduled elsewhere. + 4. Write data to the volume. + 5. Drain a node one of the volume's replicas is scheduled to. + 6. While the drain is ongoing: + - Verify that the volume never becomes degraded. + - Verify that `node.status.autoEvicting == true`. + - Optionally verify that `replica.spec.evictionRequested == true`. + 7. Verify the drain completes. + 8. Uncordon the node. + 9. Verify the replica on the drained node has moved to a different one. + 10. Verify that `node.status.autoEvicting == false`. + 11. Verify that `replica.spec.evictionRequested == false`. + 12. Verify the volume's data. + """ + +@pytest.mark.skip(reason="TODO") # NOQA +def test_drain_with_block_for_eviction_if_contains_last_replica_success(): + """ + Test drain completes after evicting replicas with node-drain-policy + block-for-eviction-if-contains-last-replica + + 1. Set `node-drain-policy` to + `block-for-eviction-if-contains-last-replica`. + 2. 
Create one volume with a single replica and another volume with three + replicas. + 3. Ensure (through soft anti-affinity, low replica count, and/or enough + disks) that evicted replicas of both volumes can be scheduled elsewhere. + 4. Write data to the volumes. + 5. Drain a node both volumes have a replica scheduled to. + 6. While the drain is ongoing: + - Verify that the volume with one replica never becomes degraded. + - Verify that the volume with three replicas becomes degraded. + - Verify that `node.status.autoEvicting == true`. + - Optionally verify that `replica.spec.evictionRequested == true` on the + replica for the volume that only has one. + - Optionally verify that `replica.spec.evictionRequested == false` on + the replica for the volume that has three. + 7. Verify the drain completes. + 8. Uncordon the node. + 9. Verify the replica for the volume with one replica has moved to a + different node. + 10. Verify the replica for the volume with three replicas has not moved. + 11. Verify that `node.status.autoEvicting == false`. + 12. Verify that `replica.spec.evictionRequested == false` on all replicas. + 13. Verify the the data in both volumes. + """ + +@pytest.mark.skip(reason="TODO") # NOQA +def test_drain_with_block_for_eviction_failure(): + """ + Test drain never completes with node-drain-policy block-for-eviction + + 1. Set `node-drain-policy` to `block-for-eviction`. + 2. Create a volume. + 3. Ensure (through soft anti-affinity, high replica count, and/or not + enough disks) that an evicted replica of the volume cannot be scheduled + elsewhere. + 4. Write data to the volume. + 5. Drain a node one of the volume's replicas is scheduled to. + 6. While the drain is ongoing: + - Verify that `node.status.autoEvicting == true`. + - Verify that `replica.spec.evictionRequested == true`. + 7. Verify the drain never completes. + """ + +@pytest.mark.node # NOQA +def test_auto_detach_volume_when_node_is_cordoned(client, core_api, volume_name): # NOQA + """ + Test auto detach volume when node is cordoned + + 1. Set `detach-manually-attached-volumes-when-cordoned` to `false`. + 2. Create a volume and attached to the node through API (manually). + 3. Cordon the node. + 4. Set `detach-manually-attached-volumes-when-cordoned` to `true`. + 5. Volume will be detached automatically. 
+ """ + + # Set `Detach Manually Attached Volumes When Cordoned` to false + detach_manually_attached_volumes_when_cordoned = \ + client.by_id_setting( + SETTING_DETACH_MANUALLY_ATTACHED_VOLUMES_WHEN_CORDONED) + client.update(detach_manually_attached_volumes_when_cordoned, + value="false") + + # Create a volume + volume = client.create_volume(name=volume_name, + size=SIZE, + numberOfReplicas=3) + volume = common.wait_for_volume_detached(client, + volume_name) + assert volume.restoreRequired is False + + # Attach to the node + host_id = get_self_host_id() + volume.attach(hostId=host_id) + volume = common.wait_for_volume_healthy(client, volume_name) + assert volume.restoreRequired is False + + # Cordon the node + set_node_cordon(core_api, host_id, True) + + # Volume is still attached for a while + time.sleep(NODE_UPDATE_WAIT_INTERVAL) + volume = common.wait_for_volume_healthy(client, volume_name) + assert volume.restoreRequired is False + + # Set `Detach Manually Attached Volumes When Cordoned` to true + client.update(detach_manually_attached_volumes_when_cordoned, value="true") + + # Volume should be detached + volume = common.wait_for_volume_detached(client, volume_name) + assert volume.restoreRequired is False + + # Delete the Volume + client.delete(volume) + common.wait_for_volume_delete(client, volume_name) + + volumes = client.list_volume().data + assert len(volumes) == 0 + +@pytest.mark.skip(reason="TODO") # NOQA +def test_do_not_react_to_brief_kubelet_restart(): + """ + Test the node controller ignores Ready == False due to KubeletNotReady for + ten seconds before reacting. + + Repeat the following five times: + 1. Verify status.conditions[type == Ready] == True for the Longhorn node we + are running on. + 2. Kill the kubelet process (e.g. `pkill kubelet`). + 3. Verify status.conditions[type == Ready] != False for the Longhorn node + we are running on at any point for at least ten seconds. 
+ """ diff --git a/manager/integration/tests/test_orphan.py b/manager/integration/tests/test_orphan.py index 9ee0715704..9a951b990e 100644 --- a/manager/integration/tests/test_orphan.py +++ b/manager/integration/tests/test_orphan.py @@ -3,13 +3,13 @@ import time import random import string +import shutil from common import core_api, client # NOQA from common import Gi, SIZE from common import volume_name # NOQA from common import SETTING_ORPHAN_AUTO_DELETION from common import RETRY_COUNTS, RETRY_INTERVAL_LONG -from common import exec_nsenter from common import get_self_host_id from common import get_update_disks, wait_for_disk_update, cleanup_node_disks from common import create_and_check_volume, wait_for_volume_healthy @@ -18,6 +18,7 @@ from common import wait_for_node_update from common import wait_for_disk_status from common import update_node_disks +from common import exec_local def generate_random_id(num_bytes): @@ -60,7 +61,9 @@ def create_volume_with_replica_on_host(client, volume_name): # NOQA nodes = client.list_node() - volume = create_and_check_volume(client, volume_name, len(nodes), SIZE) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=len(nodes), + size=SIZE) volume.attach(hostId=lht_hostId, disableFrontend=False) wait_for_volume_healthy(client, volume_name) @@ -78,7 +81,7 @@ def create_orphaned_directories_on_host(volume, disk_paths, num_orphans): # NOQ replica_dir_name = volume.name + "-" + generate_random_id(8) path = os.path.join(disk_path, "replicas", replica_dir_name) paths.append(path) - exec_nsenter("cp -a {} {}".format(replica.dataPath, path)) + exec_local("cp -a {} {}".format(replica.dataPath, path)) return paths @@ -120,18 +123,16 @@ def wait_for_orphan_count(client, number, retry_counts=120): # NOQA def wait_for_file_count(path, number, retry_counts=120): for _ in range(retry_counts): - count = exec_nsenter("ls {} | wc -l".format(path)) - if int(count) == number: + if len(os.listdir(path)) == number: break time.sleep(RETRY_INTERVAL_LONG) - count = exec_nsenter("ls {} | wc -l".format(path)) - return int(count) + return len(os.listdir(path)) def delete_orphaned_directory_on_host(directories): # NOQA for path in directories: - exec_nsenter("rm -rf {}".format(path)) + exec_local("rm -rf {}".format(path)) def delete_extra_disks_on_host(client, disk_names): # NOQA @@ -190,26 +191,24 @@ def test_orphaned_dirs_with_wrong_naming_format(client, volume_name, request): # Create invalid orphaned directories. 
# 8-byte random id missing - exec_nsenter("mkdir -p {}".format(os.path.join(replica.diskPath, - "replicas", - volume_name))) + os.makedirs(os.path.join(replica.diskPath, "replicas", volume_name)) + # wrong random id length - exec_nsenter("mkdir -p {}".format( - os.path.join(replica.diskPath, - "replicas", - volume_name + "-" + generate_random_id(4)))) + os.makedirs(os.path.join(replica.diskPath, "replicas", + volume_name + "-" + generate_random_id(4))) + # volume.meta missing - path = os.path.join(replica.diskPath, - "replicas", + path = os.path.join(replica.diskPath, "replicas", volume_name + "-" + generate_random_id(8)) - exec_nsenter("cp -a {} {}; rm -f {}".format( - replica.dataPath, path, os.path.join(path, "volume.meta"))) + shutil.copytree(replica.dataPath, path) + os.remove(os.path.join(path, "volume.meta")) + # corrupted volume.meta - path = os.path.join(replica.diskPath, - "replicas", + path = os.path.join(replica.diskPath, "replicas", volume_name + "-" + generate_random_id(8)) - exec_nsenter("cp -a {} {}; echo xxx > {}".format( - replica.dataPath, path, os.path.join(path, "volume.meta"))) + shutil.copytree(replica.dataPath, path) + with open(os.path.join(path, "volume.meta"), 'w') as file: + file.write("xxx") # Step 5 cleanup_volume_by_name(client, volume_name) @@ -535,7 +534,7 @@ def test_orphaned_dirs_in_duplicated_disks(client, volume_name, request): # NOQ disks = node.disks disk_path = os.path.join(disk_paths[0], disk_names[1]) disk_paths.append(disk_path) - exec_nsenter("mkdir -p {}".format(disk_path)) + os.makedirs(disk_path) disk2 = {"path": disk_path, "allowScheduling": True} update_disk = get_update_disks(disks) diff --git a/manager/integration/tests/test_recurring_job.py b/manager/integration/tests/test_recurring_job.py index aeacf8c705..3e98353adb 100644 --- a/manager/integration/tests/test_recurring_job.py +++ b/manager/integration/tests/test_recurring_job.py @@ -17,6 +17,7 @@ from common import random_labels, volume_name # NOQA from common import storage_class, statefulset, pvc # NOQA from common import make_deployment_with_pvc # NOQA +from common import generate_volume_name from common import get_self_host_id @@ -68,6 +69,8 @@ from common import wait_for_cron_job_create from common import wait_for_cron_job_delete +from common import ACCESS_MODE_RWO +from common import ACCESS_MODE_RWX from common import JOB_LABEL from common import KUBERNETES_STATUS_LABEL from common import LONGHORN_NAMESPACE @@ -1991,9 +1994,9 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien back1 = BACKUP + "1" back2 = BACKUP + "2" group1 = "group01" - volume_name1 = "record-recurring-job" - rvolume_name1 = "restore-record-recurring-job-01" - rvolume_name2 = "restore-record-recurring-job-02" + volume_name1 = "record-recur" + "-" + generate_volume_name() + rvolume_name1 = "restore-01" + "-" + generate_volume_name() + rvolume_name2 = "restore-02" + "-" + generate_volume_name() recurring_jobs = { back1: { @@ -2048,8 +2051,8 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien complete_backup_1_count = 0 restore_snapshot_name = "" - volume = client.by_id_volume(volume_name1) wait_for_backup_completion(client, volume_name1) + volume = client.by_id_volume(volume_name1) for b in volume.backupStatus: if back1+"-" in b.snapshot: complete_backup_1_count += 1 @@ -2081,7 +2084,8 @@ def test_recurring_job_restored_from_backup_target(set_random_backupstore, clien @pytest.mark.recurring_job # NOQA -def test_recurring_job_filesystem_trim(client, 
core_api, batch_v1_api, volume_name, csi_pv, pvc, pod_make): # NOQA +@pytest.mark.parametrize("access_mode", [ACCESS_MODE_RWO, ACCESS_MODE_RWX]) # NOQA +def test_recurring_job_filesystem_trim(client, core_api, batch_v1_api, volume_name, csi_pv, pvc, pod_make, access_mode): # NOQA """ Scenario: test recurring job filesystem-trim @@ -2102,7 +2106,8 @@ def test_recurring_job_filesystem_trim(client, core_api, batch_v1_api, volume_na """ pod_name, _, _, _ = \ prepare_pod_with_data_in_mb(client, core_api, csi_pv, pvc, pod_make, - volume_name, data_size_in_mb=10) + volume_name, data_size_in_mb=10, + access_mode=access_mode) volume = client.by_id_volume(volume_name) diff --git a/manager/integration/tests/test_rwx.py b/manager/integration/tests/test_rwx.py index d01f840449..79ea321117 100644 --- a/manager/integration/tests/test_rwx.py +++ b/manager/integration/tests/test_rwx.py @@ -6,20 +6,24 @@ from common import create_and_wait_pod, read_volume_data from common import get_apps_api_client, wait_statefulset from common import create_and_wait_deployment, delete_and_wait_pod +from common import delete_and_wait_deployment +from common import delete_and_wait_pvc from common import prepare_pod_with_data_in_mb, DATA_SIZE_IN_MB_1 from common import create_snapshot, wait_for_backup_completion from common import find_backup, Gi, volume_name, csi_pv, pod_make # NOQA from common import wait_for_volume_creation, DATA_SIZE_IN_MB_3 from common import create_pv_for_volume, create_pvc_for_volume from common import DEFAULT_STATEFULSET_TIMEOUT, DEFAULT_STATEFULSET_INTERVAL +from common import wait_delete_pod, wait_for_pod_remount from common import get_core_api_client, write_pod_volume_random_data from common import create_pvc_spec, make_deployment_with_pvc # NOQA -from common import wait_for_pod_phase from common import core_api, statefulset, pvc, pod, client # NOQA from common import RETRY_COUNTS, RETRY_INTERVAL from common import EXPANDED_VOLUME_SIZE from common import expand_and_wait_for_pvc, wait_for_volume_expansion from common import wait_deployment_replica_ready, wait_for_volume_healthy +from common import crypto_secret, storage_class # NOQA +from common import create_crypto_secret, create_storage_class from backupstore import set_random_backupstore # NOQA from multiprocessing import Pool @@ -344,9 +348,10 @@ def test_rwx_delete_share_manager_pod(core_api, statefulset): # NOQA 2. Wait for StatefulSet to come up healthy. 3. Write data and compute md5sum. 4. Delete the share manager pod. - 5. Check the data md5sum in statefulSet. - 6. Write more data to it and compute md5sum. - 7. Check the data md5sum in share manager volume. + 5. Wait for a new pod to be created and volume getting attached. + 6. Check the data md5sum in statefulSet. + 7. Write more data to it and compute md5sum. + 8. Check the data md5sum in share manager volume. 
""" statefulset_name = 'statefulset-delete-share-manager-pods-test' @@ -377,8 +382,10 @@ def test_rwx_delete_share_manager_pod(core_api, statefulset): # NOQA delete_and_wait_pod(core_api, share_manager_name, namespace=LONGHORN_NAMESPACE) - wait_for_pod_phase(core_api, share_manager_name, - namespace=LONGHORN_NAMESPACE, pod_phase="Running") + target_pod = core_api.read_namespaced_pod(name=pod_name, + namespace='default') + wait_delete_pod(core_api, target_pod.metadata.uid) + wait_for_pod_remount(core_api, pod_name) test_data_2 = generate_random_data(VOLUME_RWTEST_SIZE) write_pod_volume_data(core_api, pod_name, test_data_2, filename='test2') @@ -518,7 +525,7 @@ def test_restore_rwo_volume_to_rwx(set_random_backupstore, client, core_api, vol @pytest.mark.skip(reason="TODO") -def test_rwx_onine_expansion(): # NOQA +def test_rwx_online_expansion(): # NOQA """ Related issue : https://github.com/longhorn/longhorn/issues/2181 @@ -634,3 +641,110 @@ def test_rwx_offline_expansion(client, core_api, pvc, make_deployment_with_pvc): pod_name, 'default') assert int(data_size_in_pod)/1024/1024 == data_size_in_mb + + +def test_encrypted_rwx_volume(core_api, statefulset, storage_class, crypto_secret, pvc, make_deployment_with_pvc): # NOQA + """ + Test creating encrypted rwx volume and use the secret in + non longhorn-system namespace. + + 1. Create crypto secret in non longhorn-system namespace. + 2. Create a storage class. + 3. Create a deployment with a PVC and the pods should be able to running. + """ + + namespace = 'default' + # Create crypto secret + secret = crypto_secret(namespace) + create_crypto_secret(secret, namespace) + + # Create storage class + storage_class['reclaimPolicy'] = 'Delete' + storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/provisioner-secret-namespace'] = namespace # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-publish-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-publish-secret-namespace'] = namespace # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-stage-secret-name'] = 'longhorn-crypto' # NOQA + storage_class['parameters']['csi.storage.k8s.io/node-stage-secret-namespace'] = namespace # NOQA + create_storage_class(storage_class) + + # Create deployment with PVC + pvc_name = 'pvc-deployment-with-encrypted-rwx-volume' + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['accessModes'] = ['ReadWriteMany'] + + core_api.create_namespaced_persistent_volume_claim( + body=pvc, namespace='default') + + deployment = make_deployment_with_pvc( + 'pvc-deployment-with-encrypted-rwx-volume', pvc_name, replicas=3) + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + # Clean up deployment and volume + delete_and_wait_deployment(apps_api, deployment["metadata"]["name"]) + delete_and_wait_pvc(core_api, pvc_name) + + +def test_rwx_volume_mount_options(core_api, storage_class, pvc, make_deployment_with_pvc): # NOQA + """ + Test creating rwx volume with custom mount options + non longhorn-system namespace. + + 1. Create a storage class with nfsOptions parameter. + 2. Create a deployment with a PVC and the pods should be able to run. + 3. Check the mounts on the deployment pods. 
+ """ + + # Create storage class + storage_class['reclaimPolicy'] = 'Delete' + storage_class['parameters']['nfsOptions'] = 'vers=4.2,soft,noresvport,timeo=600,retrans=4' # NOQA + create_storage_class(storage_class) + + # Create deployment with PVC + pvc_name = 'pvc-deployment-with-custom-mount-options-volume' + pvc['metadata']['name'] = pvc_name + pvc['spec']['storageClassName'] = storage_class['metadata']['name'] + pvc['spec']['accessModes'] = ['ReadWriteMany'] + + core_api.create_namespaced_persistent_volume_claim( + body=pvc, namespace='default') + + deployment = make_deployment_with_pvc( + 'deployment-with-custom-mount-options-volume', pvc_name, replicas=2) + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + # Check mount options on deployment pods + deployment_label_selector = "name=" + \ + deployment["metadata"]["labels"]["name"] + + deployment_pod_list = \ + core_api.list_namespaced_pod(namespace="default", + label_selector=deployment_label_selector) + + assert deployment_pod_list.items.__len__() == 2 + + pod_name_1 = deployment_pod_list.items[0].metadata.name + pod_name_2 = deployment_pod_list.items[1].metadata.name + + command = "cat /proc/mounts | grep 'nfs'" + mount_options_1 = exec_command_in_pod(core_api, command, + pod_name_1, + 'default') + mount_options_2 = exec_command_in_pod(core_api, command, + pod_name_2, + 'default') + + # print(f'mount_options_1={mount_options_1}') + # print(f'mount_options_2={mount_options_2}') + + assert "vers=4.2" in mount_options_1 + assert "vers=4.2" in mount_options_2 + + # Clean up deployment and volume + delete_and_wait_deployment(apps_api, deployment["metadata"]["name"]) + delete_and_wait_pvc(core_api, pvc_name) diff --git a/manager/integration/tests/test_scheduling.py b/manager/integration/tests/test_scheduling.py index 5b801b703f..2c164cad38 100644 --- a/manager/integration/tests/test_scheduling.py +++ b/manager/integration/tests/test_scheduling.py @@ -55,6 +55,12 @@ from common import wait_for_replica_running from common import crash_engine_process_with_sigkill +from common import set_node_tags +from common import wait_for_node_tag_update +from common import wait_for_volume_condition_scheduled +from common import cleanup_host_disks +from common import wait_for_volume_delete +from common import wait_for_disk_update from common import Mi, Gi from common import DATA_SIZE_IN_MB_2 @@ -70,6 +76,11 @@ from common import update_setting, delete_replica_on_test_node from common import VOLUME_FRONTEND_BLOCKDEV, SNAPSHOT_DATA_INTEGRITY_IGNORED from common import VOLUME_ROBUSTNESS_DEGRADED, RETRY_COUNTS_SHORT +from common import SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME +from common import SIZE, CONDITION_STATUS_FALSE, CONDITION_STATUS_TRUE +from common import SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY +from common import SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY +from common import SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME from time import sleep @@ -1082,8 +1093,7 @@ def test_data_locality_basic(client, core_api, volume_name, pod, settings_reset) pod1['metadata']['name'] = pod1_name - volume1 = create_and_check_volume(client, - volume1_name, + volume1 = create_and_check_volume(client, volume1_name, num_of_replicas=1, size=volume1_size) @@ -1724,8 +1734,118 @@ def finalizer(): wait_for_statefulset_pods_healthy(statefulset) -@pytest.mark.skip(reason="TODO") -def test_global_disk_soft_anti_affinity(): # NOQA +def test_allow_empty_node_selector_volume_setting(client, volume_name): # NOQA + """ + Test the global setting 
allow-empty-node-selector-volume + + If true, a replica of the volume without node selector + can be scheduled on node with tags. + + If false, a replica of the volume without node selector + can not be scheduled on node with tags. + + Setup + - Prepare 3 nodes + - Add `AVAIL` tag to nodes + - Set allow-empty-node-selector-volume to `false` + + When + - Create a Volume with 3 replicas without tag + + Then + - All replicas can not be scheduled to the nodes + + When + - Remove `AVAIL` tag from one of the node + - Set allow-empty-node-selector-volume to `true` + + Then + - Wait for a while for controller to resync the volume, + all replicas can be scheduled to the nodes + """ + # Setup + node_tag = ["AVAIL"] + for node in client.list_node(): + set_node_tags(client, node, tags=node_tag, retry=True) + wait_for_node_tag_update(client, node.name, node_tag) + + update_setting(client, SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME, "false") + + # Check volume can not be scehduled + client.create_volume(name=volume_name, size=SIZE) + volume = wait_for_volume_detached(client, volume_name) + + volume = client.by_id_volume(volume.name) + volume = wait_for_volume_condition_scheduled(client, volume_name, + "status", + CONDITION_STATUS_FALSE) + + # Rremove tag from 1 node and set setting allow-empty-node-selector-volume + # to true + node = client.by_id_node(get_self_host_id()) + set_node_tags(client, node, tags=[], retry=True) + update_setting(client, SETTING_ALLOW_EMPTY_NODE_SELECTOR_VOLUME, "true") + + # Volume can be schedule + volume = wait_for_volume_condition_scheduled(client, volume_name, "status", + CONDITION_STATUS_TRUE) + assert volume.ready + + # All replicas schedule to nodes + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) + + +def prepare_for_affinity_tests(client, volume_name, request): # NOQA + """ + For 'test_global_disk_soft_anti_affinity' and + 'test_volume_disk_soft_anti_affinity' use, they have identical + the same preparation steps as below: + + Given + - One node has three disks + - The three disks have very different sizes + - Only two disks are available for scheduling + - No other node is available for scheduling + """ + def finalizer(): + volume = client.by_id_volume(volume_name) + volume.detach(hostId=lht_hostId) + wait_for_volume_detached(client, volume_name) + client.delete(volume) + wait_for_volume_delete(client, volume.name) + cleanup_host_disks(client, 'vol-disk-1', 'vol-disk-2') + request.addfinalizer(finalizer) + + # Preparation + lht_hostId = get_self_host_id() + node = client.by_id_node(lht_hostId) + disks = node.disks + disk_path1 = create_host_disk(client, 'vol-disk-1', + str(2 * Gi), lht_hostId) + disk1 = {"path": disk_path1, "allowScheduling": True} + disk_path2 = create_host_disk(client, 'vol-disk-2', + str(4 * Gi), lht_hostId) + disk2 = {"path": disk_path2, "allowScheduling": False} + + update_disk = get_update_disks(disks) + update_disk["disk1"] = disk1 + update_disk["disk2"] = disk2 + + node = update_node_disks(client, node.name, disks=update_disk, retry=True) + node = wait_for_disk_update(client, lht_hostId, len(update_disk)) + assert len(node.disks) == len(update_disk) + + # Make only current node schedulable + nodes = client.list_node() + for node in nodes: + if node.id != lht_hostId: + set_node_scheduling(client, node, allowScheduling=False) + + return disk_path1, disk_path2 + + +def test_global_disk_soft_anti_affinity(client, volume_name, request): # NOQA """ 1. 
When Replica Disk Soft Anti-Affinity is false, it should be impossible to schedule replicas to the same disk. @@ -1771,11 +1891,131 @@ def test_global_disk_soft_anti_affinity(): # NOQA - Verify all three replicas are healthy - Verify all three replicas have a different spec.diskID """ - pass + # Preparation + disk_path1, disk_path2 = prepare_for_affinity_tests(client, + volume_name, + request) + + # Test start + update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY, "false") + + lht_hostId = get_self_host_id() + client.create_volume(name=volume_name, size=str(500*Mi)) + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_degraded(client, volume_name) + + num_running = 0 + for replica in volume.replicas: + if replica.running: + num_running += 1 + else: + assert replica.hostId == "" + + assert num_running == 2 + + # After enable SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY to true, + # replicas can schedule on the same disk, threrefore volume become healthy + update_setting(client, SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY, "true") + + volume = wait_for_volume_healthy(client, volume_name) + + node = client.by_id_node(lht_hostId) + disks = node.disks + for fsid, disk in iter(disks.items()): + if disk.path == disk_path2: + disk.allowScheduling = True + + # Enable disk2 + update_disks = get_update_disks(disks) + update_node_disks(client, node.name, disks=update_disks, retry=True) + + # Delete one of the two replicas with the same diskID + disk_id = [] + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + else: + volume.replicaRemove(name=replica.name) + + volume = wait_for_volume_degraded(client, volume_name) + volume = wait_for_volume_healthy(client, volume_name) + + # Replcas should located on 3 different disks on current node + disk_id.clear() + for replica in volume.replicas: + assert replica.diskID not in disk_id + disk_id.append(replica.diskID) + + +def test_allow_empty_disk_selector_volume_setting(client, volume_name): # NOQA + """ + Test the global setting allow-empty-disk-selector-volume + + If true, a replica of the volume without disk selector + can be scheduled on disk with tags. + + If false, a replica of the volume without disk selector + can not be scheduled on disk with tags. 
+def test_allow_empty_disk_selector_volume_setting(client, volume_name): # NOQA + """ + Test the global setting allow-empty-disk-selector-volume + + If true, a replica of the volume without disk selector + can be scheduled on disks with tags. + + If false, a replica of the volume without disk selector + cannot be scheduled on disks with tags. + + Setup + - Prepare 3 nodes each with one disk + - Add `AVAIL` tag to every disk + - Set allow-empty-disk-selector-volume to `false` + + When + - Create a Volume with 3 replicas without tag + + Then + - All replicas cannot be scheduled to the disks on the nodes + + When + - Remove `AVAIL` tag from one of the nodes + - Set allow-empty-disk-selector-volume to `true` + + Then + - Wait for a while for the controller to resync the volume, + all replicas can be scheduled to the disks on the nodes + """ + # Preparation + nodes = client.list_node() + for node in nodes: + disks = get_update_disks(node.disks) + disks[list(disks)[0]].tags = ["AVAIL"] + update_node_disks(client, node.name, disks=disks) + + update_setting(client, SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME, "false") + + # Check volume cannot be scheduled + client.create_volume(name=volume_name, size=SIZE) + volume = wait_for_volume_detached(client, volume_name) + + volume = client.by_id_volume(volume.name) + volume = wait_for_volume_condition_scheduled(client, volume_name, + "status", + CONDITION_STATUS_FALSE) + + # Remove tag from current node + host_id = get_self_host_id() + node = client.by_id_node(host_id) + disks = get_update_disks(node.disks) + disks[list(disks)[0]].tags = [] + update_node_disks(client, node.name, disks=disks) + update_setting(client, SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME, "true") + + # Volume can be scheduled + volume = wait_for_volume_condition_scheduled(client, volume_name, "status", + CONDITION_STATUS_TRUE) + assert volume.ready + # All replicas can be scheduled to disks on nodes + volume.attach(hostId=host_id) + volume = wait_for_volume_healthy(client, volume_name)
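Reviewer note: the test above toggles the tag list on the first disk of each node's disk map before calling update_node_disks. A self-contained sketch of that update pattern, with a plain dict standing in for the structure returned by get_update_disks():

def set_first_disk_tags(disks, tags):
    # Return a copy of the disk map with the first disk's tags replaced.
    updated = dict(disks)
    first_disk = next(iter(updated))
    updated[first_disk] = {**updated[first_disk], "tags": list(tags)}
    return updated

disks = {"default-disk": {"path": "/var/lib/longhorn", "tags": ["AVAIL"]}}
print(set_first_disk_tags(disks, []))  # clears the AVAIL tag, as in the test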
-@pytest.mark.skip(reason="TODO") -def test_volume_disk_soft_anti_affinity(): # NOQA + +def test_volume_disk_soft_anti_affinity(client, volume_name, request): # NOQA """ 1. When Replica Disk Soft Anti-Affinity is disabled, it should be impossible to schedule replicas to the same disk. @@ -1818,6 +2058,73 @@ def test_volume_disk_soft_anti_affinity(): # NOQA Then - Verify the volume is in a healthy state - Verify all three replicas are healthy - - Verify all three replicas have a different `replica.HostID` + - Verify all three replicas have a different diskID """ - pass + # Preparation + disk_path1, disk_path2 = prepare_for_affinity_tests(client, + volume_name, + request) + + # Test start + update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + update_setting(client, SETTING_REPLICA_ZONE_SOFT_ANTI_AFFINITY, "true") + + lht_hostId = get_self_host_id() + client.create_volume(name=volume_name, size=str(500*Mi), + replicaDiskSoftAntiAffinity="disabled") + volume = wait_for_volume_detached(client, volume_name) + assert volume.replicaDiskSoftAntiAffinity == "disabled" + + volume.attach(hostId=lht_hostId) + volume = wait_for_volume_degraded(client, volume_name) + + num_running = 0 + for replica in volume.replicas: + if replica.running: + num_running += 1 + else: + assert replica.hostId == "" + + assert num_running == 2 + + # After updating replicaDiskSoftAntiAffinity to enabled, + # replicas can be scheduled on the same disk, therefore the volume becomes healthy + volume = volume.updateReplicaDiskSoftAntiAffinity( + replicaDiskSoftAntiAffinity="enabled") + assert volume.replicaDiskSoftAntiAffinity == "enabled" + + volume = wait_for_volume_healthy(client, volume_name) + + disk_id = [] + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + + assert len(disk_id) == 2 + + node = client.by_id_node(lht_hostId) + disks = node.disks + for fsid, disk in iter(disks.items()): + if disk.path == disk_path2: + disk.allowScheduling = True + + # Enable disk2 + update_disks = get_update_disks(disks) + update_node_disks(client, node.name, disks=update_disks, retry=True) + + # Delete one of the two replicas with the same diskID + disk_id.clear() + for replica in volume.replicas: + if replica.diskID not in disk_id: + disk_id.append(replica.diskID) + else: + volume.replicaRemove(name=replica.name) + + volume = wait_for_volume_degraded(client, volume_name) + volume = wait_for_volume_healthy(client, volume_name) + + # Replicas should be located on 3 different disks on the current node + disk_id.clear() + for replica in volume.replicas: + assert replica.diskID not in disk_id + disk_id.append(replica.diskID) diff --git a/manager/integration/tests/test_settings.py b/manager/integration/tests/test_settings.py index ce54a31b46..1f025b2fae 100644 --- a/manager/integration/tests/test_settings.py +++ b/manager/integration/tests/test_settings.py @@ -22,7 +22,7 @@ get_engine_image_status_value, create_volume, create_volume_and_backup, cleanup_volume_by_name, wait_for_volume_restoration_completed, wait_for_backup_restore_completed, - get_engine_host_id, + get_engine_host_id, wait_for_instance_manager_count, Gi, Mi, LONGHORN_NAMESPACE, @@ -32,6 +32,7 @@ SETTING_DEFAULT_REPLICA_COUNT, SETTING_BACKUP_TARGET, SETTING_CONCURRENT_VOLUME_BACKUP_RESTORE, + SETTING_V1_DATA_ENGINE, RETRY_COUNTS, RETRY_INTERVAL, RETRY_INTERVAL_LONG, update_setting, BACKING_IMAGE_QCOW2_URL, BACKING_IMAGE_NAME, create_backing_image_with_matching_url, BACKING_IMAGE_EXT4_SIZE, @@ -105,8 +106,7 @@ def test_setting_toleration(): 2. Verify the request fails. 3. Create a volume and attach it. 4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". - 5. Verify that cannot update toleration setting when any volume is - attached. + 5.
Verify that can update toleration setting when any volume is attached. 6. Generate and write `data1` into the volume. 7. Detach the volume. 8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute". @@ -155,10 +155,8 @@ def test_setting_toleration(): "effect": "NoExecute" }, ] - with pytest.raises(Exception) as e: - client.update(setting, value=setting_value_str) - assert 'cannot modify toleration setting before all volumes are detached' \ - in str(e.value) + setting = client.update(setting, value=setting_value_str) + assert setting.value == setting_value_str data1 = write_volume_random_data(volume) check_volume_data(volume, data1) @@ -166,8 +164,6 @@ def test_setting_toleration(): volume.detach() wait_for_volume_detached(client, volume_name) - setting = client.update(setting, value=setting_value_str) - assert setting.value == setting_value_str wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts) client, node = wait_for_longhorn_node_ready() @@ -493,8 +489,8 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla for the Setting. 2. Create a new Priority Class in Kubernetes. 3. Create and attach a Volume. - 4. Verify that the Priority Class Setting cannot be updated with an - attached Volume. + 4. Verify that the Priority Class Setting can be updated with an attached + volume. 5. Generate and write `data1`. 6. Detach the Volume. 7. Update the Priority Class Setting to the new Priority Class. @@ -528,10 +524,8 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla volume.attach(hostId=get_self_host_id()) volume = wait_for_volume_healthy(client, volume_name) - with pytest.raises(Exception) as e: - client.update(setting, value=name) - assert 'cannot modify priority class setting before all volumes are ' \ - 'detached' in str(e.value) + setting = client.update(setting, value=name) + assert setting.value == name data1 = write_volume_random_data(volume) check_volume_data(volume, data1) @@ -539,9 +533,6 @@ def test_setting_priority_class(core_api, apps_api, scheduling_api, priority_cla volume.detach() wait_for_volume_detached(client, volume_name) - setting = client.update(setting, value=name) - assert setting.value == name - wait_for_priority_class_update(core_api, apps_api, count, priority_class) client, node = wait_for_longhorn_node_ready() @@ -657,9 +648,10 @@ def test_setting_backing_image_auto_cleanup(client, core_api, volume_name): # N ] for volume_name in volume_names: - create_and_check_volume( - client, volume_name, 3, str(BACKING_IMAGE_EXT4_SIZE), - BACKING_IMAGE_NAME) + create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(BACKING_IMAGE_EXT4_SIZE), + backing_image=BACKING_IMAGE_NAME) # Step 4 lht_host_id = get_self_host_id() @@ -941,7 +933,7 @@ def setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(cli str(concurrent_limit)) _, backup = create_volume_and_backup(client, volname + "-with-backup", - 500 * Mi, 300 * Mi) + 1000 * Mi, 600 * Mi) nodes = client.list_node() restore_volume_names = [] @@ -1238,6 +1230,66 @@ def test_setting_update_with_invalid_value_via_configmap(core_api, request): # [SETTING_BACKUP_TARGET, SETTING_TAINT_TOLERATION], [target, - ""]) + "key1=value1:NoSchedule"]) cleanup_volume_by_name(client, vol_name) + + +def test_setting_v1_data_engine(client, request): # NOQA + """ + Test that the v1 data engine setting works correctly. + 1. Create a volume and attach it. + 2. Set v1 data engine setting to false. 
The setting should be rejected. + 3. Detach the volume. + 4. Set v1 data engine setting to false again. The setting should be + accepted. Then, attach the volume. The volume is unable to attach. + 5. set v1 data engine setting to true. The setting should be accepted. + 6. Attach the volume. + """ + + setting = client.by_id_setting(SETTING_V1_DATA_ENGINE) + + # Step 1 + volume_name = "test-v1-vol" # NOQA + volume = create_and_check_volume(client, volume_name) + + def finalizer(): + cleanup_volume(client, volume) + client.update(setting, value="true") + + request.addfinalizer(finalizer) + + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) + + # Step 2 + with pytest.raises(Exception) as e: + client.update(setting, value="false") + assert 'cannot apply v1-data-engine setting to Longhorn workloads when ' \ + 'there are attached v1 volumes' in str(e.value) + + # Step 3 + volume.detach() + wait_for_volume_detached(client, volume_name) + + # Step 4 + setting = client.by_id_setting(SETTING_V1_DATA_ENGINE) + client.update(setting, value="false") + + count = wait_for_instance_manager_count(client, 0) + assert count == 0 + + volume.attach(hostId=get_self_host_id()) + with pytest.raises(Exception) as e: + wait_for_volume_healthy(client, volume_name) + assert 'volume[key]=detached' in str(e.value) + + # Step 5 + client.update(setting, value="true") + nodes = client.list_node() + count = wait_for_instance_manager_count(client, len(nodes)) + assert count == len(nodes) + + # Step 6 + volume.attach(hostId=get_self_host_id()) + volume = wait_for_volume_healthy(client, volume_name) diff --git a/manager/integration/tests/test_snapshot.py b/manager/integration/tests/test_snapshot.py index 4d4a70edb5..da8c7dc8ce 100644 --- a/manager/integration/tests/test_snapshot.py +++ b/manager/integration/tests/test_snapshot.py @@ -321,7 +321,9 @@ def detect_and_repair_corrupted_replica(client, volume_name, data_integrity_mode """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, size=str(2 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(2 * Gi)) lht_hostId = get_self_host_id() volume = volume.attach(hostId=lht_hostId) @@ -572,7 +574,8 @@ def check_hashed_and_with_immediate_hash(client, volume_name, snapshot_data_inte """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(volume_size * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -604,7 +607,8 @@ def check_hashed_and_without_immediate_hash(client, volume_name, snapshot_data_i """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(16 * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -644,7 +648,8 @@ def check_per_volume_hash_disable(client, volume_name, snapshot_data_integrity): """ # Step 1 - volume = create_and_check_volume(client, volume_name, 3, + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, size=str(16 * Mi), snapshot_data_integrity=snapshot_data_integrity) # NOQA @@ -694,7 +699,9 @@ def test_snapshot_cr(client, volume_name, settings_reset): # NOQA client.update(setting, value="true") lht_hostId = get_self_host_id() - volume = create_and_check_volume(client, volume_name, 3, size=str(1 * Gi)) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=3, + size=str(1 * Gi)) volume 
= volume.attach(hostId=lht_hostId) wait_for_volume_healthy(client, volume_name) volume = client.by_id_volume(volume_name) diff --git a/manager/integration/tests/test_upgrade.py b/manager/integration/tests/test_upgrade.py index 428d41c7b2..249af8caf5 100644 --- a/manager/integration/tests/test_upgrade.py +++ b/manager/integration/tests/test_upgrade.py @@ -205,6 +205,12 @@ def longhorn_upgrade(longhorn_install_method, longhorn_repo_url, longhorn_repo_branch], shell=False) + elif longhorn_install_method == "fleet": + command = "./pipelines/fleet/scripts/upgrade-longhorn.sh" + process = subprocess.Popen([command, + longhorn_repo_url, + longhorn_repo_branch], + shell=False) process.wait() if process.returncode == 0: @@ -343,11 +349,9 @@ def test_upgrade(longhorn_upgrade_type, set_backupstore_nfs(client) mount_nfs_backupstore(client) backup_vol_name = "backup-vol" - backup_vol = create_and_check_volume( - client, - backup_vol_name, - 2, - str(DEFAULT_VOLUME_SIZE * Gi)) + backup_vol = create_and_check_volume(client, backup_vol_name, + num_of_replicas=2, + size=str(DEFAULT_VOLUME_SIZE * Gi)) backup_vol.attach(hostId=host_id) backup_vol = wait_for_volume_healthy(client, backup_vol_name) data0 = {'pos': 0, 'len': BACKUP_BLOCK_SIZE, diff --git a/manager/integration/tests/test_zone.py b/manager/integration/tests/test_zone.py index 6fcc66f5ff..5965a7bf39 100644 --- a/manager/integration/tests/test_zone.py +++ b/manager/integration/tests/test_zone.py @@ -8,8 +8,12 @@ from common import pvc, pod # NOQA from common import volume_name # NOQA +from common import cleanup_node_disks from common import get_self_host_id +from common import get_update_disks +from common import update_node_disks + from common import create_and_wait_pod from common import create_pv_for_volume from common import create_pvc_for_volume @@ -141,7 +145,8 @@ def test_zone_tags(client, core_api, volume_name, k8s_node_zone_tags): # NOQA wait_longhorn_node_zone_updated(client) - volume = create_and_check_volume(client, volume_name, num_of_replicas=2) + volume = create_and_check_volume(client, volume_name, + num_of_replicas=2) host_id = get_self_host_id() @@ -503,6 +508,166 @@ def test_replica_auto_balance_zone_best_effort(client, core_api, volume_name): assert z3_r_count == 2 +def test_replica_auto_balance_when_disabled_disk_scheduling_in_zone(client, core_api, volume_name): # NOQA + """ + Scenario: replica auto-balance when disk scheduling is disabled on nodes + in a zone. + + Issue: https://github.com/longhorn/longhorn/issues/6508 + + Given `replica-soft-anti-affinity` setting is `true`. + And node-1 is in zone-1. + node-2 is in zone-2. + node-3 is in zone-3. + And disk scheduling is disabled on node-3. + And create a volume with 3 replicas. + And attach the volume to test pod node. + And 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + + When set `replica-auto-balance` to `best-effort`. + + Then 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + And replica count remains stable across zones and nodes. + """ + # Set `replica-soft-anti-affinity` to `true`. 
+ update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + + # Assign nodes to respective zones + node1, node2, node3 = client.list_node() + set_k8s_node_zone_label(core_api, node1.name, ZONE1) + set_k8s_node_zone_label(core_api, node2.name, ZONE2) + set_k8s_node_zone_label(core_api, node3.name, ZONE3) + wait_longhorn_node_zone_updated(client) + + # Disable disk scheduling on node 3 + cleanup_node_disks(client, node3.name) + + # Create a volume with 3 replicas + num_of_replicas = 3 + volume = client.create_volume(name=volume_name, + numberOfReplicas=num_of_replicas) + + # Wait for the volume to detach and attach it to the test pod node + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=get_self_host_id()) + + # Define a function to assert replica count + def assert_replica_count(is_stable=False): + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + zone3_replica_count = get_zone_replica_count( + client, volume_name, ZONE3, chk_running=True) + assert zone3_replica_count == 0 + + total_replica_count = \ + get_zone_replica_count( + client, volume_name, ZONE1, chk_running=True) + \ + get_zone_replica_count( + client, volume_name, ZONE2, chk_running=True) + + if is_stable: + assert total_replica_count == num_of_replicas + elif total_replica_count == num_of_replicas: + break + + assert total_replica_count == 3 + + # Perform the initial assertion to ensure the replica count is as expected + assert_replica_count() + + # Update the replica-auto-balance setting to `best-effort` + update_setting(client, SETTING_REPLICA_AUTO_BALANCE, "best-effort") + + # Perform the final assertion to ensure the replica count is as expected, + # and stable after the setting update + assert_replica_count(is_stable=True) + + +def test_replica_auto_balance_when_no_storage_available_in_zone(client, core_api, volume_name): # NOQA + """ + Scenario: replica auto-balance when there is no storage available on nodes + in a zone. + + Issue: https://github.com/longhorn/longhorn/issues/6671 + + Given `replica-soft-anti-affinity` setting is `true`. + And node-1 is in zone-1. + node-2 is in zone-2. + node-3 is in zone-3. + And fill up the storage on node-3. + And create a volume with 3 replicas. + And attach the volume to test pod node. + And 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + + When set `replica-auto-balance` to `best-effort`. + + Then 3 replicas running in zone-1 and zone-2. + 0 replicas running in zone-3. + And replica count remains stable across zones and nodes. + """ + # Set `replica-soft-anti-affinity` to `true`. 
+ update_setting(client, SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY, "true") + + # Assign nodes to respective zones + node1, node2, node3 = client.list_node() + set_k8s_node_zone_label(core_api, node1.name, ZONE1) + set_k8s_node_zone_label(core_api, node2.name, ZONE2) + set_k8s_node_zone_label(core_api, node3.name, ZONE3) + wait_longhorn_node_zone_updated(client) + + # Fill up the storage on node 3 + for _, disk in node3.disks.items(): + disk.storageReserved = disk.storageMaximum + + update_disks = get_update_disks(node3.disks) + update_node_disks(client, node3.name, disks=update_disks, retry=True) + + # Create a volume with 3 replicas + num_of_replicas = 3 + volume = client.create_volume(name=volume_name, + numberOfReplicas=num_of_replicas) + + # Wait for the volume to detach and attach it to the test pod node + volume = wait_for_volume_detached(client, volume_name) + volume.attach(hostId=get_self_host_id()) + + # Define a function to assert replica count + def assert_replica_count(is_stable=False): + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + zone3_replica_count = get_zone_replica_count( + client, volume_name, ZONE3, chk_running=True) + assert zone3_replica_count == 0 + + total_replica_count = \ + get_zone_replica_count( + client, volume_name, ZONE1, chk_running=True) + \ + get_zone_replica_count( + client, volume_name, ZONE2, chk_running=True) + + if is_stable: + assert total_replica_count == num_of_replicas + elif total_replica_count == num_of_replicas: + break + + assert total_replica_count == 3 + + # Perform the initial assertion to ensure the replica count is as expected + assert_replica_count() + + # Update the replica-auto-balance setting to `best-effort` + update_setting(client, SETTING_REPLICA_AUTO_BALANCE, "best-effort") + + # Perform the final assertion to ensure the replica count is as expected, + # and stable after the setting update + assert_replica_count(is_stable=True) + + def test_replica_auto_balance_when_replica_on_unschedulable_node(client, core_api, volume_name, request): # NOQA """ Scenario: replica auto-balance when replica already running on diff --git a/mirror_csi_images/scripts/publish.sh b/mirror_csi_images/scripts/publish.sh index 5f3cb322f3..4d918fb099 100755 --- a/mirror_csi_images/scripts/publish.sh +++ b/mirror_csi_images/scripts/publish.sh @@ -13,13 +13,21 @@ if [[ -n "${LONGHORN_IMAGES_FILE_URL}" ]]; then CSI_IMAGE=$(echo "${LINE}" | sed -e "s/longhornio\///g") IFS=: read -ra IMAGE_TAG_PAIR <<< "${CSI_IMAGE}" echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + elif [[ "${LINE}" =~ "support-bundle-kit" ]]; then + SUPPORT_BUNDLE_KIT_IMAGE=$(echo "${LINE}" | sed -e "s/longhornio\///g") + IFS=: read -ra IMAGE_TAG_PAIR <<< "${SUPPORT_BUNDLE_KIT_IMAGE}" + echo "rancher/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" fi done < "${LONGHORN_IMAGES_FILE}" else IFS=, read -ra CSI_IMAGES_ARR <<< "${CSI_IMAGES}" for CSI_IMAGE in "${CSI_IMAGES_ARR[@]}"; do IFS=: read -ra IMAGE_TAG_PAIR <<< "$CSI_IMAGE" - echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + if [[ "${CSI_IMAGE}" =~ "csi-" ]]; then + echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + elif [[ "${CSI_IMAGE}" =~ "support-bundle-kit" ]]; then + echo "rancher/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" 
"${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" + fi done fi diff --git a/pipelines/e2e/Dockerfile.setup b/pipelines/e2e/Dockerfile.setup index 4b79eafa60..5fd0be7b8c 100644 --- a/pipelines/e2e/Dockerfile.setup +++ b/pipelines/e2e/Dockerfile.setup @@ -25,7 +25,7 @@ RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_V wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip gcc python3-dev libc-dev py3-virtualenv && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ chmod 700 get_helm.sh && \ diff --git a/pipelines/fleet/Dockerfile.setup b/pipelines/fleet/Dockerfile.setup new file mode 100644 index 0000000000..18deaed96b --- /dev/null +++ b/pipelines/fleet/Dockerfile.setup @@ -0,0 +1,34 @@ +From alpine:latest + +ARG KUBECTL_VERSION=v1.20.2 + +ARG RKE_VERSION=v1.3.4 + +ARG TERRAFORM_VERSION=1.3.5 + +ARG YQ_VERSION=v4.24.2 + +ENV WORKSPACE /src/longhorn-tests + +WORKDIR $WORKSPACE + +RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl && \ + mv kubectl /usr/local/bin/kubectl && \ + chmod +x /usr/local/bin/kubectl && \ + wget -q https://github.com/rancher/rke/releases/download/$RKE_VERSION/rke_linux-amd64 && \ + mv rke_linux-amd64 /usr/bin/rke && \ + chmod +x /usr/bin/rke && \ + wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip && rm terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + mv terraform /usr/bin/terraform && \ + chmod +x /usr/bin/terraform && \ + wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ + mv yq_linux_amd64 /usr/local/bin/yq && \ + chmod +x /usr/local/bin/yq && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip && \ + ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ + chmod 700 get_helm.sh && \ + ./get_helm.sh + +COPY [".", "$WORKSPACE"] diff --git a/pipelines/fleet/Jenkinsfile b/pipelines/fleet/Jenkinsfile new file mode 100644 index 0000000000..d2fdf43a09 --- /dev/null +++ b/pipelines/fleet/Jenkinsfile @@ -0,0 +1,163 @@ +def imageName = "${JOB_BASE_NAME}-${env.BUILD_NUMBER}" +def summary +def WORKSPACE = "/src/longhorn-tests" +def BUILD_TRIGGER_BY = "\n${currentBuild.getBuildCauses()[0].shortDescription}" + +// define optional parameters +def SELINUX_MODE = params.SELINUX_MODE ? params.SELINUX_MODE : "" + +def CREDS_ID = JOB_BASE_NAME == "longhorn-tests-regression" ? "AWS_CREDS_RANCHER_QA" : "AWS_CREDS" +def REGISTRATION_CODE_ID = params.ARCH == "amd64" ? "REGISTRATION_CODE" : "REGISTRATION_CODE_ARM64" + +// parameters for air gap installation +def AIR_GAP_INSTALLATION = params.AIR_GAP_INSTALLATION ? params.AIR_GAP_INSTALLATION : false +def CIS_HARDENING = params.CIS_HARDENING ? params.CIS_HARDENING : false +def REGISTRY_URL +def REGISTRY_USERNAME +def REGISTRY_PASSWORD + +// parameter for hdd test +def USE_HDD = params.USE_HDD ? 
params.USE_HDD : false + +node { + + withCredentials([ + usernamePassword(credentialsId: CREDS_ID, passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY'), + string(credentialsId: REGISTRATION_CODE_ID, variable: 'REGISTRATION_CODE'), + ]) { + + if (params.SEND_SLACK_NOTIFICATION) { + notifyBuild('STARTED', BUILD_TRIGGER_BY, params.NOTIFY_SLACK_CHANNEL) + } + + checkout scm + + try { + + stage('build') { + + echo "Using credentials: $CREDS_ID" + echo "Using registration code: $REGISTRATION_CODE_ID" + + sh "pipelines/fleet/scripts/build.sh" + sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ + --env REGISTRY_URL=${REGISTRY_URL} \ + --env REGISTRY_USERNAME=${REGISTRY_USERNAME} \ + --env REGISTRY_PASSWORD=${REGISTRY_PASSWORD} \ + --env FLEET_REPO_URI=${FLEET_REPO_URI} \ + --env FLEET_REPO_VERSION=${FLEET_REPO_VERSION} \ + --env LONGHORN_TESTS_CUSTOM_IMAGE=${LONGHORN_TESTS_CUSTOM_IMAGE} \ + --env DISTRO=${DISTRO} \ + --env FLEET_REPO_STABLE_VERSION=${FLEET_REPO_STABLE_VERSION} \ + --env FLEET_REPO_TRANSIENT_VERSION=${FLEET_REPO_TRANSIENT_VERSION} \ + --env LONGHORN_TEST_CLOUDPROVIDER=${LONGHORN_TEST_CLOUDPROVIDER} \ + --env LONGHORN_UPGRADE_TEST=${LONGHORN_UPGRADE_TEST} \ + --env PYTEST_CUSTOM_OPTIONS="${PYTEST_CUSTOM_OPTIONS}" \ + --env BACKUP_STORE_TYPE="${BACKUP_STORE_TYPE}" \ + --env TF_VAR_use_hdd=${USE_HDD} \ + --env TF_VAR_arch=${ARCH} \ + --env TF_VAR_k8s_distro_name=${K8S_DISTRO_NAME} \ + --env TF_VAR_k8s_distro_version=${K8S_DISTRO_VERSION} \ + --env TF_VAR_aws_availability_zone=${AWS_AVAILABILITY_ZONE} \ + --env TF_VAR_aws_region=${AWS_REGION} \ + --env TF_VAR_os_distro_version=${DISTRO_VERSION} \ + --env TF_VAR_lh_aws_access_key=${AWS_ACCESS_KEY} \ + --env TF_VAR_lh_aws_instance_name_controlplane="${JOB_BASE_NAME}-ctrl" \ + --env TF_VAR_lh_aws_instance_name_worker="${JOB_BASE_NAME}-wrk" \ + --env TF_VAR_lh_aws_instance_type_controlplane=${CONTROLPLANE_INSTANCE_TYPE} \ + --env TF_VAR_lh_aws_instance_type_worker=${WORKER_INSTANCE_TYPE}\ + --env TF_VAR_lh_aws_secret_key=${AWS_SECRET_KEY} \ + --env TF_VAR_selinux_mode=${SELINUX_MODE} \ + --env TF_VAR_registration_code=${REGISTRATION_CODE} \ + --env TF_VAR_cis_hardening=${CIS_HARDENING} \ + ${imageName} + """ + } + + timeout(60) { + stage ('terraform') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/terraform_setup.sh" + } + } + + stage ('longhorn setup & tests') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/fleet/scripts/longhorn-setup.sh" + } + + stage ('download support bundle') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/download_support_bundle.sh ${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip" + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip ." + archiveArtifacts allowEmptyArchive: true, artifacts: '**/*.zip', followSymlinks: false + } + + stage ('report generation') { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-junit-report.xml ." + + if(params.LONGHORN_UPGRADE_TEST && params.FLEET_REPO_TRANSIENT_VERSION) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-transient-junit-report.xml ." 
+ summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-upgrade-from-transient-junit-report.xml, longhorn-test-junit-report.xml' + } + else if(params.LONGHORN_UPGRADE_TEST) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." + summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-junit-report.xml' + } + else { + summary = junit 'longhorn-test-junit-report.xml' + } + } + + } catch (e) { + currentBuild.result = "FAILED" + throw e + } finally { + stage ('releasing resources') { + + if (sh (script: "docker container inspect ${JOB_BASE_NAME}-${BUILD_NUMBER} > /dev/null 2>&1", returnStatus: true) == 0) { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/utilities/cleanup.sh" + sh "docker stop ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rm -v ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rmi ${imageName}" + } + + if (summary) { + summary_msg = "\nTest Summary - Failures: ${summary.failCount}, Skipped: ${summary.skipCount}, Passed: ${summary.passCount} -- Job completed in ${currentBuild.durationString.replace(' and counting', '')}" + } else { + summary_msg = "\n Test Failed: No Junit report" + } + + if(params.SEND_SLACK_NOTIFICATION){ + notifyBuild(currentBuild.result, summary_msg, params.NOTIFY_SLACK_CHANNEL) + } + } + } + } + +} + + +def notifyBuild(String buildStatus = 'STARTED', String summary_msg, String slack_channel) { + // build status of null means successful + buildStatus = buildStatus ?: 'SUCCESSFUL' + + // Default values + def colorName = 'RED' + def colorCode = '#FF0000' + def subject = "${buildStatus}: Job '${env.JOB_BASE_NAME} [${env.BUILD_NUMBER}]'" + def summary = "${subject} (${env.BUILD_URL})" + summary_msg + + // Override default values based on build status + if (buildStatus == 'STARTED') { + color = 'YELLOW' + colorCode = '#FFFF00' + } else if (buildStatus == 'SUCCESSFUL') { + color = 'GREEN' + colorCode = '#00FF00' + } else { + color = 'RED' + colorCode = '#FF0000' + } + + // Send notifications + slackSend (color: colorCode, message: summary, channel: slack_channel, tokenCredentialId: 'longhorn-tests-slack-token') +} diff --git a/pipelines/fleet/scripts/build.sh b/pipelines/fleet/scripts/build.sh new file mode 100755 index 0000000000..0e4b6813ac --- /dev/null +++ b/pipelines/fleet/scripts/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build --no-cache -f ./pipelines/fleet/Dockerfile.setup -t "${JOB_BASE_NAME}-${BUILD_NUMBER}" . diff --git a/pipelines/fleet/scripts/longhorn-setup.sh b/pipelines/fleet/scripts/longhorn-setup.sh new file mode 100755 index 0000000000..4332155876 --- /dev/null +++ b/pipelines/fleet/scripts/longhorn-setup.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +set -x + +source pipelines/utilities/kubeconfig.sh +source pipelines/utilities/selinux_workaround.sh +source pipelines/utilities/install_csi_snapshotter.sh +source pipelines/utilities/create_aws_secret.sh +source pipelines/utilities/install_backupstores.sh +source pipelines/utilities/create_longhorn_namespace.sh +source pipelines/utilities/fleet.sh +source pipelines/utilities/run_longhorn_test.sh + + +export LONGHORN_NAMESPACE="longhorn-system" +export LONGHORN_INSTALL_METHOD="fleet" + + +main(){ + set_kubeconfig + + if [[ ${DISTRO} == "rhel" ]] || [[ ${DISTRO} == "rockylinux" ]] || [[ ${DISTRO} == "oracle" ]]; then + apply_selinux_workaround + fi + + # set debugging mode off to avoid leaking aws secrets to the logs. + # DON'T REMOVE! 
+ set +x + create_aws_secret + set -x + + create_longhorn_namespace + install_backupstores + install_csi_snapshotter + + install_fleet + + if [[ "${LONGHORN_UPGRADE_TEST}" == true ]]; then + create_fleet_git_repo "${FLEET_REPO_STABLE_VERSION}" + LONGHORN_UPGRADE_TYPE="from_stable" + LONGHORN_UPGRADE_TEST_POD_NAME="longhorn-test-upgrade-from-stable" + if [[ -n "${FLEET_REPO_TRANSIENT_VERSION}" ]]; then + UPGRADE_LH_REPO_URL="${FLEET_REPO_URI}" + UPGRADE_LH_REPO_BRANCH="${FLEET_REPO_TRANSIENT_VERSION}" + UPGRADE_LH_ENGINE_IMAGE="longhornio/longhorn-engine:${FLEET_REPO_TRANSIENT_VERSION}" + run_longhorn_upgrade_test + LONGHORN_UPGRADE_TYPE="from_transient" + LONGHORN_UPGRADE_TEST_POD_NAME="longhorn-test-upgrade-from-transient" + fi + UPGRADE_LH_REPO_URL="${FLEET_REPO_URI}" + UPGRADE_LH_REPO_BRANCH="${FLEET_REPO_VERSION}" + UPGRADE_LH_ENGINE_IMAGE="longhornio/longhorn-engine:${FLEET_REPO_VERSION}" + run_longhorn_upgrade_test + run_longhorn_test + else + create_fleet_git_repo + run_longhorn_test + fi +} + +main diff --git a/pipelines/fleet/scripts/upgrade-longhorn.sh b/pipelines/fleet/scripts/upgrade-longhorn.sh new file mode 100755 index 0000000000..13571ec608 --- /dev/null +++ b/pipelines/fleet/scripts/upgrade-longhorn.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -x + +export FLEET_REPO_URI="${1}" +export FLEET_REPO_VERSION="${2}" + +source pipelines/utilities/fleet.sh + +export LONGHORN_NAMESPACE="longhorn-system" + +create_fleet_git_repo diff --git a/pipelines/utilities/argocd.sh b/pipelines/utilities/argocd.sh index fcfdb890be..cb6d6c0e8d 100755 --- a/pipelines/utilities/argocd.sh +++ b/pipelines/utilities/argocd.sh @@ -34,7 +34,27 @@ init_argocd(){ create_argocd_app(){ REVISION="${1:-${LONGHORN_INSTALL_VERSION}}" - argocd app create longhorn --repo "${LONGHORN_REPO_URI}" --revision "${REVISION}" --path chart --dest-server https://kubernetes.default.svc --dest-namespace "${LONGHORN_NAMESPACE}" + cat > longhorn-application.yaml < longhorn-gitrepo.yaml <&1 | awk '{print $1}' | grep csi-` ]] || \ + [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep engine-image-` ]] || \ [[ -n `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $3}' | grep -v "Running\|Completed"` ]]; do echo "Longhorn is still installing ... re-checking in 1m" sleep ${RETRY_INTERVAL} diff --git a/pipelines/utilities/run_longhorn_e2e_test.sh b/pipelines/utilities/run_longhorn_e2e_test.sh index 589bdc4d52..c819dfdd35 100755 --- a/pipelines/utilities/run_longhorn_e2e_test.sh +++ b/pipelines/utilities/run_longhorn_e2e_test.sh @@ -74,6 +74,8 @@ run_longhorn_e2e_test_out_of_cluster(){ export LONGHORN_BACKUPSTORE_POLL_INTERVAL="30" cd e2e + python3 -m venv . 
+ source bin/activate pip install -r requirements.txt eval "ROBOT_COMMAND_ARGS=($PYTEST_CUSTOM_OPTIONS)" diff --git a/test_framework/Dockerfile.setup b/test_framework/Dockerfile.setup index a507e256c2..d4c035e2f1 100644 --- a/test_framework/Dockerfile.setup +++ b/test_framework/Dockerfile.setup @@ -25,8 +25,7 @@ RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_V wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ mv yq_linux_amd64 /usr/local/bin/yq && \ chmod +x /usr/local/bin/yq && \ - apk add openssl openssh-client ca-certificates git rsync bash curl jq chromium chromium-chromedriver python3 py3-pip && \ - pip3 install -U selenium==3.141.0 && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq && \ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa && \ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 && \ chmod 700 get_helm.sh && \ diff --git a/test_framework/Jenkinsfile b/test_framework/Jenkinsfile index 56b1a081bf..3f2c7bd508 100644 --- a/test_framework/Jenkinsfile +++ b/test_framework/Jenkinsfile @@ -18,6 +18,7 @@ def RANCHER_CHART_GIT_BRANCH = params.RANCHER_CHART_GIT_BRANCH ? params.RANCHER_ def RANCHER_CHART_INSTALL_VERSION = params.RANCHER_CHART_INSTALL_VERSION ? params.RANCHER_CHART_INSTALL_VERSION : "" def LONGHORN_TRANSIENT_VERSION = params.LONGHORN_TRANSIENT_VERSION ? params.LONGHORN_TRANSIENT_VERSION : "" def CIS_HARDENING = params.CIS_HARDENING ? params.CIS_HARDENING : false +def CUSTOM_SSH_PUBLIC_KEY = params.CUSTOM_SSH_PUBLIC_KEY ? params.CUSTOM_SSH_PUBLIC_KEY : "" def REGISTRY_URL def REGISTRY_USERNAME def REGISTRY_PASSWORD @@ -136,6 +137,7 @@ node { --env TF_VAR_azure_tenant_id=${AZURE_TENANT_ID} \ --env TF_VAR_azure_subscription_id=${AZURE_SUBSCRIPTION_ID} \ --env TF_VAR_cis_hardening=${CIS_HARDENING} \ + --env TF_VAR_custom_ssh_public_key="${CUSTOM_SSH_PUBLIC_KEY}" \ ${imageName} """ diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index e39c04924a..691dedabf3 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -56,9 +56,13 @@ install_cluster_autoscaler(){ install_csi_snapshotter_crds(){ CSI_SNAPSHOTTER_REPO_URL="https://github.com/kubernetes-csi/external-snapshotter.git" - CSI_SNAPSHOTTER_REPO_BRANCH="v6.2.1" CSI_SNAPSHOTTER_REPO_DIR="${TMPDIR}/k8s-csi-external-snapshotter" + [[ "${LONGHORN_REPO_URI}" =~ https://([^/]+)/([^/]+)/([^/.]+)(.git)? 
]] + wget "https://raw.githubusercontent.com/${BASH_REMATCH[2]}/${BASH_REMATCH[3]}/${LONGHORN_REPO_BRANCH}/deploy/longhorn-images.txt" -O "/tmp/longhorn-images.txt" + IFS=: read -ra IMAGE_TAG_PAIR <<< $(grep csi-snapshotter /tmp/longhorn-images.txt) + CSI_SNAPSHOTTER_REPO_BRANCH="${IMAGE_TAG_PAIR[1]}" + git clone --single-branch \ --branch "${CSI_SNAPSHOTTER_REPO_BRANCH}" \ "${CSI_SNAPSHOTTER_REPO_URL}" \ @@ -89,9 +93,10 @@ install_rancher() { get_rancher_api_key() { - python3 "${TF_VAR_tf_workspace}/scripts/rancher/webdriver/main.py" "${RANCHER_HOSTNAME}" "${RANCHER_BOOTSTRAP_PASSWORD}" - RANCHER_ACCESS_KEY=`cat "${PWD}/access_key"` - RANCHER_SECRET_KEY=`cat "${PWD}/secret_key"` + TOKEN=$(curl -X POST -s -k "https://${RANCHER_HOSTNAME}/v3-public/localproviders/local?action=login" -H 'Content-Type: application/json' -d "{\"username\":\"admin\", \"password\":\"${RANCHER_BOOTSTRAP_PASSWORD}\", \"responseType\": \"json\"}" | jq -r '.token' | tr -d '"') + ARR=(${TOKEN//:/ }) + RANCHER_ACCESS_KEY=${ARR[0]} + RANCHER_SECRET_KEY=${ARR[1]} } @@ -116,10 +121,11 @@ wait_longhorn_status_running(){ local RETRY_COUNTS=10 # in minutes local RETRY_INTERVAL="1m" - # csi components are installed after longhorn components. + # csi and engine image components are installed after longhorn components. # it's possible that all longhorn components are running but csi components aren't created yet. RETRIES=0 while [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep csi-` ]] || \ + [[ -z `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $1}' | grep engine-image-` ]] || \ [[ -n `kubectl get pods -n ${LONGHORN_NAMESPACE} --no-headers 2>&1 | awk '{print $3}' | grep -v Running` ]]; do echo "Longhorn is still installing ... re-checking in 1m" sleep ${RETRY_INTERVAL} @@ -317,6 +323,8 @@ run_longhorn_upgrade_test(){ yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[4].value="'${LONGHORN_UPGRADE_TYPE}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[7].value="'${RESOURCE_SUFFIX}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + kubectl apply -f ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} # wait upgrade test pod to start running diff --git a/test_framework/scripts/rancher/webdriver/main.py b/test_framework/scripts/rancher/webdriver/main.py deleted file mode 100644 index b57fb7ed63..0000000000 --- a/test_framework/scripts/rancher/webdriver/main.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import sys - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions -from selenium.webdriver.support.wait import WebDriverWait - -wait_timeout = 10 -click_retry_timeout = 20 - - -def navigate_and_wait_for(target, expect): - success = False - retry = 0 - max_retry = 10 - while success is not True and retry < max_retry: - try: - driver.get(target) - wait_for(expect) - success = True - except ElementClickInterceptedException as e: - raise e - except Exception as e: - print(f'[retry {retry}] try to navigate to {target} and wait for {expect} failed ... 
{repr(e)} {e}') - retry += 1 - - -def wait_for(target): - try: - WebDriverWait(driver, wait_timeout).until( - expected_conditions.presence_of_element_located((By.XPATH, elements[target])) - ) - return True - except Exception as e: - print(f'wait for {target} error: {e}') - return False - - -def click(target): - driver.find_element_by_xpath(elements[target]).click() - - -def click_and_wait(target, expect): - success = False - retry = 0 - max_retry = 10 - while success is not True and retry < max_retry: - try: - _target = driver.find_element_by_xpath(elements[target]) - _target.click() - WebDriverWait(driver, click_retry_timeout).until( - expected_conditions.presence_of_element_located((By.XPATH, elements[expect])) - ) - success = True - except ElementClickInterceptedException as e: - raise e - except Exception as e: - print(f'[retry {retry}] try to click {target} and wait for {expect} failed ... {repr(e)} {e}') - retry += 1 - - -def send_keys(target, keys): - _target = driver.find_element_by_xpath(elements[target]) - _target.send_keys(keys) - - -def get_element(element): - content = driver.find_element_by_xpath(elements[element]).text - return content - - -elements = { - 'username_input': '//*[@id="username"]', - 'password_input': '//*[@type="password"]', - 'login': '//button[@id="submit"]', - 'agree': '(//*[contains(@class, "checkbox-custom")])[2]', - 'continue': '//*[@type="submit" and not(@disabled)]', - 'local_cluster': '//*[contains(@href, "/local")]', - 'create_api_key': '//button[contains(text(), "Create API Key")]', - 'create_confirm': '//button//*[contains(text() ,"Create")]', - 'access_key': '//*[contains(@class, "with-copy")][1]/span', - 'secret_key': '//*[contains(@class, "with-copy")][2]/span', - 'create_done': '//button//*[contains(text() ,"Done")]' -} - -if __name__ == '__main__': - - url = 'https://' + sys.argv[1] - login_url = url + '/dashboard/auth/login' - account_url = url + '/dashboard/account' - - options = webdriver.ChromeOptions() - prefs = { - 'profile.default_content_setting_values.notifications': 2 - } - options.add_experimental_option('prefs', prefs) - options.add_argument('--headless') - options.add_argument('--ignore-certificate-errors') - options.add_argument('--no-sandbox') - options.add_argument('window-size=1920,1200') - - driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options) - - success = False - _retry = 0 - _max_retry = 3 - - while not success and _retry < _max_retry: - try: - print(login_url) - driver.get(login_url) - try: - wait_for('username_input') - send_keys('username_input', 'admin') - except Exception as e: - print(f'no username field {e}') - send_keys('password_input', sys.argv[2]) - click_and_wait('login', 'agree') - click_and_wait('agree', 'continue') - click_and_wait('continue', 'local_cluster') - navigate_and_wait_for(account_url, 'create_api_key') - click_and_wait('create_api_key', 'create_confirm') - click_and_wait('create_confirm', 'access_key') - access_key = get_element('access_key') - secret_key = get_element('secret_key') - click('create_done') - with open('access_key', 'w') as f: - f.write(str(access_key)) - with open('secret_key', 'w') as f: - f.write(str(secret_key)) - success = True - except Exception as e: - print(f'parsing error: {e}') - _retry += 1 - - driver.quit() diff --git a/test_framework/scripts/terraform-setup.sh b/test_framework/scripts/terraform-setup.sh index 6c7beee307..3d28f7c2c5 100755 --- a/test_framework/scripts/terraform-setup.sh +++ b/test_framework/scripts/terraform-setup.sh @@ 
-32,6 +32,8 @@ terraform_setup(){ if [[ "${TF_VAR_create_load_balancer}" == true ]]; then terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > ${TF_VAR_tf_workspace}/load_balancer_url fi + + export RESOURCE_SUFFIX=$(terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw resource_suffix) } diff --git a/test_framework/terraform/aws/oracle/data.tf b/test_framework/terraform/aws/oracle/data.tf index 0365770e87..1072b2ce91 100644 --- a/test_framework/terraform/aws/oracle/data.tf +++ b/test_framework/terraform/aws/oracle/data.tf @@ -27,6 +27,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -37,6 +38,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -47,6 +49,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -57,5 +60,6 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/oracle/output.tf b/test_framework/terraform/aws/oracle/output.tf index ea0c7196c7..3e0941a2c1 100644 --- a/test_framework/terraform/aws/oracle/output.tf +++ b/test_framework/terraform/aws/oracle/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl index 36fc5dc613..092350791b 100755 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_agent.sh.tpl @@ -19,3 +19,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl index 9fcdcb2c59..1e5ca030c2 100755 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_k3s_server.sh.tpl @@ -21,3 +21,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl index 2ee3532529..f4f8089780 100644 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_agent.sh.tpl @@ -33,4 +33,9 @@ EOF systemctl enable rke2-agent.service systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + exit $? 
diff --git a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl index 742a31967d..d670a60be0 100644 --- a/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/oracle/user-data-scripts/provision_rke2_server.sh.tpl @@ -29,3 +29,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/oracle/variables.tf b/test_framework/terraform/aws/oracle/variables.tf index 2c08e6ec0a..aa588e6f1d 100644 --- a/test_framework/terraform/aws/oracle/variables.tf +++ b/test_framework/terraform/aws/oracle/variables.tf @@ -114,3 +114,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} diff --git a/test_framework/terraform/aws/rhel/data.tf b/test_framework/terraform/aws/rhel/data.tf index a5e590adc8..c80ecb91d5 100644 --- a/test_framework/terraform/aws/rhel/data.tf +++ b/test_framework/terraform/aws/rhel/data.tf @@ -31,6 +31,7 @@ data "template_file" "provision_k3s_server" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -43,6 +44,7 @@ data "template_file" "provision_k3s_agent" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -54,6 +56,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -65,5 +68,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/rhel/output.tf b/test_framework/terraform/aws/rhel/output.tf index ea0c7196c7..cbae005ce5 100644 --- a/test_framework/terraform/aws/rhel/output.tf +++ b/test_framework/terraform/aws/rhel/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl index 879ba574d4..5e2a5f9d0d 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_agent.sh.tpl @@ -25,3 +25,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl index d767f41f5b..a874b9bb2d 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_k3s_server.sh.tpl @@ -26,3 +26,7 @@ until (sudo /usr/local/bin/kubectl get pods -A | grep 'Running'); do echo 'Waiting for k3s startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl index fec3a2e169..4884b10f4e 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_agent.sh.tpl @@ -39,4 +39,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl index 78392c43b2..69d800c7bb 100755 --- a/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/rhel/user-data-scripts/provision_rke2_server.sh.tpl @@ -35,3 +35,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml sudo /var/lib/rancher/rke2/bin/kub echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rhel/variables.tf b/test_framework/terraform/aws/rhel/variables.tf index 3178442037..98f211e362 100644 --- a/test_framework/terraform/aws/rhel/variables.tf +++ b/test_framework/terraform/aws/rhel/variables.tf @@ -120,3 +120,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} diff --git a/test_framework/terraform/aws/rockylinux/data.tf b/test_framework/terraform/aws/rockylinux/data.tf index ead3e87e4b..5ed2a86f44 100644 --- a/test_framework/terraform/aws/rockylinux/data.tf +++ b/test_framework/terraform/aws/rockylinux/data.tf @@ -36,6 +36,7 @@ data "template_file" "provision_k3s_server" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -48,6 +49,7 @@ data "template_file" "provision_k3s_agent" { k3s_version = var.k8s_distro_version selinux_mode = var.selinux_mode enable_selinux = var.selinux_mode == "permissive" ? "false" : "true" + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -59,6 +61,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -70,5 +73,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version selinux_mode = var.selinux_mode + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/rockylinux/output.tf b/test_framework/terraform/aws/rockylinux/output.tf index 68ea8dbf23..804fa2d099 100644 --- a/test_framework/terraform/aws/rockylinux/output.tf +++ b/test_framework/terraform/aws/rockylinux/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? 
aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl index 29760319ea..7dab190733 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_agent.sh.tpl @@ -24,3 +24,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl index 0e77a01a26..a87c270175 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_k3s_server.sh.tpl @@ -24,3 +24,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl index 04133b229f..e2a67e95f4 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_agent.sh.tpl @@ -38,4 +38,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi + exit $? 
diff --git a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl index 6cab617458..c7686fc9b9 100755 --- a/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/rockylinux/user-data-scripts/provision_rke2_server.sh.tpl @@ -34,3 +34,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl echo 'Waiting for rke2 startup' sleep 5 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/rocky/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/rockylinux/variables.tf b/test_framework/terraform/aws/rockylinux/variables.tf index 34ee882cf3..dfce28e0ae 100644 --- a/test_framework/terraform/aws/rockylinux/variables.tf +++ b/test_framework/terraform/aws/rockylinux/variables.tf @@ -119,4 +119,10 @@ variable "use_hdd" { variable "create_load_balancer" { type = bool default = false +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/data.tf b/test_framework/terraform/aws/sle-micro/data.tf index a5969891c2..472d0ca177 100644 --- a/test_framework/terraform/aws/sle-micro/data.tf +++ b/test_framework/terraform/aws/sle-micro/data.tf @@ -20,6 +20,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -30,6 +31,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -40,6 +42,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -50,5 +53,6 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/sle-micro/k3s_instaces.tf b/test_framework/terraform/aws/sle-micro/k3s_instaces.tf index 82c0bd889e..c233186a15 100644 --- a/test_framework/terraform/aws/sle-micro/k3s_instaces.tf +++ b/test_framework/terraform/aws/sle-micro/k3s_instaces.tf @@ -112,11 +112,8 @@ resource "null_resource" "registration_controlplane_k3s" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -139,11 +136,8 @@ resource "null_resource" "registration_worker_k3s" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -252,7 +246,32 @@ resource "null_resource" "cluster_setup_worker_k3s" { } -# node initialization step 4: download KUBECONFIG file for k3s +# node initialization step 4: make sure 
k8s components running +resource "null_resource" "make_sure_k8s_components_running_controlplane_k3s" { + count = var.k8s_distro_name == "k3s" ? 1 : 0 + + depends_on = [ + null_resource.cluster_setup_controlplane_k3s, + null_resource.cluster_setup_worker_k3s + ] + + provisioner "remote-exec" { + + inline = [ + "until (kubectl get pods -A | grep 'Running'); do echo 'Waiting for k3s startup'; sleep 5; done" + ] + + connection { + type = "ssh" + user = "suse" + host = aws_eip.lh_aws_eip_controlplane[0].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} + +# node initialization step 5: download KUBECONFIG file for k3s resource "null_resource" "rsync_kubeconfig_file" { count = var.k8s_distro_name == "k3s" ? 1 : 0 @@ -260,8 +279,7 @@ resource "null_resource" "rsync_kubeconfig_file" { aws_instance.lh_aws_instance_controlplane_k3s, aws_eip.lh_aws_eip_controlplane, aws_eip_association.lh_aws_eip_assoc_k3s, - null_resource.cluster_setup_controlplane_k3s, - null_resource.cluster_setup_worker_k3s + null_resource.make_sure_k8s_components_running_controlplane_k3s ] provisioner "remote-exec" { diff --git a/test_framework/terraform/aws/sle-micro/output.tf b/test_framework/terraform/aws/sle-micro/output.tf index f1fdd5e52d..1946e2eaa4 100644 --- a/test_framework/terraform/aws/sle-micro/output.tf +++ b/test_framework/terraform/aws/sle-micro/output.tf @@ -45,3 +45,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/rke2_instances.tf b/test_framework/terraform/aws/sle-micro/rke2_instances.tf index 9f8033357c..07d2957ee6 100644 --- a/test_framework/terraform/aws/sle-micro/rke2_instances.tf +++ b/test_framework/terraform/aws/sle-micro/rke2_instances.tf @@ -112,11 +112,8 @@ resource "null_resource" "registration_controlplane_rke2" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -139,11 +136,8 @@ resource "null_resource" "registration_worker_rke2" { inline = [ "sudo transactional-update register -r ${var.registration_code}", - "sudo shutdown -r now", ] - on_failure = continue - connection { type = "ssh" user = "suse" @@ -252,7 +246,32 @@ resource "null_resource" "cluster_setup_worker_rke2" { } -# node initialization step 3: download KUBECONFIG file for rke2 +# node initialization step 4: make sure k8s components running +resource "null_resource" "make_sure_k8s_components_running_controlplane_rke2" { + count = var.k8s_distro_name == "rke2" ? 1 : 0 + + depends_on = [ + null_resource.cluster_setup_controlplane_rke2, + null_resource.cluster_setup_worker_rke2 + ] + + provisioner "remote-exec" { + + inline = [ + "until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml kubectl get pods -A | grep 'Running'); do echo 'Waiting for rke2 startup'; sleep 5; done" + ] + + connection { + type = "ssh" + user = "suse" + host = aws_eip.lh_aws_eip_controlplane[0].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} + +# node initialization step 5: download KUBECONFIG file for rke2 resource "null_resource" "rsync_kubeconfig_file_rke2" { count = var.k8s_distro_name == "rke2" ? 
1 : 0 @@ -260,8 +279,7 @@ resource "null_resource" "rsync_kubeconfig_file_rke2" { aws_instance.lh_aws_instance_controlplane_rke2, aws_eip.lh_aws_eip_controlplane, aws_eip_association.lh_aws_eip_assoc_rke2, - null_resource.cluster_setup_controlplane_rke2, - null_resource.cluster_setup_worker_rke2 + null_resource.make_sure_k8s_components_running_controlplane_rke2 ] provisioner "remote-exec" { diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl index b9cf4373b8..fb81b0214d 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_agent.sh.tpl @@ -20,3 +20,7 @@ fi curl -sfL https://get.k3s.io | sudo INSTALL_K3S_EXEC="agent --token ${k3s_cluster_secret}" K3S_URL="${k3s_server_url}" INSTALL_K3S_VERSION="${k3s_version}" sh - sudo systemctl start k3s-agent + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl index 9bcc200570..7f254cda4b 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_k3s_server.sh.tpl @@ -2,3 +2,7 @@ curl -sfL https://get.k3s.io | sudo INSTALL_K3S_EXEC="server --node-taint "node-role.kubernetes.io/master=true:NoExecute" --node-taint "node-role.kubernetes.io/master=true:NoSchedule" --tls-san ${k3s_server_public_ip} --write-kubeconfig-mode 644 --token ${k3s_cluster_secret}" INSTALL_K3S_VERSION="${k3s_version}" sh - sudo systemctl start k3s + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl index 74d2dd67e6..6efbaab929 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_agent.sh.tpl @@ -32,4 +32,9 @@ EOF sudo systemctl enable rke2-agent.service sudo systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl index 9fd79f05c4..1ec23d1df3 100755 --- a/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sle-micro/user-data-scripts/provision_rke2_server.sh.tpl @@ -15,4 +15,8 @@ EOF sudo systemctl enable rke2-server.service sudo systemctl start rke2-server.service -sudo ln -s /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl \ No newline at end of file +sudo ln -s /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/suse/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sle-micro/variables.tf b/test_framework/terraform/aws/sle-micro/variables.tf index f1b7afd7f6..b00745d94b 100644 --- a/test_framework/terraform/aws/sle-micro/variables.tf +++ b/test_framework/terraform/aws/sle-micro/variables.tf @@ -121,4 +121,10 @@ variable "create_load_balancer" { variable "registration_code" { type = string sensitive = true +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/sles/data.tf b/test_framework/terraform/aws/sles/data.tf index a71132950b..b8964a56dd 100644 --- a/test_framework/terraform/aws/sles/data.tf +++ b/test_framework/terraform/aws/sles/data.tf @@ -17,6 +17,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -27,6 +28,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -38,6 +40,7 @@ data "template_file" "provision_rke2_server" { rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version cis_hardening = var.cis_hardening + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -49,5 +52,6 @@ data "template_file" "provision_rke2_agent" { rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version cis_hardening = var.cis_hardening + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/sles/output.tf b/test_framework/terraform/aws/sles/output.tf index bd2b4c12f7..291f420816 100644 --- a/test_framework/terraform/aws/sles/output.tf +++ b/test_framework/terraform/aws/sles/output.tf @@ -78,3 +78,11 @@ output "controlplane_public_ip" { ] value = aws_eip.lh_aws_eip_controlplane[0].public_ip } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl index af5faa1d4c..c799b47a07 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_agent.sh.tpl @@ -34,3 
+34,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi \ No newline at end of file diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl index c04dd4cab8..2a2df03018 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_k3s_server.sh.tpl @@ -25,3 +25,6 @@ until (kubectl get pods -A | grep 'Running'); do RETRY=$((RETRY+1)) done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl index e74b801290..04a9e1959c 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_agent.sh.tpl @@ -60,4 +60,9 @@ EOF fi systemctl start rke2-agent.service + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi + exit $? diff --git a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl index 9f5add5c49..6bf855bc44 100755 --- a/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/sles/user-data-scripts/provision_rke2_server.sh.tpl @@ -53,3 +53,7 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl fi RETRY=$((RETRY+1)) done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/sles/variables.tf b/test_framework/terraform/aws/sles/variables.tf index 6e683a7d55..1a435ac39b 100644 --- a/test_framework/terraform/aws/sles/variables.tf +++ b/test_framework/terraform/aws/sles/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "os_distro_version" { type = string - default = "15-sp4" + default = "15-sp5" } variable "aws_ami_sles_account_number" { @@ -126,4 +126,10 @@ variable "cis_hardening" { variable "resources_owner" { type = string default = "longhorn-infra" +} + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true } \ No newline at end of file diff --git a/test_framework/terraform/aws/ubuntu/data.tf b/test_framework/terraform/aws/ubuntu/data.tf index 374a3069b4..9edd0495f2 100644 --- a/test_framework/terraform/aws/ubuntu/data.tf +++ b/test_framework/terraform/aws/ubuntu/data.tf @@ -16,6 +16,7 @@ data "template_file" "provision_k3s_server" { k3s_cluster_secret = random_password.cluster_secret.result k3s_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip k3s_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -26,6 +27,7 @@ data "template_file" "provision_k3s_agent" { k3s_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:6443" k3s_cluster_secret = random_password.cluster_secret.result k3s_version = var.k8s_distro_version + custom_ssh_public_key = 
var.custom_ssh_public_key } } @@ -37,6 +39,7 @@ data "template_file" "provision_rke2_server" { rke2_cluster_secret = random_password.cluster_secret.result rke2_server_public_ip = aws_eip.lh_aws_eip_controlplane[0].public_ip rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } @@ -47,6 +50,7 @@ data "template_file" "provision_rke2_agent" { rke2_server_url = "https://${aws_eip.lh_aws_eip_controlplane[0].public_ip}:9345" rke2_cluster_secret = random_password.cluster_secret.result rke2_version = var.k8s_distro_version + custom_ssh_public_key = var.custom_ssh_public_key } } diff --git a/test_framework/terraform/aws/ubuntu/output.tf b/test_framework/terraform/aws/ubuntu/output.tf index a441717d06..9e865901a5 100644 --- a/test_framework/terraform/aws/ubuntu/output.tf +++ b/test_framework/terraform/aws/ubuntu/output.tf @@ -46,3 +46,11 @@ output "load_balancer_url" { value = var.create_load_balancer ? aws_lb.lh_aws_lb[0].dns_name : null } + +output "resource_suffix" { + depends_on = [ + random_string.random_suffix + ] + + value = random_string.random_suffix.id +} diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl index 38112a8aa6..c7b825fc63 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_agent.sh.tpl @@ -13,3 +13,7 @@ until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_clus echo 'k3s agent did not install correctly' sleep 2 done + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl index d0a2ae2fcd..2ac9c835fe 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_k3s_server.sh.tpl @@ -15,3 +15,6 @@ until (kubectl get pods -A | grep 'Running'); do sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl index 09804ab903..b2d58b4ed9 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_agent.sh.tpl @@ -28,4 +28,9 @@ EOF systemctl enable rke2-agent.service systemctl start rke2-agent.service -exit $? + +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi + +exit $? 
\ No newline at end of file diff --git a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl index 6543e059c7..ee3358398b 100755 --- a/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl +++ b/test_framework/terraform/aws/ubuntu/user-data-scripts/provision_rke2_server.sh.tpl @@ -26,3 +26,6 @@ until (KUBECONFIG=/etc/rancher/rke2/rke2.yaml /var/lib/rancher/rke2/bin/kubectl sleep 5 done +if [[ -n "${custom_ssh_public_key}" ]]; then + echo "${custom_ssh_public_key}" >> /home/ubuntu/.ssh/authorized_keys +fi diff --git a/test_framework/terraform/aws/ubuntu/variables.tf b/test_framework/terraform/aws/ubuntu/variables.tf index b6c53dd1b8..468a06b00e 100644 --- a/test_framework/terraform/aws/ubuntu/variables.tf +++ b/test_framework/terraform/aws/ubuntu/variables.tf @@ -115,3 +115,9 @@ variable "create_load_balancer" { type = bool default = false } + +variable "custom_ssh_public_key" { + type = string + default = "" + sensitive = true +} \ No newline at end of file diff --git a/test_tools/gen_data/README.md b/test_tools/gen_data/README.md new file mode 100644 index 0000000000..3e73e6720a --- /dev/null +++ b/test_tools/gen_data/README.md @@ -0,0 +1,46 @@ +# Generate test data script +Generates RWO/RWX workloads, writes data to `/mnt/data/data` in each workload, and records the md5 checksums to data.output. + +# Usage +Modify config.yaml +```yaml +storage: 1Gi # Each volume size +storageClass: longhorn-test # Prepare your own storage class first +dataSizeInMb: 500 +namespace: default # Needs to exist before running the script +statefulSet: # Single RWO/RWX statefulset and its replica counts + rwo: + replicas: 1 + rwx: + replicas: 0 +deployment: # Number of RWO/RWX deployments; the replica count of each RWO deployment is fixed to 1 + rwo: + pvCounts: 0 + rwx: + pvCounts: 1 + deploymentReplicas: 2 # Replica count of each RWX deployment +``` + +# Generate test data + `./run.sh` + +# Cleanup workloads and PVC +`./clean.sh` + +# Output (example) +`cat data.output` + +Shows each workload name and the md5sum of the file at its mount point +``` +test-data-rwx-statefulset-0 +2bccd99c8e35ccab2cd7620a200bc3e1 + +test-data-rwx-statefulset-1 +8f96c74b8b990ff11e98d478fc65f77b + +test-data-rwo-deployment-1-7f99f8bf76-cqblb +91fc370c81957d12f01581f78e4bdeba + +test-data-rwo-deployment-2-549d6cb995-gvc79 +883c98d04e2c54c89f979b20d3fa277e +``` diff --git a/test_tools/gen_data/clean.sh b/test_tools/gen_data/clean.sh new file mode 100755 index 0000000000..7c0514ef28 --- /dev/null +++ b/test_tools/gen_data/clean.sh @@ -0,0 +1,6 @@ +#!/bin/bash +NAMESPACE=$(yq eval '.namespace' config.yaml) + +kubectl get statefulset -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete statefulset +kubectl get deployment -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete deployment +kubectl get pvc -n $NAMESPACE --no-headers | grep "test-data-" | awk '{print $1}' | xargs kubectl -n $NAMESPACE delete pvc \ No newline at end of file diff --git a/test_tools/gen_data/config.yaml b/test_tools/gen_data/config.yaml new file mode 100644 index 0000000000..965e2a2419 --- /dev/null +++ b/test_tools/gen_data/config.yaml @@ -0,0 +1,15 @@ +storage: 500Mi +storageClass: longhorn +dataSizeInMb: 50 +namespace: default +statefulSet: + rwo: + replicas: 0 + rwx: + replicas: 0 +deployment: + rwo: + pvCounts: 0 + rwx: + pvCounts: 0 + deploymentReplicas: 0 \ No newline at end of
file diff --git a/test_tools/gen_data/deployment.yaml b/test_tools/gen_data/deployment.yaml new file mode 100644 index 0000000000..97ea776424 --- /dev/null +++ b/test_tools/gen_data/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: test-deployment + namespace: default +spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 1Gi +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: test-deployment + namespace: default + labels: + name: test-deployment +spec: + replicas: 1 + selector: + matchLabels: + name: test-deployment + template: + metadata: + labels: + name: test-deployment + spec: + containers: + - name: test-deployment + image: nginx:stable-alpine + command: ["/bin/sh"] + args: [-c, 'dd if=/dev/urandom of=/mnt/data/data bs=1M count=500; trap : TERM INT; sleep infinity & wait'] + volumeMounts: + - name: vol1 + mountPath: /mnt/data + volumes: + - name: vol1 + persistentVolumeClaim: + claimName: test-deployment diff --git a/test_tools/gen_data/run.sh b/test_tools/gen_data/run.sh new file mode 100755 index 0000000000..d9b786786a --- /dev/null +++ b/test_tools/gen_data/run.sh @@ -0,0 +1,279 @@ +#!/bin/bash +ACCESS_MODE_RWO="ReadWriteOnce" +ACCESS_MODE_RWX="ReadWriteMany" +CONFIG_FILE="config.yaml" +OUTPUT_FILE="data.output" +DEPLOYMENT_TEMPLATE="deployment.yaml" +STATEFULSET_TEMPLATE="statefulset.yaml" +RWO_DEPLOYMENT_WORKLOAD_PREFIX="test-data-rwo-deployment-" +RWX_DEPLOYMENT_WORKLOAD_PREFIX="test-data-rwx-deployment-" +RWO_STATEFULSET_NAME="test-data-rwo-statefulset" +RWX_STATEFULSET_NAME="test-data-rwx-statefulset" +RETRY_COUNTS=60 +RETRY_INTERVAL=5 +RETRY_INTERVAL_LONG=10 + +###################################################### +# Log +###################################################### +export RED='\x1b[0;31m' +export GREEN='\x1b[38;5;22m' +export CYAN='\x1b[36m' +export YELLOW='\x1b[33m' +export NO_COLOR='\x1b[0m' + +if [ -z "${LOG_TITLE}" ]; then + LOG_TITLE='' +fi +if [ -z "${LOG_LEVEL}" ]; then + LOG_LEVEL="INFO" +fi + +debug() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${GREEN}[DEBUG]${log_title} ${NO_COLOR}$1" + fi +} + +info() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${CYAN}[INFO] ${log_title} ${NO_COLOR}$1" + fi +} + +warn() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]] ||\ + [[ "${LOG_LEVEL}" == "WARN" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${YELLOW}[WARN] ${log_title} ${NO_COLOR}$1" + fi +} + +error() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] ||\ + [[ "${LOG_LEVEL}" == "INFO" ]] ||\ + [[ "${LOG_LEVEL}" == "WARN" ]] ||\ + [[ "${LOG_LEVEL}" == "ERROR" ]]; then + local log_title + if [ -n "${LOG_TITLE}" ]; then + log_title="(${LOG_TITLE})" + else + log_title='' + fi + echo -e "${RED}[ERROR]${log_title} ${NO_COLOR}$1" + fi +} + +###################################################### +# Check Logics +###################################################### +check_local_dependencies() { + local targets=($@) + + local all_found=true + for ((i=0; i<${#targets[@]}; i++)); do + local target=${targets[$i]} + if [ "$(which $target)" = "" ]; then + all_found=false + error "Not found: $target" + 
fi + done + + if [ "$all_found" = "false" ]; then + msg="Please install missing dependencies: ${targets[@]}." + info "$msg" + exit 1 + fi + + msg="Required dependencies '${targets[@]}' are installed." + info "$msg" +} + +check_config_input() { + NAMESPACE=$(yq eval '.namespace' config.yaml) + STORAGE_SIZE=$(yq eval '.storage' config.yaml) + STORAGE_CLASS_NAME=$(yq eval '.storageClass' config.yaml) + DATA_SIZE_IN_MB=$(yq eval '.dataSizeInMb' config.yaml) + STATEFULSET_RWO_REPLICAS=$(yq eval '.statefulSet.rwo.replicas' config.yaml) + STATEFULSET_RWX_REPLICAS=$(yq eval '.statefulSet.rwx.replicas' config.yaml) + DEPLOYMENT_RWO_COUNTS=$(yq eval '.deployment.rwo.pvCounts' config.yaml) + DEPLOYMENT_RWX_COUNTS=$(yq eval '.deployment.rwx.pvCounts' config.yaml) + DEPLOYMENT_RWX_REPLICAS=$(yq eval '.deployment.rwx.deploymentReplicas' config.yaml) + + msg="$CONFIG_FILE is not correct, please check" + # a variable is "null" when yq cannot find the yaml field + [ "$STORAGE_SIZE" = "null" -o ${#STORAGE_SIZE} -eq 0 ] && error "$msg" && exit 2 + [ "$NAMESPACE" = "null" -o ${#NAMESPACE} -eq 0 ] && error "$msg" && exit 2 + [ "$STORAGE_CLASS_NAME" = "null" -o ${#STORAGE_CLASS_NAME} -eq 0 ] && error "$msg" && exit 2 + [ "$DATA_SIZE_IN_MB" = "null" -o ${#DATA_SIZE_IN_MB} -eq 0 ] && error "$msg" && exit 2 + [ "$STATEFULSET_RWO_REPLICAS" = "null" -o ${#STATEFULSET_RWO_REPLICAS} -eq 0 ] && error "$msg" && exit 2 + [ "$STATEFULSET_RWX_REPLICAS" = "null" -o ${#STATEFULSET_RWX_REPLICAS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWO_COUNTS" = "null" -o ${#DEPLOYMENT_RWO_COUNTS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWX_COUNTS" = "null" -o ${#DEPLOYMENT_RWX_COUNTS} -eq 0 ] && error "$msg" && exit 2 + [ "$DEPLOYMENT_RWX_REPLICAS" = "null" -o ${#DEPLOYMENT_RWX_REPLICAS} -eq 0 ] && error "$msg" && exit 2 +} + +check_kubernetes_resources() { + if ! kubectl get storageclass "$STORAGE_CLASS_NAME" &> /dev/null; then + msg="StorageClass '$STORAGE_CLASS_NAME' does not exist." + error "$msg" + exit 1 + fi + + if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then + msg="Namespace '$NAMESPACE' does not exist." + error "$msg" + exit 1 + fi +} + +wait_workload_ready() { + local workload_type=$1 + local workload_name=$2 + local workload_replicas=$3 + local retries=0 + while [[ -n `kubectl -n $NAMESPACE get $workload_type --no-headers | grep $workload_name | awk '{print $2}' | grep -v $workload_replicas/$workload_replicas` ]]; do + msg="Pod is still creating ...
re-checking in ${RETRY_INTERVAL}s" + info "$msg" + sleep ${RETRY_INTERVAL} + retries=$((retries+1)) + + if [[ ${retries} -eq ${RETRY_COUNTS} ]]; then echo "Error: Pod create timeout"; exit 1 ; fi + done + + +} + +record_pod_data() { + local pattern="$1" + local pod_names=($(kubectl -n $NAMESPACE get pods | grep $pattern | cut -d ' ' -f1)) + # wait for the md5sum to stabilize in case the data is large + for pod_name in "${pod_names[@]}"; do + for ((i=0; i<=$RETRY_COUNTS; i++)); do + local md5_temp1=$(kubectl -n $NAMESPACE exec -it $pod_name -- /bin/sh -c "md5sum /mnt/data/data" | cut -d ' ' -f1) + sleep ${RETRY_INTERVAL_LONG} + local md5_temp2=$(kubectl -n $NAMESPACE exec -it $pod_name -- /bin/sh -c "md5sum /mnt/data/data" | cut -d ' ' -f1) + if [ "${md5_temp1}" != "${md5_temp2}" ]; then + continue + else + local md5=${md5_temp1} + break + fi + done + msg="${pod_name} data md5: ${md5}" + info "$msg" + echo $pod_name >> $OUTPUT_FILE + echo $md5 >> $OUTPUT_FILE + echo "" >> $OUTPUT_FILE + done +} + +###################################################### +# Workloads +###################################################### +create_deployments() { + local deployment_type=$1 + if [ "${deployment_type}" == "rwo" ]; then + local deployment_replica=1 + local access_mode=$ACCESS_MODE_RWO + local deployment_cnt=$DEPLOYMENT_RWO_COUNTS + local deployment_prefix=$RWO_DEPLOYMENT_WORKLOAD_PREFIX + elif [ "${deployment_type}" == "rwx" ]; then + local deployment_replica=$DEPLOYMENT_RWX_REPLICAS + local access_mode=$ACCESS_MODE_RWX + local deployment_cnt=$DEPLOYMENT_RWX_COUNTS + local deployment_prefix=$RWX_DEPLOYMENT_WORKLOAD_PREFIX + fi + + local command="[\"-c\", \"if [ ! -f /mnt/data/data ]; then dd if=/dev/urandom of=/mnt/data/data bs=1M count=${DATA_SIZE_IN_MB}; fi; trap : TERM INT; sleep infinity & wait\"]" + for (( i=1; i<=$deployment_cnt; i++)) do + local deployment_name="${deployment_prefix}$i" + local pvc_name="pvc-${deployment_name}" + + yq -i e "select(.kind == \"PersistentVolumeClaim\").metadata.name = \"${pvc_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").metadata.namespace = \"${NAMESPACE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.accessModes[0] = \"${access_mode}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.resources.requests.storage = \"${STORAGE_SIZE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"PersistentVolumeClaim\").spec.storageClassName = \"${STORAGE_CLASS_NAME}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.namespace = \"${NAMESPACE}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").metadata.labels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.replicas = ${deployment_replica}" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.selector.matchLabels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.metadata.labels.name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.spec.containers[0].name = \"${deployment_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind == \"Deployment\").spec.template.spec.volumes[0].persistentVolumeClaim.claimName = \"${pvc_name}\"" "${DEPLOYMENT_TEMPLATE}" + yq -i e "select(.kind
== \"Deployment\").spec.template.spec.containers[0].args = ${command}" "${DEPLOYMENT_TEMPLATE}" + kubectl apply -f ${DEPLOYMENT_TEMPLATE} + wait_workload_ready "deployment" $deployment_name $deployment_replica + + done + + record_pod_data $deployment_prefix +} + +create_statefulsets() { + local stateful_type=$1 + local command="[\"-c\", \"dd if=/dev/urandom of=/mnt/data/data bs=1M count=${DATA_SIZE_IN_MB}; trap : TERM INT; sleep infinity & wait\"]" + if [ "${stateful_type}" == "rwo" ]; then + local statefulset_cnt=$STATEFULSET_RWO_REPLICAS + local access_mode=$ACCESS_MODE_RWO + local statefulset_name=$RWO_STATEFULSET_NAME + elif [ "${stateful_type}" == "rwx" ]; then + local statefulset_cnt=$STATEFULSET_RWX_REPLICAS + local access_mode=$ACCESS_MODE_RWX + local statefulset_name=$RWX_STATEFULSET_NAME + fi + + if [ "$statefulset_cnt" -eq 0 ]; then + return + fi + + yq -i e "select(.kind == \"StatefulSet\").metadata.name = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").metadata.namespace = \"${NAMESPACE}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.selector.matchLabels.app = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.serviceName = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.metadata.labels.app = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.replicas = ${statefulset_cnt}" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.spec.containers[0].name = \"${statefulset_name}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.template.spec.containers[0].args = ${command}" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.accessModes[0] = \"${access_mode}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.storageClassName = \"${STORAGE_CLASS_NAME}\"" "${STATEFULSET_TEMPLATE}" + yq -i e "select(.kind == \"StatefulSet\").spec.volumeClaimTemplates[0].spec.resources.requests.storage = \"${STORAGE_SIZE}\"" "${STATEFULSET_TEMPLATE}" + kubectl apply -f ${STATEFULSET_TEMPLATE} + + wait_workload_ready "statefulset" $statefulset_name $statefulset_cnt + record_pod_data $statefulset_name +} + +###################################################### +# Main logics +###################################################### +echo "" > $OUTPUT_FILE +DEPENDENCIES=("kubectl" "yq") +check_local_dependencies "${DEPENDENCIES[@]}" +check_config_input +check_kubernetes_resources +create_statefulsets "rwo" +create_statefulsets "rwx" +create_deployments "rwo" +create_deployments "rwx" diff --git a/test_tools/gen_data/statefulset.yaml b/test_tools/gen_data/statefulset.yaml new file mode 100644 index 0000000000..0478b3aa21 --- /dev/null +++ b/test_tools/gen_data/statefulset.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: test-statefulset + namespace: default +spec: + selector: + matchLabels: + app: test-statefulset + serviceName: test-statefulset + replicas: 2 + template: + metadata: + labels: + app: test-statefulset + spec: + containers: + - name: test-statefulset + image: nginx:stable-alpine + command: ["/bin/sh"] + args: [-c, 'dd if=/dev/urandom of=/mnt/data/data bs=1M count=500; trap : TERM INT; sleep infinity & wait'] + volumeMounts: + - name: pvc + mountPath: /mnt/data + 
volumeClaimTemplates: + - metadata: + name: pvc + spec: + accessModes: + - ReadWriteOnce + storageClassName: longhorn + resources: + requests: + storage: 1Gi \ No newline at end of file
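A minimal usage sketch of the pieces added above, assuming the `ubuntu` Terraform module, an extra public key at `~/.ssh/extra_ci_key.pub`, and that the module's other required variables (AWS credentials and so on) are already configured; the key path and the chosen module are illustrative, not part of the change:

```bash
# Inject an additional SSH public key into the provisioned nodes through the new
# custom_ssh_public_key input variable (standard TF_VAR_ environment mechanism).
export TF_VAR_custom_ssh_public_key="$(cat ~/.ssh/extra_ci_key.pub)"   # illustrative key path
terraform -chdir=test_framework/terraform/aws/ubuntu init
terraform -chdir=test_framework/terraform/aws/ubuntu apply -auto-approve

# Generate test data with the new gen_data tooling, inspect the recorded md5 sums,
# then clean the workloads up again.
cd test_tools/gen_data
yq -i e '.storageClass = "longhorn"' config.yaml   # point at a StorageClass that already exists
./run.sh
cat data.output
./clean.sh
```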