From 607cc880a8a125ede64cd5b8c93b99df3486f915 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Tue, 6 Feb 2024 10:24:17 +0800 Subject: [PATCH 01/42] ci: update disabling pre-upgrade checker setting for argocd Signed-off-by: Yang Chiu --- pipelines/utilities/argocd.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/utilities/argocd.sh b/pipelines/utilities/argocd.sh index cb6d6c0e8d..fa56809e31 100755 --- a/pipelines/utilities/argocd.sh +++ b/pipelines/utilities/argocd.sh @@ -48,8 +48,8 @@ spec: targetRevision: ${REVISION} helm: values: | - helmPreUpgradeCheckerJob: - enabled: false + preUpgradeChecker: + jobEnabled: false destination: server: https://kubernetes.default.svc namespace: ${LONGHORN_NAMESPACE} From 45ca7cd5fdc94db6baa1800db0650eb70f2e4c03 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 22 Jan 2024 15:41:22 +0800 Subject: [PATCH 02/42] test: fix flaky test case test_offline_node Signed-off-by: Yang Chiu --- test_framework/scripts/longhorn-setup.sh | 4 ++++ test_framework/scripts/terraform-setup.sh | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index 691dedabf3..7782f99a25 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -323,6 +323,7 @@ run_longhorn_upgrade_test(){ yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[4].value="'${LONGHORN_UPGRADE_TYPE}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} + RESOURCE_SUFFIX=$(terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw resource_suffix) yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[7].value="'${RESOURCE_SUFFIX}'"' ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} kubectl apply -f ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} @@ -389,6 +390,9 @@ run_longhorn_tests(){ LONGHORN_TEST_POD_NAME=`yq e 'select(.spec.containers[0] != null).metadata.name' ${LONGHORN_TESTS_MANIFEST_FILE_PATH}` + RESOURCE_SUFFIX=$(terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw resource_suffix) + yq e -i 'select(.spec.containers[0] != null).spec.containers[0].env[7].value="'${RESOURCE_SUFFIX}'"' ${LONGHORN_TESTS_MANIFEST_FILE_PATH} + kubectl apply -f ${LONGHORN_TESTS_MANIFEST_FILE_PATH} local RETRY_COUNTS=60 diff --git a/test_framework/scripts/terraform-setup.sh b/test_framework/scripts/terraform-setup.sh index 3d28f7c2c5..6c7beee307 100755 --- a/test_framework/scripts/terraform-setup.sh +++ b/test_framework/scripts/terraform-setup.sh @@ -32,8 +32,6 @@ terraform_setup(){ if [[ "${TF_VAR_create_load_balancer}" == true ]]; then terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw load_balancer_url > ${TF_VAR_tf_workspace}/load_balancer_url fi - - export RESOURCE_SUFFIX=$(terraform -chdir=${TF_VAR_tf_workspace}/terraform/${LONGHORN_TEST_CLOUDPROVIDER}/${DISTRO} output -raw resource_suffix) } From 8b4dce86baa56beb041f4d52c995c326bf0e2971 Mon Sep 17 00:00:00 2001 From: David Ko Date: Wed, 7 Feb 2024 22:45:02 +0800 Subject: [PATCH 03/42] Update mergify.yml Signed-off-by: David Ko --- .github/mergify.yml | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 2dd1aee601..391743c6ed 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -5,9 +5,8 @@ pull_request_rules: - check-success=DCO - 
check-success=CodeFactor - check-success=codespell - - "#approved-reviews-by>=1" - - approved-reviews-by=@longhorn/maintainer - - label=ready-to-merge + - "#approved-reviews-by>=2" + - approved-reviews-by=[@longhorn/maintainer, @longhorn/qa] actions: merge: method: rebase @@ -17,18 +16,4 @@ pull_request_rules: - conflict actions: comment: - message: This pull request is now in conflicts. Could you fix it @{{author}}? 🙏 - -# Comment on the PR to trigger backport. ex: @Mergifyio copy stable/3.1 stable/4.0 -- name: backport patches to stable branch - conditions: - - base=master - actions: - backport: - title: "[BACKPORT][{{ destination_branch }}] {{ title }}" - body: | - This is an automatic backport of pull request #{{number}}. - - {{cherry_pick_error}} - assignees: - - "{{ author }}" \ No newline at end of file + message: This pull request is now in conflict. Could you fix it @{{author}}? 🙏 From b601a899e739c937466454217cc0f2298e877b2a Mon Sep 17 00:00:00 2001 From: David Ko Date: Wed, 7 Feb 2024 23:08:24 +0800 Subject: [PATCH 04/42] Create codespell.yml Signed-off-by: David Ko --- .github/workflows/codespell.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/codespell.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 0000000000..b1ab6b8e6a --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,22 @@ +name: Codespell + +on: + pull_request: + branches: + - master + - main + - "v*.*.*" + +jobs: + codespell: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + - name: Check code spell + uses: codespell-project/actions-codespell@v2 + with: + check_filenames: true + skip: "*/**.yaml,*/**.yml,./scripts,./vendor,MAINTAINERS,LICENSE,go.mod,go.sum" From 2f5a0e99b8c48d8e87a5ce08b981d4918b4c9683 Mon Sep 17 00:00:00 2001 From: Eric Weber Date: Wed, 7 Feb 2024 16:27:03 -0600 Subject: [PATCH 05/42] Fix automated mirroring of livenessprobe Longhorn 7428 Signed-off-by: Eric Weber --- mirror_csi_images/scripts/publish.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mirror_csi_images/scripts/publish.sh b/mirror_csi_images/scripts/publish.sh index 4d918fb099..8ff6bf755c 100755 --- a/mirror_csi_images/scripts/publish.sh +++ b/mirror_csi_images/scripts/publish.sh @@ -9,7 +9,7 @@ if [[ -n "${LONGHORN_IMAGES_FILE_URL}" ]]; then wget "${LONGHORN_IMAGES_FILE_URL}" -O "${LONGHORN_IMAGES_FILE}" while read -r LINE; do - if [[ "${LINE}" =~ "csi-" ]]; then + if [[ "${LINE}" =~ csi-|livenessprobe ]]; then CSI_IMAGE=$(echo "${LINE}" | sed -e "s/longhornio\///g") IFS=: read -ra IMAGE_TAG_PAIR <<< "${CSI_IMAGE}" echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" @@ -23,7 +23,7 @@ else IFS=, read -ra CSI_IMAGES_ARR <<< "${CSI_IMAGES}" for CSI_IMAGE in "${CSI_IMAGES_ARR[@]}"; do IFS=: read -ra IMAGE_TAG_PAIR <<< "$CSI_IMAGE" - if [[ "${CSI_IMAGE}" =~ "csi-" ]]; then + if [[ "${CSI_IMAGE}" =~ csi-|livenessprobe ]]; then echo "registry.k8s.io/sig-storage/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" elif [[ "${CSI_IMAGE}" =~ "support-bundle-kit" ]]; then echo "rancher/${IMAGE_TAG_PAIR[0]}" "longhornio/${IMAGE_TAG_PAIR[0]}" "${IMAGE_TAG_PAIR[1]}" >> "${INFILE}" From 0db8dfe25f1dbd33052e2db3f8ab3f825ec8477d Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 15 Feb 2024 10:34:06 +0800 Subject: [PATCH 06/42] 
Revert "Update mergify.yml" This reverts commit 8b4dce86baa56beb041f4d52c995c326bf0e2971. Signed-off-by: Yang Chiu --- .github/mergify.yml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 391743c6ed..2dd1aee601 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -5,8 +5,9 @@ pull_request_rules: - check-success=DCO - check-success=CodeFactor - check-success=codespell - - "#approved-reviews-by>=2" - - approved-reviews-by=[@longhorn/maintainer, @longhorn/qa] + - "#approved-reviews-by>=1" + - approved-reviews-by=@longhorn/maintainer + - label=ready-to-merge actions: merge: method: rebase @@ -16,4 +17,18 @@ pull_request_rules: - conflict actions: comment: - message: This pull request is now in conflict. Could you fix it @{{author}}? 🙏 + message: This pull request is now in conflicts. Could you fix it @{{author}}? 🙏 + +# Comment on the PR to trigger backport. ex: @Mergifyio copy stable/3.1 stable/4.0 +- name: backport patches to stable branch + conditions: + - base=master + actions: + backport: + title: "[BACKPORT][{{ destination_branch }}] {{ title }}" + body: | + This is an automatic backport of pull request #{{number}}. + + {{cherry_pick_error}} + assignees: + - "{{ author }}" \ No newline at end of file From 7d9378cdafb8984d1aae56f690c2be5c0f1388de Mon Sep 17 00:00:00 2001 From: David Ko Date: Thu, 15 Feb 2024 10:57:33 +0800 Subject: [PATCH 07/42] Update mergify.yml --- .github/mergify.yml | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 2dd1aee601..0da48caa09 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -5,9 +5,8 @@ pull_request_rules: - check-success=DCO - check-success=CodeFactor - check-success=codespell - - "#approved-reviews-by>=1" + - "#approved-reviews-by>=2" - approved-reviews-by=@longhorn/maintainer - - label=ready-to-merge actions: merge: method: rebase @@ -17,18 +16,4 @@ pull_request_rules: - conflict actions: comment: - message: This pull request is now in conflicts. Could you fix it @{{author}}? 🙏 - -# Comment on the PR to trigger backport. ex: @Mergifyio copy stable/3.1 stable/4.0 -- name: backport patches to stable branch - conditions: - - base=master - actions: - backport: - title: "[BACKPORT][{{ destination_branch }}] {{ title }}" - body: | - This is an automatic backport of pull request #{{number}}. - - {{cherry_pick_error}} - assignees: - - "{{ author }}" \ No newline at end of file + message: This pull request is now in conflict. Could you fix it @{{author}}? 
🙏 From 46cfe13e252ebc7cbb8d712fca7e3a783ae5ea5f Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Fri, 16 Feb 2024 09:29:46 +0800 Subject: [PATCH 08/42] test: fix flaky test case test_extra_replica_cleanup Signed-off-by: Yang Chiu --- manager/integration/tests/test_ha.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/integration/tests/test_ha.py b/manager/integration/tests/test_ha.py index 6c1a794515..797d7ee484 100644 --- a/manager/integration/tests/test_ha.py +++ b/manager/integration/tests/test_ha.py @@ -1984,7 +1984,7 @@ def test_extra_replica_cleanup(client, volume_name, settings_reset): # NOQA wait_for_volume_replica_count(client, volume_name, 3) volume = client.by_id_volume(volume_name) - assert volume.robustness == "healthy" + wait_for_volume_healthy(client, volume_name) check_volume_data(volume, data) From b3a6cd6b2572f38e5f1669a33df3d99bd49885f3 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 19 Feb 2024 13:55:02 +0800 Subject: [PATCH 09/42] ci: fix codespell errors Signed-off-by: Yang Chiu --- .github/workflows/codespell.yml | 1 + .../terraform/aws/ubuntu/main.tf | 2 +- .../content/manual/functional-test-cases/backup.md | 6 +++--- .../manual/functional-test-cases/kubernetes.md | 6 +++--- .../manual/functional-test-cases/monitoring.md | 4 ++-- docs/content/manual/functional-test-cases/node.md | 2 +- .../content/manual/functional-test-cases/volume.md | 2 +- .../cluster-restore/restore-to-an-old-cluster.md | 6 +++--- .../node-down/single-replica-node-down.md | 2 +- .../pre-release/node/degraded-availability.md | 4 ++-- .../upgrade/backing-image-during-upgrade.md | 2 +- .../v1.2.0/label-driven-recurring-job.md | 4 ++-- .../v1.2.0/test-backing-image-upload.md | 2 +- .../test-backing-image-checksum-mismatching.md | 2 +- .../v1.3.0/extend_CSI_snapshot_support.md | 2 +- .../v1.6.0/test-engine-version-enforcement.md | 4 ++-- .../test-rebuild-in-meta-blocks-engine-start.md | 2 +- engine/environment-setup/setupRancher.py | 2 +- manager/integration/tests/common.py | 6 +++--- manager/integration/tests/test_backing_image.py | 2 +- manager/integration/tests/test_basic.py | 4 ++-- manager/integration/tests/test_csi_snapshotter.py | 8 ++++---- manager/integration/tests/test_engine_upgrade.py | 2 +- manager/integration/tests/test_ha.py | 12 ++++++------ manager/integration/tests/test_infra.py | 2 +- manager/integration/tests/test_metric.py | 2 +- manager/integration/tests/test_node.py | 6 +++--- manager/integration/tests/test_rwx.py | 4 ++-- manager/integration/tests/test_scheduling.py | 4 ++-- manager/integration/tests/test_settings.py | 4 ++-- manager/integration/tests/test_statefulset.py | 6 +++--- scalability_test/script/monitor.py | 14 +++++++------- scalability_test/script/scale-test.py | 2 +- secscan/terraform/aws/main.tf | 2 +- test_framework/terraform/aws/centos/main.tf | 4 ++-- test_framework/terraform/aws/oracle/main.tf | 4 ++-- test_framework/terraform/aws/rhel/main.tf | 4 ++-- test_framework/terraform/aws/rockylinux/main.tf | 4 ++-- test_framework/terraform/aws/sles/main.tf | 4 ++-- test_framework/terraform/aws/ubuntu/main.tf | 4 ++-- test_tools/gen_data/README.md | 2 +- test_tools/gen_data/run.sh | 2 +- 42 files changed, 82 insertions(+), 81 deletions(-) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index b1ab6b8e6a..3e239f5e31 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -20,3 +20,4 @@ jobs: with: check_filenames: true skip: 
"*/**.yaml,*/**.yml,./scripts,./vendor,MAINTAINERS,LICENSE,go.mod,go.sum" + ignore_words_list: aks diff --git a/build_engine_test_images/terraform/aws/ubuntu/main.tf b/build_engine_test_images/terraform/aws/ubuntu/main.tf index 82dab61c9f..70ece8d133 100644 --- a/build_engine_test_images/terraform/aws/ubuntu/main.tf +++ b/build_engine_test_images/terraform/aws/ubuntu/main.tf @@ -99,7 +99,7 @@ resource "aws_route_table" "build_engine_aws_public_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "build_engine_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.build_engine_aws_public_subnet, diff --git a/docs/content/manual/functional-test-cases/backup.md b/docs/content/manual/functional-test-cases/backup.md index 9795af967c..ea366600e9 100644 --- a/docs/content/manual/functional-test-cases/backup.md +++ b/docs/content/manual/functional-test-cases/backup.md @@ -28,7 +28,7 @@ Backup create operations test cases |-----| --- | --- | --- | | 1 | Create backup from existing snapshot | **Prerequisite:**

* Backup target is set to NFS server, or S3 compatible target.

1. Create a workload using Longhorn volume
2. Write data to volume, compute it’s checksum (checksum#1)
3. Create a snapshot (snapshot#1)
4. Create a backup from (snapshot#1)
5. Restore backup to a different volume
6. Attach volume to a node and check it’s data, and compute it’s checksum | * Backup should be created
* Restored volume data checksum should match (checksum#1) | | 2 | Create volume backup for a volume attached to a node | **Prerequisite:**

* Backup target is set to NFS server, or S3 compatible target.

1. Create a volume, attach it to a node
2. Format volume using ext4/xfs filesystem and mount it to a directory on the node
3. Write data to volume, compute it’s checksum (checksum#1)
4. Create a backup
5. Restore backup to a different volume
6. Attach volume to a node and check it’s data, and compute it’s checksum
7. Check volume backup labels | * Backup should be created
* Restored volume data checksum should match (checksum#1)
* backup should have no backup labels | -| 3 | Create volume backup used by Kubernetes workload | **Prerequisite:**

* Backup target is set to NFS server, or S3 compatible target.

1. Create a deployment workload with `nReplicas = 1` using Longhorn volume
2. Write data to volume, compute it’s checksum (checksum#1)
3. Create a backup
4. Check backup labels
5. Scale down deployment `nReplicas = 0`
6. Delete Longhorn volume
7. Restore backup to a volume with the same deleted volume name
8. Scale back deployment `nReplicas = 1`
9. Check volume data checksum | * Backup labels should contain the following informations about workload that was using the volume at time of backup.
* Namespace

* PV Name

* PVC Name

* PV Status

* Workloads Status

* Pod Name
Workload Name
Workload Type
Pod Status

* After volume restore, data checksum should match (checksum#1) | +| 3 | Create volume backup used by Kubernetes workload | **Prerequisite:**

* Backup target is set to NFS server, or S3 compatible target.

1. Create a deployment workload with `nReplicas = 1` using Longhorn volume
2. Write data to volume, compute it’s checksum (checksum#1)
3. Create a backup
4. Check backup labels
5. Scale down deployment `nReplicas = 0`
6. Delete Longhorn volume
7. Restore backup to a volume with the same deleted volume name
8. Scale back deployment `nReplicas = 1`
9. Check volume data checksum | * Backup labels should contain the following information about workload that was using the volume at time of backup.
* Namespace

* PV Name

* PVC Name

* PV Status

* Workloads Status

* Pod Name
Workload Name
Workload Type
Pod Status

* After volume restore, data checksum should match (checksum#1) | | 4 | Create volume backup with customized labels | **Prerequisite:**

* Backup target is set to NFS server, or S3 compatible target.

1. Create a volume, attach it to a node
2. Create a backup, add customized labels
key: `K1` value: `V1`
3. Check volume backup labels | * Backup should be created with customized labels | | 5 | Create recurring backups | 1. Create a deployment workload with `nReplicas = 1` using Longhorn volume
2. Write data to volume , compute it’s checksum (checksum#1)
3. Create a recurring backup `every 5 minutes`. and set retain count to `5`
4. add customized labels key: `K1` value: `V1`
5. Wait for recurring backup to triggered (backup#1, backup#2 )
6. Scale down deployment `nReplicas = 0`
7. Delete the volume.
8. Restore backup to a volume with the same deleted volume name
9. Scale back deployment `nReplicas = 1`
10. Check volume data checksum | * backups should be created with Kubernetes status labels and customized labels
* After volume restore, data checksum should match (checksum#1)
* after restoring the backup recurring backups should continue to be created | | 6 | Backup created using Longhorn behind proxy | **Prerequisite:**

* Setup a Proxy on an instance (Optional: use squid)
* Create a single node cluster in EC2
* Deploy Longhorn

1. Block outgoing traffic except for the proxy instance.
2. Create AWS secret in longhorn.
3. In UI Settings page, set backupstore target and backupstore credential secret
4. Create a volume, attach it to a node, format the volume, and mount it to a directory.
5. Write some data to the volume, and create a backup. | * Ensure backup is created | @@ -99,7 +99,7 @@ Disaster Recovery test cases | DR volume across the cluster #5 | Cluster A:

* Create volume Y
* Attach the volume Y
* Create a backup of Y

Cluster B:

* Backup Volume list page, click \`Create Disaster Recovery Volume\` from volume dropdown
* Create two DR volumes Ydr1 and Ydr2.
* Attach the volume Y to any node
* Mount the volume Y on the node
* Write a file of 10Mb into it, use \`/dev/urandom\` to generate the file
* Calculate the checksum of the file
* Make a Backup
* Attach Ydr1 and Ydr2 to any nodes | * DR volume's last backup should be updated automatically, after settings.BackupPollInterval passed.
* DR volume.LastBackup should be different from DR volume's controller\[0\].LastRestoredBackup temporarily (it's restoring the last backup)
* During the restoration, DR volume cannot be activated.
* Eventually, DR volume.LastBackup should equal to controller\[0\].LastRestoredBackup. | | DR volume across the cluster #6 | \[follow #5\]
Cluster A:

* In the directory mounted volume Y, write a new file of 100Mb.
* Record the checksum of the file
* Create a backup of volume Y

Cluster B:

* Wait for restoration of volume Ydr1 and Ydr2 to complete
* Activate Ydr1
* Attach it to one node and verify the content | * DR volume's last backup should be updated automatically, after settings.BackupPollInterval passed.
* Eventually, DR volume.LastBackup should equal to controller\[0\].LastRestoredBackup.
* Ydr1 should have the same file checksum of volume Y | | DR volume across the cluster #7 | \[follow #6\]
Cluster A

* In the directory mounted volume Y, remove all the files. Write a file of 50Mb
* Record the checksum of the file

Cluster B

* Change setting.BackupPollInterval to longer e.g. 1h

Cluster A

* Create a backup of volume Y

Cluster B
\[DO NOT CLICK BACKUP PAGE, which will update last backup as a side effect\]

* Before Ydr2's last backup updated, activate Ydr2 | * YBdr2's last backup should be immediately updated to the last backup of volume Y
* Activate should fail due to restoration is in progress | When user clicks on “activate DRV”, restoration happens

And the volume goes into detached state | -| DR volume across the cluster #8 | Cluster A

* Create volume Z
* Attach the volume Z
* Create a backup of Z

Cluster B

* Backup Volume list page, click \`Create Disaster Recovery Volume\` from volume dropdown
* Create DR volumes Zdr1, Zdr2 and Zdr3
* Attach the volume Zdr1, Zdr2 and Zdr3 to any node
* Change setting.BackupPollInterval to approriate interval for multiple backups e.g. 15min
* Make sure LastBackup of Zdr is consistent with that of Z

Cluster A

* Create multiple backups for volume Z before Zdr's last backup updated. For each backup, write or modify at least one file then record the cheksum.

Cluster B

* Wait for restoration of volume Zdr1 to complete
* Activate Zdr1
* Attach it to one node and verify the content | * Zdr1's last backup should be updated after settings.BackupPollInterval passed.
* Zdr1 should have the same files with the the same checksums of volume Z | +| DR volume across the cluster #8 | Cluster A

* Create volume Z
* Attach the volume Z
* Create a backup of Z

Cluster B

* Backup Volume list page, click \`Create Disaster Recovery Volume\` from volume dropdown
* Create DR volumes Zdr1, Zdr2 and Zdr3
* Attach the volume Zdr1, Zdr2 and Zdr3 to any node
* Change setting.BackupPollInterval to appropriate interval for multiple backups e.g. 15min
* Make sure LastBackup of Zdr is consistent with that of Z

Cluster A

* Create multiple backups for volume Z before Zdr's last backup updated. For each backup, write or modify at least one file then record the checksum.

Cluster B

* Wait for restoration of volume Zdr1 to complete
* Activate Zdr1
* Attach it to one node and verify the content | * Zdr1's last backup should be updated after settings.BackupPollInterval passed.
* Zdr1 should have the same files with the same checksums of volume Z |
| DR volume across the cluster #9 | \[follow #8\]
Cluster A

* Delete the latest backup of Volume Z | * Last backup of Zdr2 and Zdr3 should be empty after settings.BackupPollInterval passed. Field controller\[0\].LastRestoredBackup and controller\[0\].RequestedBackupRestore should retain. | | DR volume across the cluster #10 | \[follow #9\]
Cluster B

* Activate Zdr2
* Attach it to one node and verify the content | * Zdr2 should have the same files with the the same checksums of volume Z | | | DR volume across the cluster #11 | \[follow #10\]
Cluster A

* Create one more backup with at least one file modified.

Cluster B

* Wait for restoration of volume Zdr3 to complete
* Activate Zdr3
* Attach it to one node and verify the content | * Zdr3 should have the same files with the the same checksums of volume Z | @@ -150,7 +150,7 @@ The setup requirements: | 4 | Delete the backup with `DeletionPolicy` as delete | 1. Repeat the steps from test scenario 1.
2. Delete the `VolumeSnapshot` using `kubectl delete volumesnapshots test-snapshot-pvc` | 1. The `VolumeSnapshot` should be deleted.
2. By default the `DeletionPolicy` is delete, so the `VolumeSnapshotContent` should be deleted.
3. Verify in the backup store, the backup should be deleted. | | 5 | Delete the backup with `DeletionPolicy` as retain | 1. Create a `VolumeSnapshotClass` class with `deletionPolicy` as Retain
kind: VolumeSnapshotClass
apiVersion: snapshot.storage.k8s.io/v1beta1
metadata:
name: longhorn
driver: driver.longhorn.io
deletionPolicy: Retain
2. Repeat the steps from test scenario 1.
3. Delete the `VolumeSnapshot` using `kubectl delete volumesnapshots test-snapshot-pvc` | 1. The `VolumeSnapshot` should be deleted.
2. `VolumeSnapshotContent` should NOT be deleted.
3. Verify in the backup store, the backup should NOT be deleted. | | 6 | Take a backup from longhorn of a snapshot created by csi snapshotter. | 1. Create a volume test-vol and write into it.
1. Compute the md5sum

2. Create the below `VolumeSnapshot` object
apiVersion: snapshot.storage.k8s.io/v1beta1
kind: VolumeSnapshot
metadata:
name: test-snapshot-pvc
spec:
volumeSnapshotClassName: longhorn
source:
persistentVolumeClaimName: test-vol
3. Go to longhorn UI and click on the snapshot created and take another backup | 1. On creating a `VolumeSnapshot`, a backup should be created in the backup store.
2. On creating another backup from longhorn UI, one more backup should be created in backup store. | -| 7 | Delete the `csi plugin` while a backup is in progress. | 1. Create a volume and write into it.
Compute the md5sum of the data.
2. Create the below `VolumeSnapshot` object
apiVersion: snapshot.storage.k8s.io/v1beta1
kind: VolumeSnapshot
metadata:

name: test-snapshot-pvc
spec:
volumeSnapshotClassName: longhorn
source:
persistentVolumeClaimName: test-vol
3. While the backup is in progress, delete the `csi plugin` | On deleting `csi plugin` , a new pod of `csi plugin` should get created and the bacup should continue to complete. | +| 7 | Delete the `csi plugin` while a backup is in progress. | 1. Create a volume and write into it.
Compute the md5sum of the data.
2. Create the below `VolumeSnapshot` object
apiVersion: snapshot.storage.k8s.io/v1beta1
kind: VolumeSnapshot
metadata:

name: test-snapshot-pvc
spec:
volumeSnapshotClassName: longhorn
source:
persistentVolumeClaimName: test-vol
3. While the backup is in progress, delete the `csi plugin` | On deleting `csi plugin` , a new pod of `csi plugin` should get created and the backup should continue to complete. | | 8 | Take a backup using csi snapshotter with backup store as NFS server. | | | | 9 | Restore from NFS backup store. | | | | 10 | Delete from NFS backup store. | | | diff --git a/docs/content/manual/functional-test-cases/kubernetes.md b/docs/content/manual/functional-test-cases/kubernetes.md index 5f13237538..a4c603d61f 100644 --- a/docs/content/manual/functional-test-cases/kubernetes.md +++ b/docs/content/manual/functional-test-cases/kubernetes.md @@ -43,13 +43,13 @@ title: 5. Kubernetes | 2 | Persistent Volume: Create a PV | **Pre condition:**

* Longhorn is deployed in the cluster

**Steps:**

1. Create a Volume in Longhorn UI `test-volume`
2. Go to cluster → Storage → Persistent Volumes
3. Click on Add PV
4. Select Volume Plugin **Longhorn**
5. Give in other required parameters including replica count.
6. Give in Volume Plugin - `test-volume` which an existing volume in longhorn
7. Click on **Save**.
8. Verify **test-1** PV is created
9. Go to Cluster → Project (default) → Workloads
10. Deploy a workload
11. In the Volumes section → Add a New Volume Claim → Use an existing persistent volume → Select **test-1** from PV dropdown.
12. Click on Define
13. Enter Mount Point.
14. Click on create workload
15. Verify workload is created successfully.
16. Volume gets attached to the pod in the workload
17. Navigate to Longhorn UI.
18. Verify user is able to view the volume attached to the workload in the UI
19. Navigate to volume details page of the volume and Verify the replica count mentioned in Step 4 is available | * Longhorn PV should be created
* Workload should be deployed with the volume mounted from the PV
* Verify volume is available on the Longhorn UI.
* Verify the replica count is as mentioned during storage class creation. | | 3 | Create Storage class in Rancher; From Longhorn create volumes from this storage class. | **Pre condition:**

* Longhorn is deployed in the cluster

**Steps:**

1. Go to cluster → Storage → Storage Classes
2. Click on Add class
3. Select Provisioner **Longhorn**
4. Give in other required parameters including replica count.
5. Click on **Save**.
6. Verify **test-1** storage class is created
7. Go to Longhorn UI
8. In the Settings page for “Default Longhorn Static StorageClass Name”, give in the value: “test-1”
9. Go to Volumes page, click on create volume.
10. Create a volume name : v1
11. Verify v1 is created
12. using kubectl -
13. kubectl get pv -o yaml
14. Verify “storageClassName:” ---> test-1 | * Longhorn storage class should be created
* Value of Default Longhorn Static StorageClass Name should be changed in the settings page
* volume should be created in longhorn UI
* “storageClassName:” value should be **test-1** | | 4 | Create Storage Class using backup URL | 1. Create volume and PV/PVC/POD in Longhorn
2. Write `test_data` into pod
3. Create a snapshot and back it up. Get the backup URL
4. Create a new StorageClass `longhorn-from-backup` in rancher and set backup URL.
5. Use `longhorn-from-backup` to create a new PVC
6. Wait for the volume to be created and complete the restoration.
7. Create the pod using the PVC. Verify the data | | -| 5 | Create Storage class - by using different values for the input list of paramters | **Pre condition:**

* Longhorn is deployed in the cluster

**Steps:**

1. Go to cluster → Storage → Storage Classes
2. Click on Add class
3. Select Provisioner **Longhorn**
4. Give in other required parameters.
5. Click on **Save**.
6. Use this storage class to create a PVC and deploy in a workload.
7. Verify the parameters of the volume created. | Volume parameters should match the storage class paramaters. | +| 5 | Create Storage class - by using different values for the input list of parameters | **Pre condition:**

* Longhorn is deployed in the cluster

**Steps:**

1. Go to cluster → Storage → Storage Classes
2. Click on Add class
3. Select Provisioner **Longhorn**
4. Give in other required parameters.
5. Click on **Save**.
6. Use this storage class to create a PVC and deploy in a workload.
7. Verify the parameters of the volume created. | Volume parameters should match the storage class parameters. | | 6 | StorageClass with `reclaimPolicy` parameter set to `Delete` - PVC from storage class | **Pre conditions:**

* Create PVC from “Longhorn” storage class in rancher.
* It will have a dynamic PV bound

**Steps**:

1. 'Delete PVC from Rancher
2. Verify PVC is deleted
3. Verify PV bound to this PVC is deleted - Rancher → Cluster → Storage → PV
4. Verify the volume(Dynamic PV) in Longhorn is deleted | | | 7 | Volume/PV/PVC created in Longhorn | **Pre conditions:**

* Create volume, PV, PVC in longhorn

**Steps:**

1. 'Delete PVC from Rancher
2. Verify PVC is deleted
3. PV will NOT. be deleted but be in “released” state in Rancher UI
4. Verify Volume does not get deleted | | | 8 | StorageClass with `reclaimPolicy` parameter set to `Retain` - PVC from storage class | **Pre conditions:**

* Create PVC from “Longhorn” storage class in rancher.
* It will have a dynamic PV bound

**Steps**:

1. 'Delete PVC from Rancher
2. Verify PVC is deleted
3. Verify PV bound to this PVC is NOT deleted - Rancher → Cluster → Storage → PV
4. Verify the volume(Dynamic PV) in Longhorn is NOT deleted | | | 9 | StorageClass with `reclaimPolicy` parameter set to `Retain` - Volume/PV/PVC created in Longhorn | **Pre conditions:**

* Create volume, PV, PVC in longhorn

**Steps:**

1. 'Delete PVC from Rancher
2. Verify PVC is deleted
3. PV will NOT. be deleted but be in “released” state in Rancher UI
4. Verify Volume does not get deleted | | -| 10 | Power down node | 1. Power down
2. Replica migrates
3. Power back on
4. Verify if the replicas in the node have been deleted | * When a node is powered down, the replica is rebuilt on the 4th wrker node.
* When the node is powered back on, and the replica on the powered down node is not available in Longhorn UI anymore, there is no data in `/var/lib/longhorn/replicas` folder in the powered on node. | -| 11 | Power down node with. Node tag/disk tag | 1. Add a node tag/disk tag
2. Power down
3. Replica cannot migrate
4. Power back on
5. Replica should get rebuilt on this node | * When a node is powered down, the replica is rebuilt on the 4th wrker node.
* When the node is powered back on, and the replica on the powered down node is not available in Longhorn UI anymore, there is no data in `/var/lib/longhorn/replicas` folder in the powered on node.
* The new replica is rebuilt on a node which has a tag. | +| 10 | Power down node | 1. Power down
2. Replica migrates
3. Power back on
4. Verify if the replicas in the node have been deleted | * When a node is powered down, the replica is rebuilt on the 4th worker node.
* When the node is powered back on, and the replica on the powered down node is not available in Longhorn UI anymore, there is no data in `/var/lib/longhorn/replicas` folder in the powered on node. | +| 11 | Power down node with. Node tag/disk tag | 1. Add a node tag/disk tag
2. Power down
3. Replica cannot migrate
4. Power back on
5. Replica should get rebuilt on this node | * When a node is powered down, the replica is rebuilt on the 4th worker node.
* When the node is powered back on, and the replica on the powered down node is not available in Longhorn UI anymore, there is no data in `/var/lib/longhorn/replicas` folder in the powered on node.
* The new replica is rebuilt on a node which has a tag. | | 12 | Drain a node | 1. Drain use case — drain a worker node 
2. Check if the State of the node reflects in the Longhorn UI —> Node
3. Verify if replica is rebuilt on another node? 
4. Verify if the pod migrates
5. And the volume get migrated | All the components should be successfully drained. | | 13 | kubectl - force drain | Using kubectl - force drain a node where the pod with the volume attached is available

Have snapshots before

Verify data after pod migrates | Volume attaches on the new pod

2 of the 3 replicas are in “Stopped” state - Caused replica rebuild. | | 14 | Cordon a node | 1. Cordon state - cordon a worker node | | diff --git a/docs/content/manual/functional-test-cases/monitoring.md b/docs/content/manual/functional-test-cases/monitoring.md index ef0bcd4dce..1fd2b42146 100644 --- a/docs/content/manual/functional-test-cases/monitoring.md +++ b/docs/content/manual/functional-test-cases/monitoring.md @@ -157,8 +157,8 @@ spec: | 6 | longhorn\_instance\_manager\_cpu\_usage\_millicpu | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create a volume and attach it to a pod.2. Write 1 Gi data into it.3. Set multiple recurring backup on the volume.4. Go to Prometheus web UI.5. Select `longhorn_instance_manager_cpu_usage_millicpu` and execute. | 1. The reading of cpu\_usage should be shown correctly2. The reading of other instance managers should not get impacted. | | 7 | longhorn\_instance\_manager\_memory\_requests\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create a volume and attach it to a pod.2. Write 1 Gi data into it.3. Set multiple recurring backup on the volume.4. Go to Prometheus web UI.5. Select `longhorn_instance_manager_memory_requests_bytes` and execute. | 1. The reading of memory\_requests should go up for the attached instance manager.2. The reading of other instance managers should not get impacted. | | 8 | longhorn\_instance\_manager\_memory\_usage\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create a volume and attach it to a pod.2. Write 1 Gi data into it.3. Set multiple recurring backup on the volume.4. Go to Prometheus web UI.5. Select `longhorn_instance_manager_memory_usage_bytes` and execute. | 1. The reading of memory\_usage should go up for the attached instance manager.2. The reading of other instance managers should not get impacted. | -| 9 | longhorn\_manager\_cpu\_usage\_millicpu | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create 3 volumes of different sizes.2. Attach 1st volume to a pod and write 1 Gi data into it.3. Leave the 2rd volume to the detached state.4. Attach the 3th volume to pod and write 1.5 Gi data into it. Attach the volume in maintenance mode.5. Set a recurring backup on volume 1st.6. Perform revert to snapshot with 3rd volume.7. Go to Prometheus web UI.8. Select `longhorn_manager_cpu_usage_millicpu` and execute. | 1. Monitor the graph and the console on the Prometheus server, the cpu\_usage should go up. | -| 10 | longhorn\_manager\_memory\_usage\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create 3 volumes of different sizes.2. Attach 1st volume to a pod and write 1 Gi data into it.3. Leave the 2rd volume to the detached state.4. Attach the 3th volume to pod and write 1.5 Gi data into it. Attach the volume in maintenance mode.5. Set a recurring backup on volume 1st.6. Perform revert to snapshot with 3rd volume.7. Try to make disk full of a node where `longhorn-manager` is running.8. Go to Prometheus web UI.9. Select `longhorn_manager_memory_usage_bytes` and execute. | 1. Monitor the graph and the console on the Prometheus server, the memory\_usage should go up. | +| 9 | longhorn\_manager\_cpu\_usage\_millicpu | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create 3 volumes of different sizes.2. Attach 1st volume to a pod and write 1 Gi data into it.3. Leave the 2nd volume to the detached state.4. Attach the 3th volume to pod and write 1.5 Gi data into it. Attach the volume in maintenance mode.5. Set a recurring backup on volume 1st.6. Perform revert to snapshot with 3rd volume.7. Go to Prometheus web UI.8. Select `longhorn_manager_cpu_usage_millicpu` and execute. | 1. Monitor the graph and the console on the Prometheus server, the cpu\_usage should go up. | +| 10 | longhorn\_manager\_memory\_usage\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create 3 volumes of different sizes.2. Attach 1st volume to a pod and write 1 Gi data into it.3. Leave the 2nd volume to the detached state.4. Attach the 3th volume to pod and write 1.5 Gi data into it. Attach the volume in maintenance mode.5. Set a recurring backup on volume 1st.6. Perform revert to snapshot with 3rd volume.7. Try to make disk full of a node where `longhorn-manager` is running.8. Go to Prometheus web UI.9. Select `longhorn_manager_memory_usage_bytes` and execute. | 1. Monitor the graph and the console on the Prometheus server, the memory\_usage should go up. | | 11 | longhorn\_disk\_capacity\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create volumes and attach them to each node.2. Add an additional disk to all the nodes. (Different size)3. Write into the volumes.4. Power down a node.5. Disable a node.6. Add a new node in the cluster.7. Delete a node from the cluster.8. Go to Prometheus web UI.9. Select `longhorn_disk_capacity_bytes` and execute. | 1. All the disks should be identified by Prometheus.2. All the disks should show the correct total size of the disks. | | 12 | longhorn\_disk\_usage\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create volumes and attach them to each node.2. Add an additional disk to all the nodes. (Different size)3. Write into the volumes.4. Power down a node.5. Disable a node.6. Add a new node in the cluster.7. Delete a node from the cluster.8. Go to Prometheus web UI.9. Select `longhorn_disk_usage_bytes` and execute. | 1. All the disks should be identified by Prometheus.2. All the disks should show the occupied size of the disks. | | 13 | longhorn\_node\_capacity\_bytes | **Pre-requisite:**

1. Prometheus is setup is done and Prometheus web UI is accessible.
**Test Steps:**

1. Create volumes and attach them to each node.2. Add an additional disk to all the nodes. (Different size)3. Write into the volumes.4. Power down a node.5. Disable a node.6. Add a new node in the cluster.7. Delete a node from the cluster.8. Go to Prometheus web UI.9. Select `longhorn_node_capacity_bytes` and execute. | 1. All the nodes should be identified by Prometheus.2. All the nodes should show the total capacity available of disks available. | diff --git a/docs/content/manual/functional-test-cases/node.md b/docs/content/manual/functional-test-cases/node.md index 645c7467d1..1129ea6a7e 100644 --- a/docs/content/manual/functional-test-cases/node.md +++ b/docs/content/manual/functional-test-cases/node.md @@ -24,7 +24,7 @@ Test cases | | **Test Case** | **Test Instructions** | **Expected Results** | | --- | --- | --- | --- | | 1 | Node scheduling | * **Prerequisites:**
* Longhorn Deployed with 3 nodes


1. Disable Node Scheduling on a node
2. Create a volume with 3 replicas, and attach it to a node
3. Re-enabled node scheduling on the node | * Volume should be created and attached
* Volume replicas should be scheduled to Schedulable nodes only
* Re-enabling node scheduling will not affect existing scheduled replicas, it will only affect new replicas being created, or rebuilt. | | -| 2 | Disk Scheduling | * **Prerequisites:**
* Longhorn Deployed with 3 nodes

* Add additional disk (Disk#1) ,attach it and mounted to Node-01.


1. Create a New Disk, Keep Disk Scheduling disabled
2. Create a volume (vol#1), set replica count to `4` and attach it to a node
3. Check (vol#1) replica paths
4. Enable Scheduling on (disk#1)
5. Create a volume (vol#2), set replica count to `4` and attach it to a node
6. Check (vol#2) replica paths | * (vol#1) replicas should be scheduled only to Disks withe Scheduling enabled, no replicas should be scheduled to (disk#1)
* One of (vol#2) replica paths will be scheduled to (disk#1) | Pass

Case of vol#2 - Not necessarily replica will exists on disk#1 provided soft anti affinity is enabled. It might scheduled on disk#1 | +| 2 | Disk Scheduling | * **Prerequisites:**
* Longhorn Deployed with 3 nodes

* Add additional disk (Disk#1) ,attach it and mounted to Node-01.


1. Create a New Disk, Keep Disk Scheduling disabled
2. Create a volume (vol#1), set replica count to `4` and attach it to a node
3. Check (vol#1) replica paths
4. Enable Scheduling on (disk#1)
5. Create a volume (vol#2), set replica count to `4` and attach it to a node
6. Check (vol#2) replica paths | * (vol#1) replicas should be scheduled only to Disks with Scheduling enabled, no replicas should be scheduled to (disk#1)
* One of (vol#2) replica paths will be scheduled to (disk#1) | Pass

Case of vol#2 - Not necessarily replica will exists on disk#1 provided soft anti affinity is enabled. It might scheduled on disk#1 | | 3 | Volume Created with Node Tags | * **Prerequisites:**
* Longhorn Deployed with 3 nodes


1. Create Node tags as follows:
1. Node-01: fast

2. Node-02: slow

3. Node-02: fast

2. Create a volume (vol#1), set Node tags to slow
3. Create a volume (vol#2), set Node tags to fast
4. Check Volumes replicas paths
5. Check Volume detail `Node Tags` | * vol#1 replicas should only be scheduled to Node-02
* vol#2 replicas should only be scheduled to Node-01 and Node-03
* Node Tag volume detail should contain Node tag specified in volume creation request. | | 4 | Volumes created with Disk Tags | * **Prerequisites:**
* Longhorn Deployed with 3 nodes, with default disks (disk#01-1, disk#02-1, disk#03-1)

* `disk#0X-Y` indicate that disk is attached to `Node-0X` , and it is disk number `Y` on that node.

* Create 3 additional disks (disk#01-2, disk#02-2, disk#03-2), attach each one to a different node, and mount it to a directory on that node.

1. Create Disk tags as follows:
1. disk#01-1: fast

2. disk#01-2: fast

3. disk#02-1: slow

4. disk#02-2: slow

5. disk#03-1: fast

6. disk#01-2: fast

2. Create a volume (vol#1), set Disk tags to slow
3. Create a volume (vol#2), set Disk tags to fast
4. Check Volumes replicas paths
5. Check Volume detail `Disk Tags` | * vol#1 replicas should only be scheduled to disks have slow tag (disk#02-1 and disk#02-2)
* vol#2 replicas should can be scheduled to disks have fast Tag
(disk#01-1, disk#01-2, disk#03-1, disk#03-2)
* Disk Tag volume detail should contain Disk tag specified in volume creation request. | | 5 | Volumes created with both DIsk and Node Tags | * Create a volume, set Disk and node tags, and attach it to a node | * Volume replicas should be scheduled only to node that have Node tags, and only on disks that have Disk tags specified on volume creation request
* If No Node match both Node and Disk tags, volume replicas will not be created. | diff --git a/docs/content/manual/functional-test-cases/volume.md b/docs/content/manual/functional-test-cases/volume.md index 9692941fd9..c9ccefb14f 100644 --- a/docs/content/manual/functional-test-cases/volume.md +++ b/docs/content/manual/functional-test-cases/volume.md @@ -12,7 +12,7 @@ title: 3. Volume | 5 | Attach multiple volumes in maintenance mode | * **Prerequisite:**
* Create multiple volumes

1. Select multiple volumes and Attach them to a node in maintenance mode | * All Volumes should be attached in maintenance mode to the same node specified in volume attach request. | | 6 | Detach multiple volumes | * **Prerequisite:**
* Multiple attached volumes
* Select multiple volumes and detach | * Volumes should be detached | | 7 | Backup multiple Volumes | * **Prerequisite:**
* Longhorn should be configured to point to a backupstore
* Multiple volumes existed and attached to node/used buy kubernetes workload
* Write some data to multiple volumes and compute it’s checksum
* Select multiple volumes and Create a backup
* restore volumes backups and check its data checksum | * Volume backups should be created
* Restored volumes from backup should contain the same data when backup is created | -| 8 | Create PV/PVC for multiple volumes | **Prerequisite:**

* Create multiple volumes

1. Select multiple volumes
2. Create a PV, specify filesysem
3. Check PV in Lonhgorn UI and in Kubernetes
4. Create PVC
5. Check PVC in Lonhgorn UI and in Kubernetes
6. Delete PVC
7. Check PV in Lonhgorn UI and in Kubernetes | * For all selected volumes
* PV should created
* PV/PVC status in UI should be `Available`
* PV `spec.csi.fsType` should match filesystem specified in PV creation request
* PV `spec.storageClassName` should match the setting in `Default Longhorn Static StorageClass Name`
* PV `spec.csi.volumeHandle` should be the volume name
* PV/PVC status in UI should be `Bound` in Longhorn UI
* PVC namespace should match namespace specified in PVC creation request
* After Deleting PVC, PV/PVC status should be `Relased` in Longhorn UI. | +| 8 | Create PV/PVC for multiple volumes | **Prerequisite:**

* Create multiple volumes

1. Select multiple volumes
2. Create a PV, specify filesystem
3. Check PV in Longhorn UI and in Kubernetes
4. Create PVC
5. Check PVC in Longhorn UI and in Kubernetes
6. Delete PVC
7. Check PV in Longhorn UI and in Kubernetes | * For all selected volumes
* PV should be created
* PV/PVC status in UI should be `Available`
* PV `spec.csi.fsType` should match filesystem specified in PV creation request
* PV `spec.storageClassName` should match the setting in `Default Longhorn Static StorageClass Name`
* PV `spec.csi.volumeHandle` should be the volume name
* PV/PVC status in UI should be `Bound` in Longhorn UI
* PVC namespace should match namespace specified in PVC creation request
* After Deleting PVC, PV/PVC status should be `Released` in Longhorn UI. | | 9 | Volume expansion | Check Multiple Volume expansion test cases work for multiple volumes

[Test Cases in Volume Details page](https://rancher.atlassian.net/wiki/spaces/LON/pages/354453117/Volume+detail+page) | Volume expansion should work for multiple volumes. | | 10 | Engine Offline Upgrade For Multiple Volumes | **Prerequisite:**

* Volume is consumed by Kubernetes deployment workload
* Volume use old Longhorn Engine

1. Write data to volume, compute it’s checksum (checksum#1)
2. Scale down deployment , volume gets detached
3. Upgrade Longhorn engine image to use new deployed engine image
4. Scale up deployment, volume gets attached | * Volume read/write operations should work before and after engine upgrade.
* Old Engine `Reference Count` will be decreased by 1
* New Engine `Reference Count` will be increased by 1 | | 12 | Show System Hidden | **Prerequisite**:

* Volume is created and attached to a pod.

1. Click the volume appearing on volume list page, it takes user to volume.
2. Take snapshot and upgrade the replicas.
3. Under snapshot section, enable option 'Show System Hidden | Enabling this option will show system created snapshots while rebuilding of replicas. | diff --git a/docs/content/manual/pre-release/cluster-restore/restore-to-an-old-cluster.md b/docs/content/manual/pre-release/cluster-restore/restore-to-an-old-cluster.md index 10fb2f80cf..838522dd97 100644 --- a/docs/content/manual/pre-release/cluster-restore/restore-to-an-old-cluster.md +++ b/docs/content/manual/pre-release/cluster-restore/restore-to-an-old-cluster.md @@ -35,15 +35,15 @@ This test may need to be validated for both kind of cluster. 4. Deploy a StatefulSet with volume D. Write some data and do some snapshot operations. (Validate 2 cases: <1> volume can be recovered automatically if some replicas are removed and some new replicas are replenished; <2> snapshot info will be resynced;) 5. Deploy a Deployment with volume E. Write some data and do some snapshot operations. (Validate 4 cases: <1> engine upgrade; <2> offline expansion) 3. Create a cluster snapshot via Rancher. -4. Do the followings before the restore: +4. Do the following before the restore: 1. Delete volume A. 2. Write more data to volume B and create more backups. 3. Remove all current replicas one by one for volume C. Then all replicas of volume C are new replicas. 4. Remove some replicas for volume D. Do snapshot creation, deletion, and revert. 5. Scale down the workload. Upgrade volume E from the default image to another engine image. And do expansion. - 6. Create and attach volume F via UI. Write some data and do some snapshot operations. (Validate 1 case: Users need to manuall recover the volume if it's created after the cluster snapshot) + 6. Create and attach volume F via UI. Write some data and do some snapshot operations. (Validate 1 case: Users need to manually recover the volume if it's created after the cluster snapshot) 5. Restore the cluster. -6. Check the followings according to the doc: +6. Check the following according to the doc: 1. Volume A is back. But there is no data in it. And users can re-delete it. 2. Volume B can be reattached or keep attached with correct data. The backup info of volume B is resynced when the volume is reattahed. The pod can use the volume after restart. 3. All old removed replicas are back and all newly rebuilt replicas in step4-3 disappear for volume C. There is no data in volume C. The data directories of the disappeared replicas are still on the node. Hence the data are be recovered by exporting a single replica volume. diff --git a/docs/content/manual/pre-release/node-not-ready/node-down/single-replica-node-down.md b/docs/content/manual/pre-release/node-not-ready/node-down/single-replica-node-down.md index 9bc72860b6..53a3c0fb27 100644 --- a/docs/content/manual/pre-release/node-not-ready/node-down/single-replica-node-down.md +++ b/docs/content/manual/pre-release/node-not-ready/node-down/single-replica-node-down.md @@ -20,7 +20,7 @@ https://github.com/longhorn/longhorn/issues/3957 6. Power up node or delete the workload pod so that kubernetes will recreate pod on another node. 7. Verify auto salvage finishes (i.e pod completes start). 8. Verify volume attached & accessible by pod (i.e test data is available). - - For data locality = strict-local volume, volume wiil keep in detaching, attaching status for about 10 minutes, after volume attached to node which replica located, check volume healthy and pod status. 
+ - For data locality = strict-local volume, volume will keep in detaching, attaching status for about 10 minutes, after volume attached to node which replica located, check volume healthy and pod status. ## Node restart/down scenario with `Pod Deletion Policy When Node is Down` set to `delete-both-statefulset-and-deployment-pod` 1. Create RWO|RWX volume with replica count = 1 & data locality = enabled|disabled|strict-local. diff --git a/docs/content/manual/pre-release/node/degraded-availability.md b/docs/content/manual/pre-release/node/degraded-availability.md index 26f2b17a43..fedbc8b3c1 100644 --- a/docs/content/manual/pre-release/node/degraded-availability.md +++ b/docs/content/manual/pre-release/node/degraded-availability.md @@ -15,8 +15,8 @@ title: Degraded availability with added nodes ##### Steps: 1. Create a Deployment Pod with a volume and three replicas. 1. After the volume is attached, on Volume page it should be displayed as `Degraded` - 1. Hover the crusor to the red circle exclamation mark, the tooltip will says, "The volume cannot be scheduled". - 1. Click into the volume detail page it will display `Scheduling Failure` but the volume remain fuctional as expected. + 1. Hover the cursor to the red circle exclamation mark, the tooltip will says, "The volume cannot be scheduled". + 1. Click into the volume detail page it will display `Scheduling Failure` but the volume remain functional as expected. 1. Write data to the Pod. 1. Scale down the deployment to 0 to detach the volume. 1. Volume return to `Detached` state. diff --git a/docs/content/manual/pre-release/upgrade/backing-image-during-upgrade.md b/docs/content/manual/pre-release/upgrade/backing-image-during-upgrade.md index ec44abb402..9ca7dc0132 100644 --- a/docs/content/manual/pre-release/upgrade/backing-image-during-upgrade.md +++ b/docs/content/manual/pre-release/upgrade/backing-image-during-upgrade.md @@ -38,7 +38,7 @@ title: Test Backing Image during Longhorn upgrade 1. Deploy Longhorn. 2. Create a backing images. Wait for the backing image being ready in the 1st disk. 3. Create and attach volumes with the backing image. Wait for all disk files of the backing image being ready. -4. Run `kubectl -n longhorn system get pod -w` in a seperate session. +4. Run `kubectl -n longhorn system get pod -w` in a separate session. 5. Upgrade Longhorn manager but with the backing image manager image unchanged. (Actually we can mock this upgrade by removing all longhorn manager pods simultaneously.) 6. Check if all disk file status of the backing image becomes `unknown` then `ready` during the longhorn manager pods termination and restart. (May need to refresh the UI page after restart.) 7. After the longhorn manager pods restart, Verify there is no backing image data source pod launched for the backing image in the output of step4. diff --git a/docs/content/manual/release-specific/v1.2.0/label-driven-recurring-job.md b/docs/content/manual/release-specific/v1.2.0/label-driven-recurring-job.md index 30f6134599..6f448fd7e8 100644 --- a/docs/content/manual/release-specific/v1.2.0/label-driven-recurring-job.md +++ b/docs/content/manual/release-specific/v1.2.0/label-driven-recurring-job.md @@ -15,11 +15,11 @@ https://github.com/longhorn/longhorn/issues/467 *And* create volume `test-job-4`. *And* create volume `test-job-5`. -**Then** moniter the cron job pod log. +**Then** monitor the cron job pod log. *And* should see 2 jobs created concurrently. **When** update `snapshot1` recurring job with `concurrency` set to `3`. 
-**Then** moniter the cron job pod log. +**Then** monitor the cron job pod log. *And* should see 3 jobs created concurrently. diff --git a/docs/content/manual/release-specific/v1.2.0/test-backing-image-upload.md b/docs/content/manual/release-specific/v1.2.0/test-backing-image-upload.md index e42178b19f..5b8740893c 100644 --- a/docs/content/manual/release-specific/v1.2.0/test-backing-image-upload.md +++ b/docs/content/manual/release-specific/v1.2.0/test-backing-image-upload.md @@ -37,7 +37,7 @@ title: Test backing image 1. Create a valid backing image 2. Create a StorageClass, which use the same backing image name but different data source type/parameters. 3. Create a PVC with the StorageClass. - ==> The corresponding creation should fail. The longhorn-csi-plugin will repeatly print out error logs like this `existing backing image %v data source is different from the parameters in the creation request or StorageClass`. + ==> The corresponding creation should fail. The longhorn-csi-plugin will repeatedly print out error logs like this `existing backing image %v data source is different from the parameters in the creation request or StorageClass`. 4. Delete the PVC and the StorageClass. 5. Recreate a StorageClass in which the backing image fields match the existing backing image. 6. Create a PVC with the StorageClass. diff --git a/docs/content/manual/release-specific/v1.2.3/test-backing-image-checksum-mismatching.md b/docs/content/manual/release-specific/v1.2.3/test-backing-image-checksum-mismatching.md index fe096f8e5c..89f015c376 100644 --- a/docs/content/manual/release-specific/v1.2.3/test-backing-image-checksum-mismatching.md +++ b/docs/content/manual/release-specific/v1.2.3/test-backing-image-checksum-mismatching.md @@ -3,7 +3,7 @@ title: Test backing image checksum mismatching --- ### Test step -1. Modify setting `Backing Image Recovery Wait Interval` to a shorter value so that the backing image will start auto recovery eariler. +1. Modify setting `Backing Image Recovery Wait Interval` to a shorter value so that the backing image will start auto recovery earlier. 2. Create a backing image file with type `Download From URL`. 3. Launch a volume using the backing image file so that there are 2 disk records for the backing image. 4. Modify one disk file for the backing image and make sure the file size is not changed. This will lead to data inconsistency/corruption later. e.g., diff --git a/docs/content/manual/release-specific/v1.3.0/extend_CSI_snapshot_support.md b/docs/content/manual/release-specific/v1.3.0/extend_CSI_snapshot_support.md index d096cdd538..d70c46ecdc 100644 --- a/docs/content/manual/release-specific/v1.3.0/extend_CSI_snapshot_support.md +++ b/docs/content/manual/release-specific/v1.3.0/extend_CSI_snapshot_support.md @@ -132,7 +132,7 @@ https://github.com/longhorn/longhorn/issues/2534 * Scale down the workload to detach the `test-vol` * Create the same PVC `test-restore-pvc` as in the `Source volume is attached && Longhorn snapshot exist` section * Verify that PVC provisioning failed because the source volume is detached so Longhorn cannot verify the existence of the Longhorn snapshot in the source volume. 
- * Scale up the workload to attache `test-vol` + * Scale up the workload to attach `test-vol` * Wait for PVC to finish provisioning and be bounded * Attach the PVC `test-restore-pvc` and verify the data * Delete the PVC diff --git a/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md index ba4f32d956..0d2b543e78 100644 --- a/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md +++ b/docs/content/manual/release-specific/v1.6.0/test-engine-version-enforcement.md @@ -22,7 +22,7 @@ longhorn-manager-grhsf 0/1 CrashLoopBackOff ``` And should see incompatible version error in longhorn-manager Pod logs ``` -time="2023-08-17T03:03:20Z" level=fatal msg="Error starting manager: failed checking Engine upgarde path: incompatible Engine ei-7fa7c208 client API version: found version 7 is below required minimal version 8" +time="2023-08-17T03:03:20Z" level=fatal msg="Error starting manager: failed checking Engine upgrade path: incompatible Engine ei-7fa7c208 client API version: found version 7 is below required minimal version 8" ``` **When** downgraded Longhorn to v1.5.x @@ -39,5 +39,5 @@ ei-7fa7c208 true deployed longhornio/longhorn-engine:v1.4.1 0 ei-ad420081 false deployed c3y1huang/research:2017-lh-ei 0 44h 24s ``` -**When** update existing volume/engine/replica custom resourcs `spec.image` with `longhornio/longhorn-engine:v1.4.x` +**When** update existing volume/engine/replica custom resources `spec.image` with `longhornio/longhorn-engine:v1.4.x` **Then** should be blocked diff --git a/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md b/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md index f81a56c604..a1cfaed7e0 100644 --- a/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md +++ b/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md @@ -32,7 +32,7 @@ index b48ddd46..c4523f11 100644 **And** the `auto-salvage` setting is set to `true`. **And** a new StorageClass is created with `NumberOfReplica` set to `1`. **And** a StatefulSet is created with `Replica` set to `1`. -**And** the node of the StatefulSet Pod and the node of its volume Replica are different. This is necessary to trigger the rebuilding in reponse to the data locality setting update later. +**And** the node of the StatefulSet Pod and the node of its volume Replica are different. This is necessary to trigger the rebuilding in response to the data locality setting update later. **And** Volume have 1 running Replica. **And** data exists in the volume. 
diff --git a/engine/environment-setup/setupRancher.py b/engine/environment-setup/setupRancher.py index 8882c14343..4c49b28428 100644 --- a/engine/environment-setup/setupRancher.py +++ b/engine/environment-setup/setupRancher.py @@ -32,7 +32,7 @@ def silent_remove_file(filename): os.remove(filename) except OSError as e: if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory - raise # re-raise exception if a different error occured + raise # re-raise exception if a different error occurred def gce_create_instance(compute, name, gce_startup_script): diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 28503ae13e..cedabb6155 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -2287,7 +2287,7 @@ class AssertErrorCheckThread(threading.Thread): Parameters: target : The threading function. - args : Arguments of the target fucntion. + args : Arguments of the target function. """ def __init__(self, target, args): threading.Thread.__init__(self) @@ -5758,7 +5758,7 @@ def generate_support_bundle(case_name): # NOQA Generate support bundle into folder ./support_bundle/case_name.zip Won't generate support bundle if current support bundle count - greate than MAX_SUPPORT_BINDLE_NUMBER. + greater than MAX_SUPPORT_BINDLE_NUMBER. Args: case_name: support bundle will named case_name.zip """ @@ -5808,7 +5808,7 @@ def generate_support_bundle(case_name): # NOQA with open('./support_bundle/{0}.zip'.format(case_name), 'wb') as f: f.write(r.content) except Exception as e: - warnings.warn("Error occured while downloading support bundle {}.zip\n\ + warnings.warn("Error occurred when downloading support bundle {}.zip\n\ The error was {}".format(case_name, e)) diff --git a/manager/integration/tests/test_backing_image.py b/manager/integration/tests/test_backing_image.py index 5fac7c272b..118fbfccbe 100644 --- a/manager/integration/tests/test_backing_image.py +++ b/manager/integration/tests/test_backing_image.py @@ -431,7 +431,7 @@ def test_backing_image_with_disk_migration(): # NOQA `-` is removed. 9. Remount the host disk to another path. Then create another Longhorn disk based on the migrated path (disk migration). - 10. Verify the followings. + 10. Verify the following. 1. The disk added in step3 (before the migration) should be "unschedulable". 2. The disk added in step9 (after the migration) should diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index bedd863e03..22596c6e58 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -342,7 +342,7 @@ def test_volume_iscsi_basic(client, volume_name): # NOQA 1. Create and attach a volume with iscsi frontend 2. Check the volume endpoint and connect it using the iscsi - initator on the node. + initiator on the node. 3. Write then read back volume data for validation """ @@ -3421,7 +3421,7 @@ def test_allow_volume_creation_with_degraded_availability(client, volume_name): 2. `node-level-soft-anti-affinity` to false. Steps: - (degraded availablity) + (degraded availability) 1. Disable scheduling for node 2 and 3. 2. Create a volume with three replicas. 1. Volume should be `ready` after creation and `Scheduled` is true. 
diff --git a/manager/integration/tests/test_csi_snapshotter.py b/manager/integration/tests/test_csi_snapshotter.py index b1bf905f26..05493e5ce9 100644 --- a/manager/integration/tests/test_csi_snapshotter.py +++ b/manager/integration/tests/test_csi_snapshotter.py @@ -435,7 +435,7 @@ def csi_volumesnapshot_creation_test(snapshotClass=longhorn|custom): 4. check creation of a new longhorn snapshot named `snapshot-uuid` 5. check for `VolumeSnapshotContent` named `snapcontent-uuid` 6. wait for `VolumeSnapshotContent.readyToUse` flag to be set to **true** - 7. check for backup existance on the backupstore + 7. check for backup existence on the backupstore # the csi snapshot restore sets the fromBackup field same as # the StorageClass based restore approach. @@ -860,16 +860,16 @@ def test_csi_snapshot_snap_create_volume_from_snapshot(apps_api, # NOQA - Attach the PVC and verify data - Source volume is detached - Scale down the workload - - Create PVC from VolumeSnapshot generated from step beggining + - Create PVC from VolumeSnapshot generated from step beginning - Verify PVC provision failed - Scale up the workload - Wait for PVC to finish provisioning and be bounded - Attach the PVC test-restore-pvc and verify the data - Source volume is attached && Longhorn snapshot doesn’t exist - Use VolumeSnapshotContent.snapshotHandle to - specify Longhorn snapshot generated in step beggining + specify Longhorn snapshot generated in step beginning - Delete the Longhorn snapshot - - Create PVC from VolumeSnapshot generated from step beggining + - Create PVC from VolumeSnapshot generated from step beginning - PVC should be stuck in provisioning state """ vol, deployment, csisnapclass, expected_md5sum = \ diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py index 2c16d6a8c1..310b31e20e 100644 --- a/manager/integration/tests/test_engine_upgrade.py +++ b/manager/integration/tests/test_engine_upgrade.py @@ -43,7 +43,7 @@ def test_engine_image(client, core_api, volume_name): # NOQA """ Test Engine Image deployment - 1. List Engine Images and validate basic properities. + 1. List Engine Images and validate basic properties. 2. Try deleting default engine image and it should fail. 3. Try creating a duplicate engine image as default and it should fail 4. Get upgrade test image for the same versions diff --git a/manager/integration/tests/test_ha.py b/manager/integration/tests/test_ha.py index 797d7ee484..c5ee886185 100644 --- a/manager/integration/tests/test_ha.py +++ b/manager/integration/tests/test_ha.py @@ -1033,7 +1033,7 @@ def test_inc_restoration_with_multiple_rebuild_and_expansion(set_random_backupst wait_for_volume_healthy(client, std_volume_name) # Step 9: - # When the total writen data size is more than 1Gi, there must be data in + # When the total written data size is more than 1Gi, there must be data in # the expanded part. data_path2 = "/data/test2" write_pod_volume_random_data(core_api, std_pod_name, @@ -1093,7 +1093,7 @@ def test_inc_restoration_with_multiple_rebuild_and_expansion(set_random_backupst wait_for_volume_expansion(client, std_volume_name) # Step 15: - # When the total writen data size is more than 2Gi, there must be data in + # When the total written data size is more than 2Gi, there must be data in # the 2nd expanded part. 
data_path3 = "/data/test3" write_pod_volume_random_data(core_api, std_pod_name, @@ -1689,7 +1689,7 @@ def test_engine_crash_for_restore_volume(set_random_backupstore, client, core_ap # The complete state transition would be like: # detaching -> detached -> attaching -> attached -> restore -> detached . # Now the state change too fast, script eventually caught final detach - # So temporaly comment out below line of code + # So temporarily comment out below line of code # wait_for_volume_detached(client, res_name) res_volume = wait_for_volume_healthy_no_frontend(client, res_name) @@ -1806,7 +1806,7 @@ def test_engine_crash_for_dr_volume(set_random_backupstore, client, core_api, vo # The complete state transition would be like: # detaching -> detached -> attaching -> attached -> restore -> detached . # Now the state change too fast, script eventually caught final detach - # So temporaly comment out below line of code + # So temporarily comment out below line of code # wait_for_volume_detached(client, dr_volume_name) # Check if the DR volume is auto reattached then continue @@ -1943,10 +1943,10 @@ def test_extra_replica_cleanup(client, volume_name, settings_reset): # NOQA save the checksum. 4. Increase the volume replica number to 4. 5. Volume should show failed to schedule and an extra stop replica. - 6. Decrease the volume replica nubmer to 3. + 6. Decrease the volume replica number to 3. 7. Volume should show healthy and the extra failed to scheduled replica should be removed. - 8. Check the data in the volume and make sure it's same as the chechsum. + 8. Check the data in the volume and make sure it's same as the checksum. """ replica_node_soft_anti_affinity_setting = \ client.by_id_setting(SETTING_REPLICA_NODE_SOFT_ANTI_AFFINITY) diff --git a/manager/integration/tests/test_infra.py b/manager/integration/tests/test_infra.py index 6842db3090..73f0995ca4 100644 --- a/manager/integration/tests/test_infra.py +++ b/manager/integration/tests/test_infra.py @@ -184,7 +184,7 @@ def test_offline_node(reset_cluster_ready_status): """ Test offline node - 1. Bring down one of the nodes in Kuberntes cluster (avoid current node) + 1. Bring down one of the nodes in Kubernetes cluster (avoid current node) 2. Make sure the Longhorn node state become `down` """ pod_lable_selector = "longhorn-test=test-job" diff --git a/manager/integration/tests/test_metric.py b/manager/integration/tests/test_metric.py index 3210cf1f00..5223c70abe 100644 --- a/manager/integration/tests/test_metric.py +++ b/manager/integration/tests/test_metric.py @@ -82,7 +82,7 @@ def find_metrics(metric_data, metric_name): def check_metric_with_condition(core_api, metric_name, metric_labels, expected_value=None, metric_node_id=get_self_host_id()): # NOQA) """ - Some metric have multiple conditions, for exameple metric + Some metric have multiple conditions, for example metric longhorn_node_status have condition - allowScheduling - mountpropagation diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 1e4ad5dd32..4f95978d4d 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -190,7 +190,7 @@ def test_node_disk_update(client): # NOQA 3. Create two disks `disk1` and `disk2`, attach them to the current node. 4. Add two disks to the current node. 5. Verify two extra disks have been added to the node - 6. Disbale the two disks' scheduling, and set StorageReserved + 6. Disable the two disks' scheduling, and set StorageReserved 7. Update the two disks. 8. 
Validate all the disks properties. 9. Delete other two disks. Validate deletion works. @@ -1919,7 +1919,7 @@ def test_node_config_annotation_missing(client, core_api, reset_default_disk_lab 3. Verify disk update works. 4. Verify tag update works 5. Verify using tag annotation for configuration works. - 6. After remove the tag annotaion, verify unset tag node works fine. + 6. After remove the tag annotation, verify unset tag node works fine. 7. Set tag annotation again. Verify node updated for the tag. """ setting = client.by_id_setting(SETTING_CREATE_DEFAULT_DISK_LABELED_NODES) @@ -2012,7 +2012,7 @@ def test_replica_scheduler_rebuild_restore_is_too_big(set_random_backupstore, cl data cannot fit in the small disk 6. Delete a replica of volume. 1. Verify the volume reports `scheduled = false` due to unable to find - a suitable disk for rebuliding replica, since the replica with the + a suitable disk for rebuilding replica, since the replica with the existing data cannot fit in the small disk 6. Enable the scheduling for other disks, disable scheduling for small disk 7. Verify the volume reports `scheduled = true`. And verify the data. diff --git a/manager/integration/tests/test_rwx.py b/manager/integration/tests/test_rwx.py index 79ea321117..2132acf020 100644 --- a/manager/integration/tests/test_rwx.py +++ b/manager/integration/tests/test_rwx.py @@ -538,7 +538,7 @@ def test_rwx_online_expansion(): # NOQA - Create a rwx pvc using longhorn storage class of size 1 Gi. And - - Atach it to a workload (deployment) and write some data. + - Attach it to a workload (deployment) and write some data. When - Expand the volume to 5 Gi @@ -566,7 +566,7 @@ def test_rwx_offline_expansion(client, core_api, pvc, make_deployment_with_pvc): - Create a rwx pvc using longhorn storage class of size 1 Gi. And - - Atach it to a workload (deployment) and write some data. + - Attach it to a workload (deployment) and write some data. 
- Scale down the workload, wait volume detached - Share manager pod will terminate automatically - Expand the volume to 4 Gi, wait exoansion complete diff --git a/manager/integration/tests/test_scheduling.py b/manager/integration/tests/test_scheduling.py index 2c164cad38..e6ffacd7c5 100644 --- a/manager/integration/tests/test_scheduling.py +++ b/manager/integration/tests/test_scheduling.py @@ -1917,7 +1917,7 @@ def test_global_disk_soft_anti_affinity(client, volume_name, request): # NOQA assert num_running == 2 # After enable SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY to true, - # replicas can schedule on the same disk, threrefore volume become healthy + # replicas can schedule on the same disk, therefore volume become healthy update_setting(client, SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY, "true") volume = wait_for_volume_healthy(client, volume_name) @@ -2088,7 +2088,7 @@ def test_volume_disk_soft_anti_affinity(client, volume_name, request): # NOQA assert num_running == 2 # After set update volume.updateReplicaDiskSoftAntiAffinity to enabled, - # replicas can schedule on the same disk, threrefore volume become healthy + # replicas can schedule on the same disk, therefore volume become healthy volume = volume.updateReplicaDiskSoftAntiAffinity( replicaDiskSoftAntiAffinity="enabled") assert volume.replicaDiskSoftAntiAffinity == "enabled" diff --git a/manager/integration/tests/test_settings.py b/manager/integration/tests/test_settings.py index 1f025b2fae..aff12a2732 100644 --- a/manager/integration/tests/test_settings.py +++ b/manager/integration/tests/test_settings.py @@ -995,7 +995,7 @@ def setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(cli break assert is_case_tested, \ - f"Unexpected cocurrent count: {concurrent_count}\n" + f"Unexpected concurrent count: {concurrent_count}\n" for restore_volume_name in restore_volume_names: if is_DR_volumes: @@ -1197,7 +1197,7 @@ def test_setting_update_with_invalid_value_via_configmap(core_api, request): # 2. Initialize longhorn-default-setting configmap containing valid and invalid settings 3. Update longhorn-default-setting configmap with invalid settings. - The invalid settings SETTING_TAINT_TOLERATION will be ingored + The invalid settings SETTING_TAINT_TOLERATION will be ignored when there is an attached volume. 4. Validate the default settings values. """ diff --git a/manager/integration/tests/test_statefulset.py b/manager/integration/tests/test_statefulset.py index a4a216dbd7..428119cfae 100644 --- a/manager/integration/tests/test_statefulset.py +++ b/manager/integration/tests/test_statefulset.py @@ -100,7 +100,7 @@ def test_statefulset_mount(client, core_api, storage_class, statefulset): # NOQ 1. Create a StatefulSet using dynamic provisioned Longhorn volume. 2. Wait for pods to become running - 3. Check volume properites are consistent with the StorageClass + 3. Check volume properties are consistent with the StorageClass """ statefulset_name = 'statefulset-mount-test' @@ -138,7 +138,7 @@ def test_statefulset_scaling(client, core_api, storage_class, statefulset): # N 1. Create a StatefulSet with VolumeClaimTemplate and Longhorn. 2. Wait for pods to run. - 3. Verify the properities of volumes. + 3. Verify the properties of volumes. 4. Scale the StatefulSet to 3 replicas 5. Wait for the new pod to become ready. 6. Verify the new volume properties. @@ -259,7 +259,7 @@ def test_statefulset_backup(set_random_backupstore, client, core_api, storage_cl 4. Create a third snapshot 5. Backup the snapshot `backup_snapshot` 6. 
Wait for backup to show up.
-        1 Verify the backup informations
+        1 Verify the backup information
     """
     statefulset_name = 'statefulset-backup-test'
 
diff --git a/scalability_test/script/monitor.py b/scalability_test/script/monitor.py
index de03533460..cc195d7b2a 100644
--- a/scalability_test/script/monitor.py
+++ b/scalability_test/script/monitor.py
@@ -50,7 +50,7 @@ def update_data(self):
         node_list = []
         try:
             pod_list = self.core_api_v1.list_namespaced_pod("default")
-        # TODO: change to catch any exeption and count the number of api exceptions
+        # TODO: change to catch any exception and count the number of api exceptions
         except client.ApiException as e:
             print("Exception when calling CoreV1Api->list_namespaced_pod: %s\n" % e)
             print("Skipping this update")
@@ -58,7 +58,7 @@ def update_data(self):
 
         try:
             node_list = self.custom_objects_api.list_cluster_custom_object("metrics.k8s.io", "v1beta1", "nodes")
-        # TODO: change to catch any exeption and count the number of api exceptions
+        # TODO: change to catch any exception and count the number of api exceptions
         except client.ApiException as e:
             print("Exception when calling custom_objects_api->list_cluster_custom_object: %s\n" % e)
             print("Will set node metrics to 0")
@@ -76,12 +76,12 @@ def update_data(self):
         if pod_with_valid_starting_time_count < running_pod_count and MAX_POD_STARTING_TIME_POINT not in self.annotating_points:
             self.annotating_points[MAX_POD_STARTING_TIME_POINT] = {
                 "xy": (diff.total_seconds(),
-                       pod_with_valid_starting_time_count), "desciption": "(1) "+str(pod_with_valid_starting_time_count)+" pods",
+                       pod_with_valid_starting_time_count), "description": "(1) "+str(pod_with_valid_starting_time_count)+" pods",
                 "color": "tab:orange"}
         if crashing_pod_count > self.max_pod_crashing_count and MAX_POD_CRASHING_POINT not in self.annotating_points:
             self.annotating_points[MAX_POD_CRASHING_POINT] = {
                 "xy": (diff.total_seconds(),
-                       pod_with_valid_starting_time_count), "desciption": "(2) "+str(pod_with_valid_starting_time_count)+" pods",
+                       pod_with_valid_starting_time_count), "description": "(2) "+str(pod_with_valid_starting_time_count)+" pods",
                 "color": "tab:red"}
 
         for node in node_list['items']:
@@ -101,7 +101,7 @@ def update_data(self):
             self.cpu_metrics[node_name] = cpu_metric
             self.ram_metrics[node_name] = ram_metric
 
-        # update node metrics with value 0 if the infomation is missing in the above update
+        # update node metrics with value 0 if the information is missing in the above update
        for metric in self.cpu_metrics.values():
            if len(metric) < len(self.time_diffs):
                cpu_metric.extend([0]*(len(self.time_diffs)-len(metric)))
@@ -192,10 +192,10 @@ def draw(self):
         ax1, ax2, ax3 = self.axes
 
         ax1.plot(self.time_diffs, self.running_pod_metric)
-        ax1.set_ylabel('Number of running pods')
+        ax1.set_ylabel('Number of running pods')
 
         for point in self.annotating_points.values():
-            ax1.annotate(point["desciption"],
+            ax1.annotate(point["description"],
                          xy= point["xy"], xycoords='data',
                          xytext=(0, 20), textcoords='offset points',
                          arrowprops=dict(facecolor=point["color"], shrink=0.05),
diff --git a/scalability_test/script/scale-test.py b/scalability_test/script/scale-test.py
index d3164f7442..e343617ffd 100644
--- a/scalability_test/script/scale-test.py
+++ b/scalability_test/script/scale-test.py
@@ -33,7 +33,7 @@ def get_node_capacities():
         # hugepages-2Mi: '0'
         # memory: 32412804Ki
         # pods: '110'
-        cpu = int(i.status.capacity["cpu"])*1000**3 # conver to nano cpu
+        cpu = int(i.status.capacity["cpu"])*1000**3 # convert to nano cpu
        ram =
int(i.status.capacity["memory"][:-2]) node_capacities[i.metadata.name] = {"cpu": cpu, "ram": ram} diff --git a/secscan/terraform/aws/main.tf b/secscan/terraform/aws/main.tf index 81fed9e5a1..52bd9be406 100644 --- a/secscan/terraform/aws/main.tf +++ b/secscan/terraform/aws/main.tf @@ -90,7 +90,7 @@ resource "aws_route_table" "lh-secscan_aws_public_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh-secscan_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh-secscan_aws_public_subnet, diff --git a/test_framework/terraform/aws/centos/main.tf b/test_framework/terraform/aws/centos/main.tf index 758a625be2..7097a5a1fd 100644 --- a/test_framework/terraform/aws/centos/main.tf +++ b/test_framework/terraform/aws/centos/main.tf @@ -250,7 +250,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -261,7 +261,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_framework/terraform/aws/oracle/main.tf b/test_framework/terraform/aws/oracle/main.tf index 4b22f7a21f..3fddf19914 100644 --- a/test_framework/terraform/aws/oracle/main.tf +++ b/test_framework/terraform/aws/oracle/main.tf @@ -250,7 +250,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -261,7 +261,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_framework/terraform/aws/rhel/main.tf b/test_framework/terraform/aws/rhel/main.tf index 4b22f7a21f..3fddf19914 100644 --- a/test_framework/terraform/aws/rhel/main.tf +++ b/test_framework/terraform/aws/rhel/main.tf @@ -250,7 +250,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -261,7 +261,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_framework/terraform/aws/rockylinux/main.tf b/test_framework/terraform/aws/rockylinux/main.tf index 02cf5120a5..e8e7be4756 100644 --- a/test_framework/terraform/aws/rockylinux/main.tf 
+++ b/test_framework/terraform/aws/rockylinux/main.tf @@ -251,7 +251,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -262,7 +262,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_framework/terraform/aws/sles/main.tf b/test_framework/terraform/aws/sles/main.tf index 665dd5b946..0e78f0a6c6 100644 --- a/test_framework/terraform/aws/sles/main.tf +++ b/test_framework/terraform/aws/sles/main.tf @@ -258,7 +258,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -269,7 +269,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_framework/terraform/aws/ubuntu/main.tf b/test_framework/terraform/aws/ubuntu/main.tf index 5ce977a111..956411de0f 100644 --- a/test_framework/terraform/aws/ubuntu/main.tf +++ b/test_framework/terraform/aws/ubuntu/main.tf @@ -252,7 +252,7 @@ resource "aws_route_table" "lh_aws_private_rt" { } } -# Assciate public subnet to public route table +# Associate public subnet to public route table resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_public_subnet, @@ -263,7 +263,7 @@ resource "aws_route_table_association" "lh_aws_public_subnet_rt_association" { route_table_id = aws_route_table.lh_aws_public_rt.id } -# Assciate private subnet to private route table +# Associate private subnet to private route table resource "aws_route_table_association" "lh_aws_private_subnet_rt_association" { depends_on = [ aws_subnet.lh_aws_private_subnet, diff --git a/test_tools/gen_data/README.md b/test_tools/gen_data/README.md index 3e73e6720a..f46e6661c3 100644 --- a/test_tools/gen_data/README.md +++ b/test_tools/gen_data/README.md @@ -7,7 +7,7 @@ Modify config.yaml storage: 1Gi # Each volume size storageClass: longhorn-test # Need to prepare your own storage class first dataSizeInMb: 500 -namespace: default # Nees to prepare first before run script +namespace: default # Needs to prepare first before run script statefulSet: # Single RWO/RWX statefulset and its replica counts rwo: replicas: 1 diff --git a/test_tools/gen_data/run.sh b/test_tools/gen_data/run.sh index d9b786786a..aa5da23629 100755 --- a/test_tools/gen_data/run.sh +++ b/test_tools/gen_data/run.sh @@ -120,7 +120,7 @@ check_config_input() { DEPLOYMENT_RWX_REPLICAS=$(yq eval '.deployment.rwx.deploymentReplicas' config.yaml) msg="$CONFIG_FILE is not correct, please check" - # varialbe = "null" when yq not find yaml field + # variable = "null" when yq not find 
yaml field [ "$STORAGE_SIZE" = "null" -o ${#STORAGE_SIZE} -eq 0 ] && error "$msg" && exit 2 [ "$NAMESPACE" = "null" -o ${#NAMESPACE} -eq 0 ] && error "$msg" && exit 2 [ "$STORAGE_CLASS_NAME" = "null" -o ${#STORAGE_CLASS_NAME} -eq 0 ] && error "$msg" && exit 2 From a5e412a1b048ad2150f6b2734d6810225013b594 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Wed, 21 Feb 2024 15:12:38 +0800 Subject: [PATCH 10/42] test: fix flaky test case test_space_usage_for_rebuilding_only_volume in rockylinux arm64 Signed-off-by: Yang Chiu --- manager/integration/tests/test_basic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index 22596c6e58..eb4835ee59 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -5022,7 +5022,7 @@ def test_space_usage_for_rebuilding_only_volume(client, volume_name, request): snap_offset = 1 volume_endpoint = get_volume_endpoint(volume) write_volume_dev_random_mb_data(volume_endpoint, - snap_offset, 3000, 5) + snap_offset, 3000, 10) snap2 = create_snapshot(client, volume_name) volume.snapshotDelete(name=snap2.name) @@ -5030,7 +5030,7 @@ def test_space_usage_for_rebuilding_only_volume(client, volume_name, request): wait_for_snapshot_purge(client, volume_name, snap2.name) write_volume_dev_random_mb_data(volume_endpoint, - snap_offset, 3000, 5) + snap_offset, 3000, 10) for r in volume.replicas: if r.hostId != lht_hostId: @@ -5073,14 +5073,14 @@ def test_space_usage_for_rebuilding_only_volume_worst_scenario(client, volume_na snap_offset = 1 volume_endpoint = get_volume_endpoint(volume) write_volume_dev_random_mb_data(volume_endpoint, - snap_offset, 2000) + snap_offset, 2000, 10) snap1 = create_snapshot(client, volume_name) volume.snapshotDelete(name=snap1.name) volume.snapshotPurge() wait_for_snapshot_purge(client, volume_name, snap1.name) write_volume_dev_random_mb_data(volume_endpoint, - snap_offset, 2000) + snap_offset, 2000, 10) for r in volume.replicas: if r.hostId != lht_hostId: @@ -5090,7 +5090,7 @@ def test_space_usage_for_rebuilding_only_volume_worst_scenario(client, volume_na wait_for_volume_degraded(client, volume_name) wait_for_rebuild_start(client, volume_name) write_volume_dev_random_mb_data(volume_endpoint, - snap_offset, 2000) + snap_offset, 2000, 10) wait_for_rebuild_complete(client, volume_name) volume = client.by_id_volume(volume_name) From ac3e917cccab3b2e7a64abc5eceab97a05f9883f Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 22 Feb 2024 10:02:32 +0800 Subject: [PATCH 11/42] test: fix wrong pvc status check Signed-off-by: Yang Chiu --- manager/integration/tests/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index cedabb6155..a76d4f6b9d 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -1315,7 +1315,7 @@ def check_pvc_in_specific_status(api, pvc_name, status): claim = \ api.read_namespaced_persistent_volume_claim(name=pvc_name, namespace='default') - if claim.status.phase == "bound": + if claim.status.phase == status: break time.sleep(RETRY_INTERVAL) From 60994e632c2ca7761c9a3c92e314a71a23669485 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 22 Feb 2024 11:08:02 +0800 Subject: [PATCH 12/42] test: fix flaky test case test_auto_detach_volume_when_node_is_cordoned Signed-off-by: Yang Chiu --- manager/integration/tests/common.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index a76d4f6b9d..b3294eeaa3 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -1970,7 +1970,7 @@ def wait_for_volume_faulted(client, name): def wait_for_volume_status(client, name, key, value, - retry_count=RETRY_COUNTS): + retry_count=RETRY_COUNTS_LONG): wait_for_volume_creation(client, name) for i in range(retry_count): volume = client.by_id_volume(name) From ad3376f0136552a540a0fb1d10a7032509ce8262 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 22 Feb 2024 10:52:42 +0800 Subject: [PATCH 13/42] test: fix flaky test case test_engine_image_daemonset_restart Signed-off-by: Yang Chiu --- manager/integration/tests/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index b3294eeaa3..40580fda5e 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -2220,10 +2220,10 @@ def wait_for_engine_image_condition(client, image_name, state): # This helps to prevent the flaky test case in which the ENGINE_NAME # is flapping between ready and not ready a few times before settling # down to the ready state - # https://github.com/longhorn/longhorn-tests/pull/1638 + # https://github.com/longhorn/longhorn/issues/7438 state_count = 1 if state == "True": - state_count = 5 + state_count = 60 c = 0 for i in range(RETRY_COUNTS): From 385b3deb4240ae612858b4171bb9841d08d80292 Mon Sep 17 00:00:00 2001 From: David Ko Date: Thu, 22 Feb 2024 13:29:51 +0800 Subject: [PATCH 14/42] Create renovate.json Signed-off-by: David Ko --- renovate.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 renovate.json diff --git a/renovate.json b/renovate.json new file mode 100644 index 0000000000..c297cdcff1 --- /dev/null +++ b/renovate.json @@ -0,0 +1,3 @@ +{ + "extends": ["github>longhorn/release:renovate-default"] +} From b9686f33a1d05b96e4ae965b09b23bd7acf5940d Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 22 Feb 2024 14:33:00 +0800 Subject: [PATCH 15/42] ci: use more cost-effective ec2 instance type Signed-off-by: Yang Chiu --- .../terraform/aws/ubuntu/variables.tf | 2 +- scalability_test/terraform/variables.tf | 4 ++-- .../terraform/aws/centos/variables.tf | 14 ++++++------- test_framework/terraform/aws/eks/main.tf | 2 +- .../terraform/aws/oracle/variables.tf | 19 ++++++++++-------- .../terraform/aws/rhel/variables.tf | 19 ++++++++++-------- .../terraform/aws/rockylinux/variables.tf | 19 ++++++++++-------- .../terraform/aws/sle-micro/variables.tf | 16 +++++++-------- .../terraform/aws/sles/variables.tf | 8 ++++---- .../terraform/aws/ubuntu/variables.tf | 20 ++++++++++--------- 10 files changed, 67 insertions(+), 56 deletions(-) diff --git a/build_engine_test_images/terraform/aws/ubuntu/variables.tf b/build_engine_test_images/terraform/aws/ubuntu/variables.tf index f1608de61a..c9bd3c38a2 100644 --- a/build_engine_test_images/terraform/aws/ubuntu/variables.tf +++ b/build_engine_test_images/terraform/aws/ubuntu/variables.tf @@ -56,7 +56,7 @@ variable "build_engine_aws_instance_name" { variable "build_engine_aws_instance_type" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" default = "" } diff --git a/scalability_test/terraform/variables.tf 
b/scalability_test/terraform/variables.tf index 388235c0ac..9defd21ea1 100644 --- a/scalability_test/terraform/variables.tf +++ b/scalability_test/terraform/variables.tf @@ -45,12 +45,12 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - default = "t2.xlarge" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - default = "t2.xlarge" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { diff --git a/test_framework/terraform/aws/centos/variables.tf b/test_framework/terraform/aws/centos/variables.tf index 07d56257be..238748d20c 100644 --- a/test_framework/terraform/aws/centos/variables.tf +++ b/test_framework/terraform/aws/centos/variables.tf @@ -10,12 +10,12 @@ variable "lh_aws_secret_key" { variable "aws_region" { type = string - default = "us-east-2" + default = "us-east-1" } variable "aws_availability_zone" { type = string - default = "us-east-2c" + default = "us-east-1c" } variable "lh_aws_vpc_name" { @@ -55,12 +55,12 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -96,12 +96,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) EOT } diff --git a/test_framework/terraform/aws/eks/main.tf b/test_framework/terraform/aws/eks/main.tf index 95eb33647d..403abe5212 100644 --- a/test_framework/terraform/aws/eks/main.tf +++ b/test_framework/terraform/aws/eks/main.tf @@ -120,7 +120,7 @@ resource "aws_eks_node_group" "node_group" { subnet_ids = module.vpc.public_subnets ami_type = var.arch == "amd64" ? "AL2_x86_64" : "AL2_ARM_64" capacity_type = "ON_DEMAND" - instance_types = [var.arch == "amd64" ? "t2.xlarge" : "a1.xlarge"] + instance_types = [var.arch == "amd64" ? 
"t3.xlarge" : "t4g.xlarge"] disk_size = 40 scaling_config { desired_size = 3 diff --git a/test_framework/terraform/aws/oracle/variables.tf b/test_framework/terraform/aws/oracle/variables.tf index aa588e6f1d..8b6a661416 100644 --- a/test_framework/terraform/aws/oracle/variables.tf +++ b/test_framework/terraform/aws/oracle/variables.tf @@ -10,12 +10,12 @@ variable "lh_aws_secret_key" { variable "aws_region" { type = string - default = "us-east-2" + default = "us-east-1" } variable "aws_availability_zone" { type = string - default = "us-east-2a" + default = "us-east-1a" } variable "lh_aws_vpc_name" { @@ -26,11 +26,12 @@ variable "lh_aws_vpc_name" { variable "arch" { type = string description = "available values (amd64, arm64)" + default = "amd64" } variable "distro_version" { type = string - default = "8.6" + default = "9.1" } variable "aws_ami_oraclelinux_account_number" { @@ -55,12 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -96,12 +99,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) EOT } diff --git a/test_framework/terraform/aws/rhel/variables.tf b/test_framework/terraform/aws/rhel/variables.tf index 98f211e362..3368fa3231 100644 --- a/test_framework/terraform/aws/rhel/variables.tf +++ b/test_framework/terraform/aws/rhel/variables.tf @@ -10,12 +10,12 @@ variable "lh_aws_secret_key" { variable "aws_region" { type = string - default = "us-east-2" + default = "us-east-1" } variable "aws_availability_zone" { type = string - default = "us-east-2a" + default = "us-east-1a" } variable "lh_aws_vpc_name" { @@ -26,11 +26,12 @@ variable "lh_aws_vpc_name" { variable "arch" { type = string description = "available values (amd64, arm64)" + default = "amd64" } variable "os_distro_version" { type = string - default = "8.6.0" + default = "9.1.0" } variable "aws_ami_rhel_account_number" { @@ -55,12 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -96,12 +99,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version 
that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) EOT } diff --git a/test_framework/terraform/aws/rockylinux/variables.tf b/test_framework/terraform/aws/rockylinux/variables.tf index dfce28e0ae..eb81c3b6f5 100644 --- a/test_framework/terraform/aws/rockylinux/variables.tf +++ b/test_framework/terraform/aws/rockylinux/variables.tf @@ -10,12 +10,12 @@ variable "lh_aws_secret_key" { variable "aws_region" { type = string - default = "us-east-2" + default = "us-east-1" } variable "aws_availability_zone" { type = string - default = "us-east-2a" + default = "us-east-1a" } variable "lh_aws_vpc_name" { @@ -26,11 +26,12 @@ variable "lh_aws_vpc_name" { variable "arch" { type = string description = "available values (amd64, arm64)" + default = "amd64" } variable "os_distro_version" { type = string - default = "9.2" + default = "9.3" } variable "aws_ami_rockylinux_account_number" { @@ -55,12 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -96,12 +99,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) EOT } diff --git a/test_framework/terraform/aws/sle-micro/variables.tf b/test_framework/terraform/aws/sle-micro/variables.tf index b00745d94b..503bbd0cef 100644 --- a/test_framework/terraform/aws/sle-micro/variables.tf +++ b/test_framework/terraform/aws/sle-micro/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "os_distro_version" { type = string - default = "5.3" + default = "5.5" } variable "aws_ami_sles_account_number" { @@ -56,14 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" - default = "t2.xlarge" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" - default = "t2.xlarge" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -99,12 +99,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) 
EOT } diff --git a/test_framework/terraform/aws/sles/variables.tf b/test_framework/terraform/aws/sles/variables.tf index 1a435ac39b..640304259a 100644 --- a/test_framework/terraform/aws/sles/variables.tf +++ b/test_framework/terraform/aws/sles/variables.tf @@ -56,14 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" - default = "t2.xlarge" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" - default = "t2.xlarge" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { diff --git a/test_framework/terraform/aws/ubuntu/variables.tf b/test_framework/terraform/aws/ubuntu/variables.tf index 468a06b00e..6ee716104f 100644 --- a/test_framework/terraform/aws/ubuntu/variables.tf +++ b/test_framework/terraform/aws/ubuntu/variables.tf @@ -10,12 +10,12 @@ variable "lh_aws_secret_key" { variable "aws_region" { type = string - default = "us-east-2" + default = "us-east-1" } variable "aws_availability_zone" { type = string - default = "us-east-2a" + default = "us-east-1a" } variable "lh_aws_vpc_name" { @@ -26,11 +26,12 @@ variable "lh_aws_vpc_name" { variable "arch" { type = string description = "available values (amd64, arm64)" + default = "amd64" } variable "os_distro_version" { type = string - default = "20.04" + default = "22.04" } variable "aws_ami_ubuntu_account_number" { @@ -55,13 +56,14 @@ variable "lh_aws_instance_name_controlplane" { variable "lh_aws_instance_type_controlplane" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_type_worker" { type = string - description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" - + description = "Recommended instance types t3.xlarge for amd64 & t4g.xlarge for arm64" + default = "t3.xlarge" } variable "lh_aws_instance_root_block_device_size_controlplane" { @@ -97,12 +99,12 @@ variable "k8s_distro_name" { variable "k8s_distro_version" { type = string - default = "v1.25.3+k3s1" + default = "v1.28.4+k3s1" description = <<-EOT kubernetes version that will be deployed rke: (default: v1.22.5-rancher1-1) - k3s: (default: v1.25.3+k3s1) - rke2: (default: v1.25.3+rke2r1) + k3s: (default: v1.28.4+k3s1) + rke2: (default: v1.28.4+rke2r1) EOT } From e1588eaa3005e19246ed9ff2771212608f15bf1b Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 22 Feb 2024 19:58:12 +0800 Subject: [PATCH 16/42] ci: update oracle/rhel distro version Signed-off-by: Yang Chiu --- test_framework/terraform/aws/oracle/variables.tf | 2 +- test_framework/terraform/aws/rhel/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_framework/terraform/aws/oracle/variables.tf b/test_framework/terraform/aws/oracle/variables.tf index 8b6a661416..48664199cf 100644 --- a/test_framework/terraform/aws/oracle/variables.tf +++ b/test_framework/terraform/aws/oracle/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "distro_version" { type = string - default = "9.1" + default = "9.3" } 
variable "aws_ami_oraclelinux_account_number" { diff --git a/test_framework/terraform/aws/rhel/variables.tf b/test_framework/terraform/aws/rhel/variables.tf index 3368fa3231..72ff0a3734 100644 --- a/test_framework/terraform/aws/rhel/variables.tf +++ b/test_framework/terraform/aws/rhel/variables.tf @@ -31,7 +31,7 @@ variable "arch" { variable "os_distro_version" { type = string - default = "9.1.0" + default = "9.3.0" } variable "aws_ami_rhel_account_number" { From 35b2134c26239218fd0281b36468af38e62c71a6 Mon Sep 17 00:00:00 2001 From: Shuo Wu Date: Thu, 22 Feb 2024 20:09:06 +0800 Subject: [PATCH 17/42] integration: Improve DR volume with backup deletion case Longhorn 7997 Signed-off-by: Shuo Wu --- manager/integration/tests/test_basic.py | 79 ++++++++++++++++++------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index eb4835ee59..619e5ee420 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -985,26 +985,32 @@ def test_dr_volume_with_backup_block_deletion_abort_during_backup_in_progress(se check_volume_data(dr_vol, final_data, False) -def test_dr_volume_with_all_backup_blocks_deleted(set_random_backupstore, client, core_api, volume_name): # NOQA +def test_dr_volume_with_backup_and_backup_volume_deleted(set_random_backupstore, client, core_api, volume_name): # NOQA """ - Test DR volume can be activate after delete all backups. + Test DR volume can be activated after delete all backups. Context: - We want to make sure that DR volume can activate after delete all backups. + We want to make sure that DR volume can activate after deleting + some/all backups or the backup volume. Steps: 1. Create a volume and attach to the current node. 2. Write 4 MB to the beginning of the volume (2 x 2MB backup blocks). - 3. Create backup(0) of the volume. - 6. Verify backup block count == 2. - 7. Create DR volume from backup(0). - 8. Verify DR volume last backup is backup(0). - 9. Delete backup(0). - 10. Verify backup block count == 0. - 11. Verify DR volume last backup is empty. - 15. Activate and verify DR volume data is data(0). + 3. Create backup(0) then backup(1) for the volume. + 6. Verify backup block count == 4. + 7. Create DR volume(1) and DR volume(2) from backup(1). + 8. Verify DR volumes last backup is backup(1). + 9. Delete backup(1). + 10. Verify backup block count == 2. + 11. Verify DR volumes last backup becomes backup(0). + 12. Activate and verify DR volume(1) data is data(0). + 13. Delete backup(0). + 14. Verify backup block count == 0. + 15. Verify DR volume last backup is empty. + 16. Delete the backup volume. + 17. Activate and verify DR volume data is data(0). 
""" backupstore_cleanup(client) @@ -1020,30 +1026,57 @@ def test_dr_volume_with_all_backup_blocks_deleted(set_random_backupstore, client 'content': common.generate_random_data(2 * BACKUP_BLOCK_SIZE)} _, backup0, _, data0 = create_backup( client, volume_name, data0) + data1 = {'pos': 0, 'len': 2 * BACKUP_BLOCK_SIZE, + 'content': common.generate_random_data(2 * BACKUP_BLOCK_SIZE)} + _, backup1, _, data1 = create_backup( + client, volume_name, data1) backup_blocks_count = backupstore_count_backup_block_files(client, core_api, volume_name) - assert backup_blocks_count == 2 + assert backup_blocks_count == 4 - dr_vol_name = "dr-" + volume_name - client.create_volume(name=dr_vol_name, size=SIZE, - numberOfReplicas=2, fromBackup=backup0.url, + dr_vol_name1 = "dr-" + volume_name + "1" + dr_vol_name2 = "dr-" + volume_name + "2" + client.create_volume(name=dr_vol_name1, size=SIZE, + numberOfReplicas=2, fromBackup=backup1.url, frontend="", standby=True) - check_volume_last_backup(client, dr_vol_name, backup0.name) - wait_for_backup_restore_completed(client, dr_vol_name, backup0.name) + client.create_volume(name=dr_vol_name2, size=SIZE, + numberOfReplicas=2, fromBackup=backup1.url, + frontend="", standby=True) + check_volume_last_backup(client, dr_vol_name1, backup1.name) + wait_for_backup_restore_completed(client, dr_vol_name1, backup1.name) + check_volume_last_backup(client, dr_vol_name2, backup1.name) + wait_for_backup_restore_completed(client, dr_vol_name2, backup1.name) + + delete_backup(client, volume_name, backup1.name) + assert backupstore_count_backup_block_files(client, + core_api, + volume_name) == 2 + check_volume_last_backup(client, dr_vol_name1, backup0.name) + wait_for_backup_restore_completed(client, dr_vol_name1, backup0.name) + check_volume_last_backup(client, dr_vol_name2, backup0.name) + wait_for_backup_restore_completed(client, dr_vol_name2, backup0.name) + + activate_standby_volume(client, dr_vol_name1) + dr_vol1 = client.by_id_volume(dr_vol_name1) + dr_vol1.attach(hostId=host_id) + dr_vol1 = common.wait_for_volume_healthy(client, dr_vol_name1) + check_volume_data(dr_vol1, data0, False) delete_backup(client, volume_name, backup0.name) assert backupstore_count_backup_block_files(client, core_api, volume_name) == 0 - check_volume_last_backup(client, dr_vol_name, "") + check_volume_last_backup(client, dr_vol_name2, "") - activate_standby_volume(client, dr_vol_name) - dr_vol = client.by_id_volume(dr_vol_name) - dr_vol.attach(hostId=host_id) - dr_vol = common.wait_for_volume_healthy(client, dr_vol_name) - check_volume_data(dr_vol, data0, False) + delete_backup_volume(client, volume_name) + + activate_standby_volume(client, dr_vol_name2) + dr_vol2 = client.by_id_volume(dr_vol_name2) + dr_vol2.attach(hostId=host_id) + dr_vol2 = common.wait_for_volume_healthy(client, dr_vol_name2) + check_volume_data(dr_vol2, data0, False) def test_backup_volume_list(set_random_backupstore, client, core_api): # NOQA From d4fcafdd52ba543b1c3d112d3a1fa211e7c7c1f8 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 22 Feb 2024 20:02:53 +0800 Subject: [PATCH 18/42] Use longhorn repo for backup store creation ref: 8004 Signed-off-by: Chris --- manager/integration/README.md | 3 +- .../backupstores/minio-backupstore.yaml | 83 ------------------- .../deploy/backupstores/nfs-backupstore.yaml | 52 ------------ pipelines/gke/scripts/longhorn-setup.sh | 4 +- pipelines/utilities/install_backupstores.sh | 4 +- test_framework/scripts/longhorn-setup.sh | 4 +- 6 files changed, 8 insertions(+), 142 deletions(-) delete mode 
100644 manager/integration/deploy/backupstores/minio-backupstore.yaml delete mode 100644 manager/integration/deploy/backupstores/nfs-backupstore.yaml diff --git a/manager/integration/README.md b/manager/integration/README.md index 864698964f..77fcef7cf5 100644 --- a/manager/integration/README.md +++ b/manager/integration/README.md @@ -18,7 +18,8 @@ Requirement: Run the test: 1. Deploy all backupstore servers(including `NFS` server and `Minio` as s3 server) for test purposes. ``` -kubectl create -Rf integration/deploy/backupstores +kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml \ + -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml ``` 2. Deploy the test script to the Kubernetes cluster. ``` diff --git a/manager/integration/deploy/backupstores/minio-backupstore.yaml b/manager/integration/deploy/backupstores/minio-backupstore.yaml deleted file mode 100644 index 0654bfbab2..0000000000 --- a/manager/integration/deploy/backupstores/minio-backupstore.yaml +++ /dev/null @@ -1,83 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: minio-secret - namespace: default -type: Opaque -data: - AWS_ACCESS_KEY_ID: bG9uZ2hvcm4tdGVzdC1hY2Nlc3Mta2V5 # longhorn-test-access-key - AWS_SECRET_ACCESS_KEY: bG9uZ2hvcm4tdGVzdC1zZWNyZXQta2V5 # longhorn-test-secret-key - AWS_ENDPOINTS: aHR0cHM6Ly9taW5pby1zZXJ2aWNlLmRlZmF1bHQ6OTAwMA== # https://minio-service.default:9000 - AWS_CERT: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQU1kbzQycGhUZXlrMTcvYkxyWjVZRHN3RFFZSktvWklodmNOQVFFTEJRQXcKR2pFWU1CWUdBMVVFQ2hNUFRHOXVaMmh2Y200Z0xTQlVaWE4wTUNBWERUSXdNRFF5TnpJek1EQXhNVm9ZRHpJeApNakF3TkRBek1qTXdNREV4V2pBYU1SZ3dGZ1lEVlFRS0V3OU1iMjVuYUc5eWJpQXRJRlJsYzNRd2dnRWlNQTBHCkNTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWHpVdXJnUFpEZ3pUM0RZdWFlYmdld3Fvd2RlQUQKODRWWWF6ZlN1USs3K21Oa2lpUVBvelVVMmZvUWFGL1BxekJiUW1lZ29hT3l5NVhqM1VFeG1GcmV0eDBaRjVOVgpKTi85ZWFJNWRXRk9teHhpMElPUGI2T0RpbE1qcXVEbUVPSXljdjRTaCsvSWo5Zk1nS0tXUDdJZGxDNUJPeThkCncwOVdkckxxaE9WY3BKamNxYjN6K3hISHd5Q05YeGhoRm9tb2xQVnpJbnlUUEJTZkRuSDBuS0lHUXl2bGhCMGsKVHBHSzYxc2prZnFTK3hpNTlJeHVrbHZIRXNQcjFXblRzYU9oaVh6N3lQSlorcTNBMWZoVzBVa1JaRFlnWnNFbQovZ05KM3JwOFhZdURna2kzZ0UrOElXQWRBWHExeWhqRDdSSkI4VFNJYTV0SGpKUUtqZ0NlSG5HekFnTUJBQUdqCmF6QnBNQTRHQTFVZER3RUIvd1FFQXdJQ3BEQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBVEFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTURFR0ExVWRFUVFxTUNpQ0NXeHZZMkZzYUc5emRJSVZiV2x1YVc4dGMyVnlkbWxqWlM1awpaV1poZFd4MGh3Ui9BQUFCTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFDbUZMMzlNSHVZMzFhMTFEajRwMjVjCnFQRUM0RHZJUWozTk9kU0dWMmQrZjZzZ3pGejFXTDhWcnF2QjFCMVM2cjRKYjJQRXVJQkQ4NFlwVXJIT1JNU2MKd3ViTEppSEtEa0Jmb2U5QWI1cC9VakpyS0tuajM0RGx2c1cvR3AwWTZYc1BWaVdpVWorb1JLbUdWSTI0Q0JIdgpnK0JtVzNDeU5RR1RLajk0eE02czNBV2xHRW95YXFXUGU1eHllVWUzZjFBWkY5N3RDaklKUmVWbENtaENGK0JtCmFUY1RSUWN3cVdvQ3AwYmJZcHlERFlwUmxxOEdQbElFOW8yWjZBc05mTHJVcGFtZ3FYMmtYa2gxa3lzSlEralAKelFadHJSMG1tdHVyM0RuRW0yYmk0TktIQVFIcFc5TXUxNkdRakUxTmJYcVF0VEI4OGpLNzZjdEg5MzRDYWw2VgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0t - AWS_CERT_KEY: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRRFh6VXVyZ1BaRGd6VDMKRFl1YWViZ2V3cW93ZGVBRDg0VllhemZTdVErNyttTmtpaVFQb3pVVTJmb1FhRi9QcXpCYlFtZWdvYU95eTVYagozVUV4bUZyZXR4MFpGNU5WSk4vOWVhSTVkV0ZPbXh4aTBJT1BiNk9EaWxNanF1RG1FT0l5Y3Y0U2grL0lqOWZNCmdLS1dQN0lkbEM1Qk95OGR3MDlXZHJMcWhPVmNwSmpjcWIzeit4SEh3eUNOWHhoaEZvbW9sUFZ6SW55VFBCU2YKRG5IMG5LSUdReXZsaEIwa1RwR0s2MXNqa2ZxUyt4aTU5SXh1a2x2SEVzUHIxV25Uc2FPaGlYejd5UEpaK3EzQQoxZmhXMFVrUlpEWWdac0VtL2dOSjNycDhYWXVEZ2tpM2dFKzhJV0FkQVhxMXloakQ3UkpCOFRTSWE1dEhqSlFLCmpnQ2VIbkd6QWdNQkFBRUNnZ0VBZlVyQ1hrYTN0Q2JmZjNpcnp2cFFmZnVEbURNMzV0TmlYaDJTQVpSVW9FMFYKbSsvZ1UvdnIrN2s2eUgvdzhMOXhpZXFhQTljVkZkL0JuTlIrMzI2WGc2dEpCNko2ZGZxODJZdmZOZ0VDaUFMaQpqalNGemFlQmhnT3ZsWXZHbTR5OTU1Q0FGdjQ1cDNac1VsMTFDRXJlL1BGbGtaWHRHeGlrWFl6NC85UTgzblhZCnM2eDdPYTgyUjdwT2lraWh3Q0FvVTU3Rjc4ZWFKOG1xTmkwRlF2bHlxSk9QMTFCbVp4dm54ZU11S2poQjlPTnAKTFNwMWpzZXk5bDZNR2pVbjBGTG53RHZkVWRiK0ZlUEkxTjdWYUNBd3hJK3JHa3JTWkhnekhWWE92VUpON2t2QQpqNUZPNW9uNGgvK3hXbkYzM3lxZ0VvWWZ0MFFJL2pXS2NOV1d1a2pCd1FLQmdRRGVFNlJGRUpsT2Q1aVcxeW1qCm45RENnczVFbXFtRXN3WU95bkN3U2RhK1lNNnZVYmlac1k4WW9wMVRmVWN4cUh2NkFQWGpVd2NBUG1QVE9KRW8KMlJtS0xTYkhsTnc4bFNOMWJsWDBEL3Mzamc1R3VlVW9nbW5TVnhMa0h1OFhKR0o3VzFReEUzZG9IUHRrcTNpagpoa09QTnJpZFM0UmxqNTJwYkhscjUvQzRjUUtCZ1FENHhFYmpuck1heFV2b0xxVTRvT2xiOVc5UytSUllTc0cxCmxJUmgzNzZTV0ZuTTlSdGoyMTI0M1hkaE4zUFBtSTNNeiswYjdyMnZSUi9LMS9Cc1JUQnlrTi9kbkVuNVUxQkEKYm90cGZIS1Jvc1FUR1hIQkEvM0JrNC9qOWplU3RmVXgzZ2x3eUI0L2hORy9KM1ZVV2FXeURTRm5qZFEvcGJsRwp6VWlsSVBmK1l3S0JnUUNwMkdYYmVJMTN5TnBJQ3psS2JqRlFncEJWUWVDQ29CVHkvUHRncUtoM3BEeVBNN1kyCnZla09VMWgyQVN1UkhDWHRtQXgzRndvVXNxTFFhY1FEZEw4bXdjK1Y5eERWdU02TXdwMDBjNENVQmE1L2d5OXoKWXdLaUgzeFFRaVJrRTZ6S1laZ3JqSkxYYXNzT1BHS2cxbEFYV1NlckRaV3R3MEEyMHNLdXQ0NlEwUUtCZ0hGZQpxZHZVR0ZXcjhvTDJ0dzlPcmVyZHVJVTh4RnZVZmVFdHRRTVJ2N3pjRE5qT0gxUnJ4Wk9aUW0ySW92dkp6MTIyCnFKMWhPUXJtV3EzTHFXTCtTU3o4L3pqMG4vWERWVUIzNElzTFR2ODJDVnVXN2ZPRHlTSnVDRlpnZ0VVWkxZd3oKWDJRSm4xZGRSV1Z6S3hKczVJbDNXSERqL3dXZWxnaEJSOGtSZEZOM0FvR0FJNldDdjJQQ1lUS1ZZNjAwOFYwbgpyTDQ3YTlPanZ0Yy81S2ZxSjFpMkpKTUgyQi9jbU1WRSs4M2dpODFIU1FqMWErNnBjektmQVppZWcwRk9nL015ClB6VlZRYmpKTnY0QzM5KzdxSDg1WGdZTXZhcTJ0aDFEZWUvQ3NsMlM4QlV0cW5mc0VuMUYwcWhlWUJZb2RibHAKV3RUaE5oRi9oRVhzbkJROURyWkJKT1U9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K ---- -# same secret for longhorn-system namespace -apiVersion: v1 -kind: Secret -metadata: - name: minio-secret - namespace: longhorn-system -type: Opaque -data: - AWS_ACCESS_KEY_ID: bG9uZ2hvcm4tdGVzdC1hY2Nlc3Mta2V5 # longhorn-test-access-key - AWS_SECRET_ACCESS_KEY: bG9uZ2hvcm4tdGVzdC1zZWNyZXQta2V5 # longhorn-test-secret-key - AWS_ENDPOINTS: aHR0cHM6Ly9taW5pby1zZXJ2aWNlLmRlZmF1bHQ6OTAwMA== # https://minio-service.default:9000 - AWS_CERT: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURMRENDQWhTZ0F3SUJBZ0lSQU1kbzQycGhUZXlrMTcvYkxyWjVZRHN3RFFZSktvWklodmNOQVFFTEJRQXcKR2pFWU1CWUdBMVVFQ2hNUFRHOXVaMmh2Y200Z0xTQlVaWE4wTUNBWERUSXdNRFF5TnpJek1EQXhNVm9ZRHpJeApNakF3TkRBek1qTXdNREV4V2pBYU1SZ3dGZ1lEVlFRS0V3OU1iMjVuYUc5eWJpQXRJRlJsYzNRd2dnRWlNQTBHCkNTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWHpVdXJnUFpEZ3pUM0RZdWFlYmdld3Fvd2RlQUQKODRWWWF6ZlN1USs3K21Oa2lpUVBvelVVMmZvUWFGL1BxekJiUW1lZ29hT3l5NVhqM1VFeG1GcmV0eDBaRjVOVgpKTi85ZWFJNWRXRk9teHhpMElPUGI2T0RpbE1qcXVEbUVPSXljdjRTaCsvSWo5Zk1nS0tXUDdJZGxDNUJPeThkCncwOVdkckxxaE9WY3BKamNxYjN6K3hISHd5Q05YeGhoRm9tb2xQVnpJbnlUUEJTZkRuSDBuS0lHUXl2bGhCMGsKVHBHSzYxc2prZnFTK3hpNTlJeHVrbHZIRXNQcjFXblRzYU9oaVh6N3lQSlorcTNBMWZoVzBVa1JaRFlnWnNFbQovZ05KM3JwOFhZdURna2kzZ0UrOElXQWRBWHExeWhqRDdSSkI4VFNJYTV0SGpKUUtqZ0NlSG5HekFnTUJBQUdqCmF6QnBNQTRHQTFVZER3RUIvd1FFQXdJQ3BEQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0RBVEFQQmdOVkhSTUIKQWY4RUJUQURBUUgvTURFR0ExVWRFUVFxTUNpQ0NXeHZZMkZzYUc5emRJSVZiV2x1YVc4dGMyVnlkbWxqWlM1awpaV1poZFd4MGh3Ui9BQUFCTUEwR0NTcUdTSWIzRFFFQkN3VUFBNElCQVFDbUZMMzlNSHVZMzFhMTFEajRwMjVjCnFQRUM0RHZJUWozTk9kU0dWMmQrZjZzZ3pGejFXTDhWcnF2QjFCMVM2cjRKYjJQRXVJQkQ4NFlwVXJIT1JNU2MKd3ViTEppSEtEa0Jmb2U5QWI1cC9VakpyS0tuajM0RGx2c1cvR3AwWTZYc1BWaVdpVWorb1JLbUdWSTI0Q0JIdgpnK0JtVzNDeU5RR1RLajk0eE02czNBV2xHRW95YXFXUGU1eHllVWUzZjFBWkY5N3RDaklKUmVWbENtaENGK0JtCmFUY1RSUWN3cVdvQ3AwYmJZcHlERFlwUmxxOEdQbElFOW8yWjZBc05mTHJVcGFtZ3FYMmtYa2gxa3lzSlEralAKelFadHJSMG1tdHVyM0RuRW0yYmk0TktIQVFIcFc5TXUxNkdRakUxTmJYcVF0VEI4OGpLNzZjdEg5MzRDYWw2VgotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0t ---- -apiVersion: v1 -kind: Pod -metadata: - name: longhorn-test-minio - namespace: default - labels: - app: longhorn-test-minio -spec: - volumes: - - name: minio-volume - emptyDir: {} - - name: minio-certificates - secret: - secretName: minio-secret - items: - - key: AWS_CERT - path: public.crt - - key: AWS_CERT_KEY - path: private.key - - containers: - - name: minio - image: minio/minio:RELEASE.2022-02-01T18-00-14Z - command: ["sh", "-c", "mkdir -p /storage/backupbucket && mkdir -p /root/.minio/certs && ln -s /root/certs/private.key /root/.minio/certs/private.key && ln -s /root/certs/public.crt /root/.minio/certs/public.crt && exec minio server /storage"] - env: - - name: MINIO_ROOT_USER - valueFrom: - secretKeyRef: - name: minio-secret - key: AWS_ACCESS_KEY_ID - - name: MINIO_ROOT_PASSWORD - valueFrom: - secretKeyRef: - name: minio-secret - key: AWS_SECRET_ACCESS_KEY - ports: - - containerPort: 9000 - volumeMounts: - - name: minio-volume - mountPath: "/storage" - - name: minio-certificates - mountPath: "/root/certs" - readOnly: true ---- -apiVersion: v1 -kind: Service -metadata: - name: minio-service - namespace: default -spec: - selector: - app: longhorn-test-minio - ports: - - port: 9000 - targetPort: 9000 - protocol: TCP - sessionAffinity: ClientIP diff --git a/manager/integration/deploy/backupstores/nfs-backupstore.yaml b/manager/integration/deploy/backupstores/nfs-backupstore.yaml deleted file mode 100644 index e351c5075a..0000000000 --- a/manager/integration/deploy/backupstores/nfs-backupstore.yaml +++ /dev/null @@ -1,52 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: longhorn-test-nfs - namespace: default - labels: - app: longhorn-test-nfs -spec: - volumes: - - name: nfs-volume - emptyDir: {} - containers: - - name: longhorn-test-nfs-container - image: longhornio/nfs-ganesha:latest - imagePullPolicy: Always - env: - - name: EXPORT_ID - value: "14" - - name: EXPORT_PATH - value: /opt/backupstore - - name: PSEUDO_PATH - value: /opt/backupstore - - name: 
NFS_DISK_IMAGE_SIZE_MB - value: "4096" - command: ["bash", "-c", "chmod 700 /opt/backupstore && /opt/start_nfs.sh | tee /var/log/ganesha.log"] - securityContext: - privileged: true - capabilities: - add: ["SYS_ADMIN", "DAC_READ_SEARCH"] - volumeMounts: - - name: nfs-volume - mountPath: "/opt/backupstore" - livenessProbe: - exec: - command: ["bash", "-c", "grep \"No export entries found\" /var/log/ganesha.log > /dev/null 2>&1 ; [ $? -ne 0 ]"] - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 4 ---- -kind: Service -apiVersion: v1 -metadata: - name: longhorn-test-nfs-svc - namespace: default -spec: - selector: - app: longhorn-test-nfs - clusterIP: None - ports: - - name: notnecessary - port: 1234 - targetPort: 1234 diff --git a/pipelines/gke/scripts/longhorn-setup.sh b/pipelines/gke/scripts/longhorn-setup.sh index 163f9cc4bb..b9c6b6346c 100755 --- a/pipelines/gke/scripts/longhorn-setup.sh +++ b/pipelines/gke/scripts/longhorn-setup.sh @@ -139,8 +139,8 @@ create_longhorn_namespace(){ install_backupstores(){ - MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml" - NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml" + MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml" + NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml" kubectl create -f ${MINIO_BACKUPSTORE_URL} \ -f ${NFS_BACKUPSTORE_URL} } diff --git a/pipelines/utilities/install_backupstores.sh b/pipelines/utilities/install_backupstores.sh index 9ad06e8938..7f043e8a8d 100755 --- a/pipelines/utilities/install_backupstores.sh +++ b/pipelines/utilities/install_backupstores.sh @@ -1,6 +1,6 @@ install_backupstores(){ - MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml" - NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml" + MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml" + NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml" kubectl create -f ${MINIO_BACKUPSTORE_URL} \ -f ${NFS_BACKUPSTORE_URL} } \ No newline at end of file diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index 7782f99a25..8f7e3aa873 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -266,8 +266,8 @@ create_longhorn_namespace(){ install_backupstores(){ - MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/minio-backupstore.yaml" - NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn-tests/master/manager/integration/deploy/backupstores/nfs-backupstore.yaml" + MINIO_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml" + NFS_BACKUPSTORE_URL="https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml" kubectl create -f ${MINIO_BACKUPSTORE_URL} \ -f ${NFS_BACKUPSTORE_URL} } From 
082af42f29d95361498b02cbdc5fddc5c4e30b68 Mon Sep 17 00:00:00 2001 From: Roger Yao Date: Mon, 19 Feb 2024 17:26:18 +0800 Subject: [PATCH 19/42] test(manual): Sync up with backup target during DR volume activation longhorn/longhorn#5292 longhorn/longhorn#7945 Signed-off-by: Roger Yao --- ...-backup-target-during-dr-volume-activation.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/content/manual/pre-release/backup-and-restore/sync-up-with-backup-target-during-dr-volume-activation.md diff --git a/docs/content/manual/pre-release/backup-and-restore/sync-up-with-backup-target-during-dr-volume-activation.md b/docs/content/manual/pre-release/backup-and-restore/sync-up-with-backup-target-during-dr-volume-activation.md new file mode 100644 index 0000000000..d316e20ed9 --- /dev/null +++ b/docs/content/manual/pre-release/backup-and-restore/sync-up-with-backup-target-during-dr-volume-activation.md @@ -0,0 +1,16 @@ +--- +title: "Sync up with backup target during DR volume activation" +--- + +#### Related Issue: +- https://github.com/longhorn/longhorn/issues/5292 +- https://github.com/longhorn/longhorn/issues/7945 + +1. Launch 2 clusters and both have Longhorn installed +1. Set up a backup target. +1. Create a volume and write data in the `1st cluster`. Then create `1st backup`. +1. Restore the backup as a DR volume in the `2nd cluster`. +1. Modify the backup poll interval to a large value. +1. Write more data for the volume in the `1st cluster`, and create the `2nd backup`. +1. Activate the DR volume in the `2nd cluster`. Then verify the data +1. The activated DR volume should contain the latest data. From 278d89811c6e3598db0a95c4d2578859e7b79dda Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 19 Feb 2024 18:43:14 +0800 Subject: [PATCH 20/42] Add test case test_drain_with_block_for_eviction_success ref: 7521 Signed-off-by: Chris --- manager/integration/tests/common.py | 1 + manager/integration/tests/test_node.py | 151 ++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index 40580fda5e..aea87717b8 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -216,6 +216,7 @@ "allow-empty-node-selector-volume" SETTING_REPLICA_DISK_SOFT_ANTI_AFFINITY = "replica-disk-soft-anti-affinity" SETTING_ALLOW_EMPTY_DISK_SELECTOR_VOLUME = "allow-empty-disk-selector-volume" +SETTING_NODE_DRAIN_POLICY = "node-drain-policy" DEFAULT_BACKUP_COMPRESSION_METHOD = "lz4" BACKUP_COMPRESSION_METHOD_LZ4 = "lz4" diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 4f95978d4d..3f7c5fb70f 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -3,6 +3,7 @@ import os import subprocess import time +import yaml from random import choice from string import ascii_lowercase, digits @@ -47,8 +48,14 @@ from common import set_node_scheduling_eviction from common import update_node_disks from common import update_setting +from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 +from common import make_deployment_with_pvc # NOQA +from common import create_pv_for_volume +from common import create_pvc_for_volume, create_and_wait_deployment +from common import get_apps_api_client, write_pod_volume_random_data from backupstore import set_random_backupstore # NOQA +from concurrent.futures import ThreadPoolExecutor, TimeoutError CREATE_DEFAULT_DISK_LABEL = 
"node.longhorn.io/create-default-disk" @@ -2680,8 +2687,31 @@ def finalizer(): request.addfinalizer(finalizer) -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_success(): + +def drain_node(core_api, node): # NOQA + set_node_cordon(core_api, node.id, True) + + command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + subprocess.run(command, check=True) + + +def get_replica_detail(replica_name): + """ + Get allreplica information by this function + """ + command = ["kubectl", "get", + "replicas.longhorn.io", + "-n", + "longhorn-system", + replica_name, + "-o", + "yaml"] + output = subprocess.check_output(command, text=True) + replica_info = yaml.safe_load(output) + return replica_info + + +def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2693,7 +2723,6 @@ def test_drain_with_block_for_eviction_success(): 4. Write data to the volume. 5. Drain a node one of the volume's replicas is scheduled to. 6. While the drain is ongoing: - - Verify that the volume never becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true`. 7. Verify the drain completes. @@ -2703,6 +2732,122 @@ def test_drain_with_block_for_eviction_success(): 11. Verify that `replica.spec.evictionRequested == false`. 12. Verify the volume's data. """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + evict_target_node = evict_nodes[1] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume = client.create_volume(name=volume_name, + size=str(1 * Gi), + numberOfReplicas=3) + volume = common.wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": host_id} + + apps_api = get_apps_api_client() + create_and_wait_deployment(apps_api, deployment) + + pod_names = common.get_deployment_pod_names(core_api, deployment) + data_path = '/data/test' + write_pod_volume_random_data(core_api, + pod_names[0], + data_path, + DATA_SIZE_IN_MB_3) + expected_test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + volume = wait_for_volume_healthy(client, volume_name) + + # Make replica not locate on eviction target node + volume.updateReplicaCount(replicaCount=2) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + volume.replicaRemove(name=replica.name) + break + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 5 + # drain eviction source node + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_source_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert 
eviction_requested is True + + nodes = client.list_node() + for node in nodes: + if node.id == evict_source_node.id: + assert node.autoEvicting is True + + # Step 7 + thread_timeout = 60 + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + wait_for_volume_replica_count(client, volume_name, 2) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume = wait_for_volume_healthy(client, volume_name) + assert len(volume.replicas) == 2 + for replica in volume.replicas: + assert replica.hostId != evict_source_node.id + + # Stpe 10 + nodes = client.list_node() + for node in nodes: + assert node.autoEvicting is False + + # Step 11 + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == evict_target_node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is False + + # Step 12 + test_data_checksum = get_pod_data_md5sum(core_api, + pod_names[0], + data_path) + + assert expected_test_data_checksum == test_data_checksum + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(): From 936609351858754eee7e14cced2a91612f26f04e Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 20 Feb 2024 16:48:23 +0800 Subject: [PATCH 21/42] Add test case test_drain_with_block_for_eviction_if_contains_last_replica_success ref: 7521 Signed-off-by: Chris --- manager/integration/Dockerfile | 2 +- manager/integration/tests/common.py | 39 ++++ manager/integration/tests/test_node.py | 291 ++++++++++++++++++------- 3 files changed, 257 insertions(+), 75 deletions(-) diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 8541f5edd9..09ccfc9c2f 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -1,6 +1,6 @@ FROM registry.suse.com/bci/python:3.9 -ARG KUBECTL_VERSION=v1.17.0 +ARG KUBECTL_VERSION=v1.28.4 ARG YQ_VERSION=v4.24.2 ARG TERRAFORM_VERSION=1.3.5 ARG ARCH=amd64 diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index aea87717b8..e47f12136d 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -6123,3 +6123,42 @@ def wait_for_instance_manager_count(client, number, retry_counts=120): time.sleep(RETRY_INTERVAL_LONG) return len(ims) + + +def create_deployment_and_write_data(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc, # NOQA + volume_name, # NOQA + size, # NOQA + replica_count, # NOQA + data_size, # NOQA + attach_node_id=None): # NOQA + apps_api = get_apps_api_client() + volume = client.create_volume(name=volume_name, + size=size, + numberOfReplicas=replica_count) + volume = wait_for_volume_detached(client, volume_name) + + pvc_name = volume_name + "-pvc" + create_pv_for_volume(client, core_api, volume, volume_name) + create_pvc_for_volume(client, core_api, volume, pvc_name) + deployment_name = volume_name + "-dep" + deployment = make_deployment_with_pvc(deployment_name, pvc_name) + if attach_node_id: + deployment["spec"]["template"]["spec"]["nodeSelector"] \ + = {"kubernetes.io/hostname": attach_node_id} + + create_and_wait_deployment(apps_api, deployment) + + data_path = '/data/test' + deployment_pod_names = 
get_deployment_pod_names(core_api, + deployment) + write_pod_volume_random_data(core_api, + deployment_pod_names[0], + data_path, + data_size) + checksum = get_pod_data_md5sum(core_api, + deployment_pod_names[0], + data_path) + + return client.by_id_volume(volume_name), deployment_pod_names[0], checksum diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index 3f7c5fb70f..f062996a9b 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -50,9 +50,8 @@ from common import update_setting from common import SETTING_NODE_DRAIN_POLICY, DATA_SIZE_IN_MB_3 from common import make_deployment_with_pvc # NOQA -from common import create_pv_for_volume -from common import create_pvc_for_volume, create_and_wait_deployment -from common import get_apps_api_client, write_pod_volume_random_data +from common import prepare_host_disk, wait_for_volume_degraded +from common import create_deployment_and_write_data from backupstore import set_random_backupstore # NOQA from concurrent.futures import ThreadPoolExecutor, TimeoutError @@ -2691,7 +2690,15 @@ def finalizer(): def drain_node(core_api, node): # NOQA set_node_cordon(core_api, node.id, True) - command = ["kubectl", "drain", node.id, "--ignore-daemonsets"] + command = [ + "kubectl", + "drain", + node.id, + "--ignore-daemonsets", + "--delete-emptydir-data", + "--grace-period=-1" + ] + subprocess.run(command, check=True) @@ -2711,8 +2718,84 @@ def get_replica_detail(replica_name): return replica_info +def check_node_auto_evict_state(client, target_node, expect_state): # NOQA + def get_specific_node(client, target_node): + nodes = client.list_node() + for node in nodes: + if node.id == target_node.id: + return node + + for i in range(RETRY_COUNTS): + node = get_specific_node(client, target_node) + if node.autoEvicting is expect_state: + break + time.sleep(RETRY_INTERVAL) + assert node.autoEvicting is expect_state + + +def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + if replica.hostId == node.id: + replica_name = replica.name + break + + replica_info = get_replica_detail(replica_name) + eviction_requested = replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +def wait_drain_complete(future, timeout): + """ + Wait concurrent.futures object complete in a duration + """ + thread_timeout = timeout + try: + future.result(timeout=thread_timeout) + drain_complete = True + except TimeoutError: + print("drain node thread exceed timeout ({})s".format(thread_timeout)) + drain_complete = False + future.cancel() + finally: + assert drain_complete is True + + +def make_replica_on_specific_node(client, volume_name, node): # NOQA + volume = client.by_id_volume(volume_name) + volume.updateReplicaCount(replicaCount=1) + for replica in volume.replicas: + if replica.hostId != node.id: + volume.replicaRemove(name=replica.name) + wait_for_volume_replica_count(client, volume_name, 1) + + +def get_all_replica_name(client, volume_name): # NOQA + volume_replicas = [] + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + volume_replicas.append(replica.name) + + return volume_replicas + + +def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA + volume = client.by_id_volume(volume_name) + for replica in volume.replicas: + replica_info = get_replica_detail(replica.name) + eviction_requested = 
replica_info["spec"]["evictionRequested"] + assert eviction_requested is expect_state + + +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. + --- + Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2744,33 +2827,13 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma client.update(setting, value="block-for-eviction") # Step 2, 3, 4 - volume = client.create_volume(name=volume_name, - size=str(1 * Gi), - numberOfReplicas=3) - volume = common.wait_for_volume_detached(client, volume_name) - - pvc_name = volume_name + "-pvc" - create_pv_for_volume(client, core_api, volume, volume_name) - create_pvc_for_volume(client, core_api, volume, pvc_name) - deployment_name = volume_name + "-dep" - deployment = make_deployment_with_pvc(deployment_name, pvc_name) - deployment["spec"]["template"]["spec"]["nodeSelector"] \ - = {"kubernetes.io/hostname": host_id} - - apps_api = get_apps_api_client() - create_and_wait_deployment(apps_api, deployment) - - pod_names = common.get_deployment_pod_names(core_api, deployment) - data_path = '/data/test' - write_pod_volume_random_data(core_api, - pod_names[0], - data_path, - DATA_SIZE_IN_MB_3) - expected_test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], - data_path) - - volume = wait_for_volume_healthy(client, volume_name) + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA # Make replica not locate on eviction target node volume.updateReplicaCount(replicaCount=2) @@ -2787,33 +2850,11 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma future = executor.submit(drain_node, core_api, evict_source_node) # Step 6 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_source_node.id: - replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is True - - nodes = client.list_node() - for node in nodes: - if node.id == evict_source_node.id: - assert node.autoEvicting is True + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) # Step 7 - thread_timeout = 60 - try: - future.result(timeout=thread_timeout) - drain_complete = True - except TimeoutError: - print("drain node thread exceed timeout ({})s".format(thread_timeout)) - drain_complete = False - future.cancel() - finally: - assert drain_complete is True - + wait_drain_complete(future, 60) wait_for_volume_replica_count(client, volume_name, 2) # Step 8 @@ -2826,32 +2867,29 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert replica.hostId != evict_source_node.id # Stpe 10 - nodes = client.list_node() - for node in nodes: - assert node.autoEvicting is False + check_node_auto_evict_state(client, evict_source_node, False) # Step 11 - volume = client.by_id_volume(volume_name) - for replica in volume.replicas: - if replica.hostId == evict_target_node.id: - 
replica_name = replica.name - break - - replica_info = get_replica_detail(replica_name) - eviction_requested = replica_info["spec"]["evictionRequested"] - assert eviction_requested is False + check_replica_evict_state(client, volume_name, evict_target_node, False) # Step 12 + data_path = data_path = '/data/test' test_data_checksum = get_pod_data_md5sum(core_api, - pod_names[0], + pod, data_path) + assert checksum == test_data_checksum - assert expected_test_data_checksum == test_data_checksum - -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_if_contains_last_replica_success(): +@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA +def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA + core_api, # NOQA + make_deployment_with_pvc): # NOQA """ + Test case has the potential to drain node where backup store pods are + located. + In that case, test case will fail because backup store pods can only be + forcibly drained. + --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2864,7 +2902,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 4. Write data to the volumes. 5. Drain a node both volumes have a replica scheduled to. 6. While the drain is ongoing: - - Verify that the volume with one replica never becomes degraded. - Verify that the volume with three replicas becomes degraded. - Verify that `node.status.autoEvicting == true`. - Optionally verify that `replica.spec.evictionRequested == true` on the @@ -2880,6 +2917,112 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(): 12. Verify that `replica.spec.evictionRequested == false` on all replicas. 13. Verify the the data in both volumes. 
""" + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Create extra disk on current node + node = client.by_id_node(host_id) + disks = node.disks + + disk_volume_name = 'vol-disk' + disk_volume = client.create_volume(name=disk_volume_name, + size=str(2 * Gi), + numberOfReplicas=1, + dataLocality="strict-local") + disk_volume = wait_for_volume_detached(client, disk_volume_name) + + disk_volume.attach(hostId=host_id) + disk_volume = wait_for_volume_healthy(client, disk_volume_name) + disk_path = prepare_host_disk(get_volume_endpoint(disk_volume), + disk_volume_name) + disk = {"path": disk_path, "allowScheduling": True} + + update_disk = get_update_disks(disks) + update_disk["disk1"] = disk + + node = update_node_disks(client, node.name, disks=update_disk, retry=True) + node = wait_for_disk_update(client, host_id, len(update_disk)) + assert len(node.disks) == len(update_disk) + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction-if-contains-last-replica") + + # Step 2, 3 + volume1_name = "vol-1" + volume2_name = "vol-2" + volume1, pod1, checksum1 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume1_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + volume2, pod2, checksum2 = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume2_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, # NOQA + host_id) # NOQA + # Make volume 1 replica only located on evict_source_node + make_replica_on_specific_node(client, volume1_name, evict_source_node) + volume2_replicas = get_all_replica_name(client, volume2_name) + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume1_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + volume2 = wait_for_volume_degraded(client, volume2_name) + check_all_replicas_evict_state(client, volume2_name, False) + + # Step 7 + wait_drain_complete(future, 60) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + + # Step 9 + volume1 = client.by_id_volume(volume1_name) + assert len(volume1.replicas) == 1 + for replica in volume1.replicas: + assert replica.hostId != evict_source_node.id + + # Step 10 + # Verify volume2 replicas not moved by check replica name + # stored before the node drain + volume2 = wait_for_volume_healthy(client, volume2_name) + for replica in volume2.replicas: + assert replica.name in volume2_replicas + + # Step 11 + check_node_auto_evict_state(client, evict_source_node, False) + + # Step 12 + check_all_replicas_evict_state(client, volume1_name, False) + check_all_replicas_evict_state(client, volume2_name, False) + + # Step 13 + data_path = '/data/test' + test_data_checksum1 = get_pod_data_md5sum(core_api, + pod1, + data_path) + assert checksum1 == test_data_checksum1 + + test_data_checksum2 = get_pod_data_md5sum(core_api, + pod2, + data_path) + assert checksum2 == test_data_checksum2 + @pytest.mark.skip(reason="TODO") # NOQA def test_drain_with_block_for_eviction_failure(): From 75dfb4337ff09ca28ae03dfec3e0d0039c8e3b72 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 23 Feb 2024 17:48:15 +0800 Subject: [PATCH 22/42] Add test case test_drain_with_block_for_eviction_failure ref: 7521 
Signed-off-by: Chris --- manager/integration/tests/test_node.py | 82 +++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/manager/integration/tests/test_node.py b/manager/integration/tests/test_node.py index f062996a9b..7e199e4f5b 100644 --- a/manager/integration/tests/test_node.py +++ b/manager/integration/tests/test_node.py @@ -2745,10 +2745,19 @@ def check_replica_evict_state(client, volume_name, node, expect_state): # NOQA assert eviction_requested is expect_state -def wait_drain_complete(future, timeout): +def wait_drain_complete(future, timeout, copmpleted=True): """ Wait concurrent.futures object complete in a duration """ + def stop_drain_process(): + """ + Both future.cancel() and executer.shutdown(wait=False) can not really + stop the drain process. + Use this function to stop drain process + """ + command = ["pkill", "-f", "kubectl drain"] + subprocess.check_output(command, text=True) + thread_timeout = timeout try: future.result(timeout=thread_timeout) @@ -2756,9 +2765,9 @@ def wait_drain_complete(future, timeout): except TimeoutError: print("drain node thread exceed timeout ({})s".format(thread_timeout)) drain_complete = False - future.cancel() + stop_drain_process() finally: - assert drain_complete is True + assert drain_complete is copmpleted def make_replica_on_specific_node(client, volume_name, node): # NOQA @@ -2787,15 +2796,11 @@ def check_all_replicas_evict_state(client, volume_name, expect_state): # NOQA assert eviction_requested is expect_state -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA -def test_drain_with_block_for_eviction_success(client, core_api, volume_name, make_deployment_with_pvc): # NOQA +def test_drain_with_block_for_eviction_success(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. - --- - Test drain completes after evicting replica with node-drain-policy block-for-eviction @@ -2880,16 +2885,10 @@ def test_drain_with_block_for_eviction_success(client, core_api, volume_name, ma assert checksum == test_data_checksum -@pytest.mark.skip(reason="Can not run when in-cluster backup store pod exist") # NOQA def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # NOQA core_api, # NOQA make_deployment_with_pvc): # NOQA """ - Test case has the potential to drain node where backup store pods are - located. - In that case, test case will fail because backup store pods can only be - forcibly drained. 
- --- Test drain completes after evicting replicas with node-drain-policy block-for-eviction-if-contains-last-replica @@ -2921,7 +2920,6 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, nodes = client.list_node() evict_nodes = [node for node in nodes if node.id != host_id][:2] evict_source_node = evict_nodes[0] - # Create extra disk on current node node = client.by_id_node(host_id) disks = node.disks @@ -2993,7 +2991,7 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, # Step 9 volume1 = client.by_id_volume(volume1_name) - assert len(volume1.replicas) == 1 + wait_for_volume_replica_count(client, volume1_name, 1) for replica in volume1.replicas: assert replica.hostId != evict_source_node.id @@ -3024,8 +3022,10 @@ def test_drain_with_block_for_eviction_if_contains_last_replica_success(client, assert checksum2 == test_data_checksum2 -@pytest.mark.skip(reason="TODO") # NOQA -def test_drain_with_block_for_eviction_failure(): +def test_drain_with_block_for_eviction_failure(client, # NOQA + core_api, # NOQA + volume_name, # NOQA + make_deployment_with_pvc): # NOQA """ Test drain never completes with node-drain-policy block-for-eviction @@ -3040,7 +3040,47 @@ def test_drain_with_block_for_eviction_failure(): - Verify that `node.status.autoEvicting == true`. - Verify that `replica.spec.evictionRequested == true`. 7. Verify the drain never completes. + 8. Stop the drain, check volume is healthy and data correct """ + host_id = get_self_host_id() + nodes = client.list_node() + evict_nodes = [node for node in nodes if node.id != host_id][:2] + evict_source_node = evict_nodes[0] + + # Step 1 + setting = client.by_id_setting( + SETTING_NODE_DRAIN_POLICY) + client.update(setting, value="block-for-eviction") + + # Step 2, 3, 4 + volume, pod, checksum = create_deployment_and_write_data(client, + core_api, + make_deployment_with_pvc, # NOQA + volume_name, + str(1 * Gi), + 3, + DATA_SIZE_IN_MB_3, host_id) # NOQA + + # Step 5 + executor = ThreadPoolExecutor(max_workers=5) + future = executor.submit(drain_node, core_api, evict_source_node) + + # Step 6 + check_replica_evict_state(client, volume_name, evict_source_node, True) + check_node_auto_evict_state(client, evict_source_node, True) + + # Step 7 + wait_drain_complete(future, 90, False) + + # Step 8 + set_node_cordon(core_api, evict_source_node.id, False) + wait_for_volume_healthy(client, volume_name) + data_path = '/data/test' + test_data_checksum = get_pod_data_md5sum(core_api, + pod, + data_path) + assert checksum == test_data_checksum + @pytest.mark.node # NOQA def test_auto_detach_volume_when_node_is_cordoned(client, core_api, volume_name): # NOQA From a94d8f4b42e2d909d4a863da0a5f5b98cd7d498a Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:27:41 +0000 Subject: [PATCH 23/42] chore(deps): update minor dependencies --- e2e/requirements.txt | 6 +++--- engine/validation-test/requirements.txt | 10 +++++----- test_framework/terraform/azure/aks/main.tf | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/e2e/requirements.txt b/e2e/requirements.txt index 81eac4a39a..2963efe495 100644 --- a/e2e/requirements.txt +++ b/e2e/requirements.txt @@ -1,8 +1,8 @@ robotframework==6.1.1 -argcomplete==1.10.0 -directio==1.2 +argcomplete==1.12.3 +directio==1.3 flake8 kubernetes==27.2.0 requests==2.31.0 -boto3==1.26.86 +boto3==1.34.49 pyyaml==6.0.1 diff --git a/engine/validation-test/requirements.txt 
b/engine/validation-test/requirements.txt index a8a1d16462..709f1b3cb9 100644 --- a/engine/validation-test/requirements.txt +++ b/engine/validation-test/requirements.txt @@ -1,10 +1,10 @@ -flake8==2.5.1 +flake8==2.6.2 paramiko pytest==2.9.2 pytest-xdist -requests==2.20.0 +requests==2.31.0 cattle==0.5.1 -selenium==2.33.0 -websocket-client==0.23.0 -docker-py==1.2.3 +selenium==2.53.6 +websocket-client==0.59.0 +docker-py==1.10.6 boto diff --git a/test_framework/terraform/azure/aks/main.tf b/test_framework/terraform/azure/aks/main.tf index 35376b8faa..368f1dd43e 100644 --- a/test_framework/terraform/azure/aks/main.tf +++ b/test_framework/terraform/azure/aks/main.tf @@ -2,7 +2,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "=3.0.0" + version = "3.93.0" } } } From b2f6a7a751be1082c9ca42e892d1494b549b0b6f Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:27:38 +0000 Subject: [PATCH 24/42] chore(deps): update docker dependencies --- e2e/Dockerfile | 2 +- manager/integration/Dockerfile | 2 +- mirror_csi_images/Dockerfile.setup | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/Dockerfile b/e2e/Dockerfile index f2f69efa65..8a28d9e1a9 100644 --- a/e2e/Dockerfile +++ b/e2e/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.suse.com/bci/python:3.9 +FROM registry.suse.com/bci/python:3.11 ARG KUBECTL_VERSION=v1.17.0 ARG YQ_VERSION=v4.24.2 diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 09ccfc9c2f..8ce1837528 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.suse.com/bci/python:3.9 +FROM registry.suse.com/bci/python:3.11 ARG KUBECTL_VERSION=v1.28.4 ARG YQ_VERSION=v4.24.2 diff --git a/mirror_csi_images/Dockerfile.setup b/mirror_csi_images/Dockerfile.setup index 9f4511b6c3..b872981e90 100644 --- a/mirror_csi_images/Dockerfile.setup +++ b/mirror_csi_images/Dockerfile.setup @@ -6,6 +6,6 @@ WORKDIR $WORKSPACE RUN apk add --no-cache skopeo docker jq bash grep -COPY --from=docker/buildx-bin:v0.8 /buildx /usr/libexec/docker/cli-plugins/docker-buildx +COPY --from=docker/buildx-bin:v0.12 /buildx /usr/libexec/docker/cli-plugins/docker-buildx COPY [".", "$WORKSPACE"] From d7ca22f17cc7a4ae0c80f742f1abc1d25b1ce4d6 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:58:45 +0000 Subject: [PATCH 25/42] chore(deps): update dependency cattle to v0.5.4 --- engine/validation-test/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/validation-test/requirements.txt b/engine/validation-test/requirements.txt index 709f1b3cb9..c7a9aedfcb 100644 --- a/engine/validation-test/requirements.txt +++ b/engine/validation-test/requirements.txt @@ -3,7 +3,7 @@ paramiko pytest==2.9.2 pytest-xdist requests==2.31.0 -cattle==0.5.1 +cattle==0.5.4 selenium==2.53.6 websocket-client==0.59.0 docker-py==1.10.6 From 596791903de74f9e91c7ded0f4e3ea841b0fa123 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 6 Nov 2023 18:24:59 +0800 Subject: [PATCH 26/42] ci: add pipeline for storage network Signed-off-by: Yang Chiu --- pipelines/storage_network/Dockerfile.setup | 31 ++ pipelines/storage_network/Jenkinsfile | 218 +++++++++++++ pipelines/storage_network/scripts/build.sh | 3 + pipelines/storage_network/scripts/cleanup.sh | 14 + .../scripts/download-support-bundle.sh | 31 ++ .../storage_network/scripts/longhorn-setup.sh | 58 
++++ .../scripts/terraform-setup.sh | 13 + pipelines/storage_network/terraform/data.tf | 52 +++ pipelines/storage_network/terraform/main.tf | 308 ++++++++++++++++++ pipelines/storage_network/terraform/output.tf | 7 + .../user-data-scripts/flannel.sh.tpl | 19 ++ .../provision_k3s_agent.sh.tpl | 45 +++ .../provision_k3s_server.sh.tpl | 53 +++ .../terraform/user-data-scripts/routes.sh.tpl | 18 + .../storage_network/terraform/variables.tf | 113 +++++++ pipelines/utilities/storage_network.sh | 52 +++ 16 files changed, 1035 insertions(+) create mode 100644 pipelines/storage_network/Dockerfile.setup create mode 100644 pipelines/storage_network/Jenkinsfile create mode 100755 pipelines/storage_network/scripts/build.sh create mode 100755 pipelines/storage_network/scripts/cleanup.sh create mode 100755 pipelines/storage_network/scripts/download-support-bundle.sh create mode 100755 pipelines/storage_network/scripts/longhorn-setup.sh create mode 100755 pipelines/storage_network/scripts/terraform-setup.sh create mode 100644 pipelines/storage_network/terraform/data.tf create mode 100644 pipelines/storage_network/terraform/main.tf create mode 100644 pipelines/storage_network/terraform/output.tf create mode 100644 pipelines/storage_network/terraform/user-data-scripts/flannel.sh.tpl create mode 100755 pipelines/storage_network/terraform/user-data-scripts/provision_k3s_agent.sh.tpl create mode 100755 pipelines/storage_network/terraform/user-data-scripts/provision_k3s_server.sh.tpl create mode 100644 pipelines/storage_network/terraform/user-data-scripts/routes.sh.tpl create mode 100644 pipelines/storage_network/terraform/variables.tf create mode 100755 pipelines/utilities/storage_network.sh diff --git a/pipelines/storage_network/Dockerfile.setup b/pipelines/storage_network/Dockerfile.setup new file mode 100644 index 0000000000..4e7e9b10e1 --- /dev/null +++ b/pipelines/storage_network/Dockerfile.setup @@ -0,0 +1,31 @@ +From alpine:latest + +ARG KUBECTL_VERSION=v1.20.2 + +ARG RKE_VERSION=v1.3.4 + +ARG TERRAFORM_VERSION=1.3.5 + +ARG YQ_VERSION=v4.24.2 + +ENV WORKSPACE /src/longhorn-tests + +WORKDIR $WORKSPACE + +RUN wget -q https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_VERSION/bin/linux/amd64/kubectl && \ + mv kubectl /usr/local/bin/kubectl && \ + chmod +x /usr/local/bin/kubectl && \ + wget -q https://github.com/rancher/rke/releases/download/$RKE_VERSION/rke_linux-amd64 && \ + mv rke_linux-amd64 /usr/bin/rke && \ + chmod +x /usr/bin/rke && \ + wget -q https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + unzip terraform_${TERRAFORM_VERSION}_linux_amd64.zip && rm terraform_${TERRAFORM_VERSION}_linux_amd64.zip && \ + mv terraform /usr/bin/terraform && \ + chmod +x /usr/bin/terraform && \ + wget -q "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" && \ + mv yq_linux_amd64 /usr/local/bin/yq && \ + chmod +x /usr/local/bin/yq && \ + apk add openssl openssh-client ca-certificates git rsync bash curl jq python3 py3-pip aws-cli && \ + ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa + +COPY [".", "$WORKSPACE"] diff --git a/pipelines/storage_network/Jenkinsfile b/pipelines/storage_network/Jenkinsfile new file mode 100644 index 0000000000..0824204ccf --- /dev/null +++ b/pipelines/storage_network/Jenkinsfile @@ -0,0 +1,218 @@ +def imageName = "${JOB_BASE_NAME}-${env.BUILD_NUMBER}" +def summary +def WORKSPACE = "/src/longhorn-tests" +def BUILD_TRIGGER_BY = "\n${currentBuild.getBuildCauses()[0].shortDescription}" + 
+// define optional parameters +def SELINUX_MODE = params.SELINUX_MODE ? params.SELINUX_MODE : "" + +def CREDS_ID = JOB_BASE_NAME == "longhorn-tests-regression" ? "AWS_CREDS_RANCHER_QA" : "AWS_CREDS" +def REGISTRATION_CODE_ID = params.ARCH == "amd64" ? "REGISTRATION_CODE" : "REGISTRATION_CODE_ARM64" + +// parameters for air gap installation +def AIR_GAP_INSTALLATION = params.AIR_GAP_INSTALLATION ? params.AIR_GAP_INSTALLATION : false +def LONGHORN_INSTALL_VERSION = params.LONGHORN_INSTALL_VERSION ? params.LONGHORN_INSTALL_VERSION : "master" +def LONGHORN_TRANSIENT_VERSION = params.LONGHORN_TRANSIENT_VERSION ? params.LONGHORN_TRANSIENT_VERSION : "" +def CIS_HARDENING = params.CIS_HARDENING ? params.CIS_HARDENING : false +def REGISTRY_URL +def REGISTRY_USERNAME +def REGISTRY_PASSWORD + +// parameter for hdd test +def USE_HDD = params.USE_HDD ? params.USE_HDD : false + +node { + + withCredentials([ + usernamePassword(credentialsId: CREDS_ID, passwordVariable: 'AWS_SECRET_KEY', usernameVariable: 'AWS_ACCESS_KEY'), + string(credentialsId: 'DO_CREDS', variable: 'DO_TOKEN'), + string(credentialsId: REGISTRATION_CODE_ID, variable: 'REGISTRATION_CODE'), + ]) { + + if (params.SEND_SLACK_NOTIFICATION) { + notifyBuild('STARTED', BUILD_TRIGGER_BY, params.NOTIFY_SLACK_CHANNEL) + } + + checkout scm + + try { + + if (params.AIR_GAP_INSTALLATION) { + + stage('airgap build') { + sh "airgap/scripts/build.sh" + sh """ docker run -itd --name airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} \ + --env TF_VAR_longhorn_version=${LONGHORN_INSTALL_VERSION} \ + --env TF_VAR_do_token=${DO_TOKEN} \ + --env TF_VAR_aws_access_key=${AWS_ACCESS_KEY} \ + --env TF_VAR_aws_secret_key=${AWS_SECRET_KEY} \ + airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} + """ + } + + stage ('airgap setup') { + sh "docker exec airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} ./airgap/scripts/terraform-setup.sh" + REGISTRY_URL = sh ( + script: "docker exec airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} terraform -chdir=./airgap/terraform output -raw registry_url", + returnStdout: true + ) + println REGISTRY_URL + REGISTRY_USERNAME = sh ( + script: "docker exec airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} terraform -chdir=./airgap/terraform output -raw registry_username", + returnStdout: true + ) + REGISTRY_PASSWORD = sh ( + script: "docker exec airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} terraform -chdir=./airgap/terraform output -raw registry_password", + returnStdout: true + ) + } + + } + + stage('build') { + + echo "Using credentials: $CREDS_ID" + echo "Using registration code: $REGISTRATION_CODE_ID" + + sh "pipelines/storage_network/scripts/build.sh" + sh """ docker run -itd --name ${JOB_BASE_NAME}-${BUILD_NUMBER} \ + --env AIR_GAP_INSTALLATION=${AIR_GAP_INSTALLATION} \ + --env REGISTRY_URL=${REGISTRY_URL} \ + --env REGISTRY_USERNAME=${REGISTRY_USERNAME} \ + --env REGISTRY_PASSWORD=${REGISTRY_PASSWORD} \ + --env LONGHORN_INSTALL_VERSION=${LONGHORN_INSTALL_VERSION} \ + --env CUSTOM_LONGHORN_ENGINE_IMAGE=${CUSTOM_LONGHORN_ENGINE_IMAGE} \ + --env CUSTOM_LONGHORN_INSTANCE_MANAGER_IMAGE=${CUSTOM_LONGHORN_INSTANCE_MANAGER_IMAGE} \ + --env CUSTOM_LONGHORN_MANAGER_IMAGE=${CUSTOM_LONGHORN_MANAGER_IMAGE} \ + --env CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE=${CUSTOM_LONGHORN_SHARE_MANAGER_IMAGE} \ + --env CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE=${CUSTOM_LONGHORN_BACKING_IMAGE_MANAGER_IMAGE} \ + --env LONGHORN_TESTS_CUSTOM_IMAGE=${LONGHORN_TESTS_CUSTOM_IMAGE} \ + --env DISTRO=${DISTRO} \ + --env LONGHORN_REPO_URI=${LONGHORN_REPO_URI} \ + --env LONGHORN_REPO_BRANCH=${LONGHORN_REPO_BRANCH} \ + --env 
LONGHORN_STABLE_VERSION=${LONGHORN_STABLE_VERSION} \ + --env LONGHORN_TRANSIENT_VERSION=${LONGHORN_TRANSIENT_VERSION} \ + --env LONGHORN_TEST_CLOUDPROVIDER=${LONGHORN_TEST_CLOUDPROVIDER} \ + --env LONGHORN_UPGRADE_TEST=${LONGHORN_UPGRADE_TEST} \ + --env PYTEST_CUSTOM_OPTIONS="${PYTEST_CUSTOM_OPTIONS}" \ + --env BACKUP_STORE_TYPE="${BACKUP_STORE_TYPE}" \ + --env TF_VAR_use_hdd=${USE_HDD} \ + --env TF_VAR_arch=${ARCH} \ + --env TF_VAR_k8s_distro_name=${K8S_DISTRO_NAME} \ + --env TF_VAR_k8s_distro_version=${K8S_DISTRO_VERSION} \ + --env TF_VAR_aws_availability_zone=${AWS_AVAILABILITY_ZONE} \ + --env TF_VAR_aws_region=${AWS_REGION} \ + --env TF_VAR_os_distro_version=${DISTRO_VERSION} \ + --env TF_VAR_do_token=${env.TF_VAR_do_token} \ + --env TF_VAR_aws_access_key=${AWS_ACCESS_KEY} \ + --env TF_VAR_lh_aws_instance_name_controlplane="${JOB_BASE_NAME}-ctrl" \ + --env TF_VAR_lh_aws_instance_name_worker="${JOB_BASE_NAME}-wrk" \ + --env TF_VAR_lh_aws_instance_type_controlplane=${CONTROLPLANE_INSTANCE_TYPE} \ + --env TF_VAR_lh_aws_instance_type_worker=${WORKER_INSTANCE_TYPE}\ + --env TF_VAR_aws_secret_key=${AWS_SECRET_KEY} \ + --env AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY} \ + --env AWS_SECRET_ACCESS_KEY=${AWS_SECRET_KEY} \ + --env AWS_DEFAULT_REGION=${AWS_REGION} \ + --env TF_VAR_selinux_mode=${SELINUX_MODE} \ + --env TF_VAR_registration_code=${REGISTRATION_CODE} \ + --env TF_VAR_cis_hardening=${CIS_HARDENING} \ + --env TF_VAR_mtu=${MTU_SIZE} \ + --env TF_VAR_multus_version=${MULTUS_VERSION} \ + --env TF_VAR_thick_plugin=${THICK_PLUGIN} \ + ${imageName} + """ + } + + timeout(60) { + stage ('terraform') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/storage_network/scripts/terraform-setup.sh" + } + } + + stage ('longhorn setup & tests') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/storage_network/scripts/longhorn-setup.sh" + } + + stage ('download support bundle') { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/storage_network/scripts/download-support-bundle.sh ${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip" + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/${JOB_BASE_NAME}-${BUILD_NUMBER}-bundle.zip ." + archiveArtifacts allowEmptyArchive: true, artifacts: '**/*.zip', followSymlinks: false + } + + stage ('report generation') { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-junit-report.xml ." + + if(params.LONGHORN_UPGRADE_TEST && params.LONGHORN_TRANSIENT_VERSION) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-transient-junit-report.xml ." + summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-upgrade-from-transient-junit-report.xml, longhorn-test-junit-report.xml' + } + else if(params.LONGHORN_UPGRADE_TEST) { + sh "docker cp ${JOB_BASE_NAME}-${BUILD_NUMBER}:${WORKSPACE}/longhorn-test-upgrade-from-stable-junit-report.xml ." 
+ summary = junit 'longhorn-test-upgrade-from-stable-junit-report.xml, longhorn-test-junit-report.xml' + } + else { + summary = junit 'longhorn-test-junit-report.xml' + } + } + + } catch (e) { + currentBuild.result = "FAILED" + throw e + } finally { + stage ('releasing resources') { + if (sh (script: "docker container inspect airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} > /dev/null 2>&1", returnStatus: true) == 0) { + sh "docker exec airgap-${JOB_BASE_NAME}-${BUILD_NUMBER} ./airgap/scripts/cleanup.sh" + sh "docker stop airgap-${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rm -v airgap-${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rmi airgap-${JOB_BASE_NAME}-${BUILD_NUMBER}" + } + + if (sh (script: "docker container inspect ${JOB_BASE_NAME}-${BUILD_NUMBER} > /dev/null 2>&1", returnStatus: true) == 0) { + sh "docker exec ${JOB_BASE_NAME}-${BUILD_NUMBER} pipelines/storage_network/scripts/cleanup.sh" + sh "docker stop ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rm -v ${JOB_BASE_NAME}-${BUILD_NUMBER}" + sh "docker rmi ${imageName}" + } + + if (summary) { + summary_msg = "\nTest Summary - Failures: ${summary.failCount}, Skipped: ${summary.skipCount}, Passed: ${summary.passCount} -- Job completed in ${currentBuild.durationString.replace(' and counting', '')}" + } else { + summary_msg = "\n Test Failed: No Junit report" + } + + if(params.SEND_SLACK_NOTIFICATION){ + notifyBuild(currentBuild.result, summary_msg, params.NOTIFY_SLACK_CHANNEL) + } + } + } + } + +} + + +def notifyBuild(String buildStatus = 'STARTED', String summary_msg, String slack_channel) { + // build status of null means successful + buildStatus = buildStatus ?: 'SUCCESSFUL' + + // Default values + def colorName = 'RED' + def colorCode = '#FF0000' + def subject = "${buildStatus}: Job '${env.JOB_BASE_NAME} [${env.BUILD_NUMBER}]'" + def summary = "${subject} (${env.BUILD_URL})" + summary_msg + + // Override default values based on build status + if (buildStatus == 'STARTED') { + color = 'YELLOW' + colorCode = '#FFFF00' + } else if (buildStatus == 'SUCCESSFUL') { + color = 'GREEN' + colorCode = '#00FF00' + } else { + color = 'RED' + colorCode = '#FF0000' + } + + // Send notifications + slackSend (color: colorCode, message: summary, channel: slack_channel, tokenCredentialId: 'longhorn-tests-slack-token') +} diff --git a/pipelines/storage_network/scripts/build.sh b/pipelines/storage_network/scripts/build.sh new file mode 100755 index 0000000000..74f825bdef --- /dev/null +++ b/pipelines/storage_network/scripts/build.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +docker build --no-cache -f ./pipelines/storage_network/Dockerfile.setup -t "${JOB_BASE_NAME}-${BUILD_NUMBER}" . 
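For readers following this new pipeline, the Jenkins stages above reduce to a short sequence of shell calls. The sketch below is an illustrative outline of that flow for a manual run, not part of the patch itself: the `JOB_BASE_NAME`/`BUILD_NUMBER` values are placeholders, and only a trimmed subset of the `--env` inputs from the Jenkinsfile `build` stage is shown, so the remaining `TF_VAR_*` variables and credentials are assumed to be exported already.

```bash
#!/bin/bash
# Illustrative outline of the storage-network pipeline stages (sketch only).
# Assumes the repo is checked out, docker is available, and AWS credentials plus
# the remaining TF_VAR_* inputs listed in the Jenkinsfile "build" stage are exported.
set -e

export JOB_BASE_NAME="storage-network-dev"   # placeholder job name
export BUILD_NUMBER="0"                      # placeholder build number
CONTAINER="${JOB_BASE_NAME}-${BUILD_NUMBER}"

# stage: build -- build the setup image and start the long-running container
pipelines/storage_network/scripts/build.sh
docker run -itd --name "${CONTAINER}" \
  --env TF_VAR_aws_access_key="${AWS_ACCESS_KEY}" \
  --env TF_VAR_aws_secret_key="${AWS_SECRET_KEY}" \
  --env TF_VAR_mtu="${MTU_SIZE}" \
  --env TF_VAR_multus_version="${MULTUS_VERSION}" \
  --env TF_VAR_thick_plugin="${THICK_PLUGIN}" \
  "${CONTAINER}"

# stage: terraform, then longhorn setup & tests
docker exec "${CONTAINER}" pipelines/storage_network/scripts/terraform-setup.sh
docker exec "${CONTAINER}" pipelines/storage_network/scripts/longhorn-setup.sh

# stage: download support bundle, then release resources
docker exec "${CONTAINER}" pipelines/storage_network/scripts/download-support-bundle.sh "${CONTAINER}-bundle.zip"
docker cp "${CONTAINER}:/src/longhorn-tests/${CONTAINER}-bundle.zip" .
docker exec "${CONTAINER}" pipelines/storage_network/scripts/cleanup.sh
docker stop "${CONTAINER}" && docker rm -v "${CONTAINER}"
```

As in the Jenkinsfile's `releasing resources` stage, the cleanup step should be run unconditionally so the terraform-provisioned instances are destroyed even when the tests fail.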
diff --git a/pipelines/storage_network/scripts/cleanup.sh b/pipelines/storage_network/scripts/cleanup.sh new file mode 100755 index 0000000000..6d41aeeabd --- /dev/null +++ b/pipelines/storage_network/scripts/cleanup.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# terminate any terraform processes +TERRAFORM_PIDS=( `ps aux | grep -i terraform | grep -v grep | awk '{printf("%s ",$1)}'` ) +if [[ -n ${TERRAFORM_PIDS[@]} ]] ; then + for PID in "${TERRAFORM_PIDS[@]}"; do + kill "${TERRAFORM_PIDS}" + done +fi + +# wait 30 seconds for graceful terraform termination +sleep 30 + +terraform -chdir=pipelines/storage_network/terraform destroy -auto-approve -no-color diff --git a/pipelines/storage_network/scripts/download-support-bundle.sh b/pipelines/storage_network/scripts/download-support-bundle.sh new file mode 100755 index 0000000000..1bac81d5e0 --- /dev/null +++ b/pipelines/storage_network/scripts/download-support-bundle.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -e + +SUPPORT_BUNDLE_FILE_NAME=${1:-"lh-support-bundle.zip"} +SUPPORT_BUNDLE_ISSUE_URL=${2:-""} +SUPPORT_BUNDLE_ISSUE_DESC=${3:-"Auto-generated support bundle"} + +set_kubeconfig_envvar(){ + export KUBECONFIG="${PWD}/pipelines/storage_network/terraform/k3s.yaml" +} + +set_kubeconfig_envvar + +LH_FRONTEND_ADDR=`kubectl get svc -n longhorn-system longhorn-frontend -o json | jq -r '.spec.clusterIP + ":" + (.spec.ports[0].port|tostring)'` + +JSON_PAYLOAD="{\"issueURL\": \"${SUPPORT_BUNDLE_ISSUE_DESC}\", \"description\": \"${SUPPORT_BUNDLE_ISSUE_DESC}\"}" + +CURL_CMD="curl -XPOST http://${LH_FRONTEND_ADDR}/v1/supportbundles -H 'Accept: application/json' -H 'Accept-Encoding: gzip, deflate' -d '"${JSON_PAYLOAD}"'" + +SUPPORT_BUNDLE_URL=`kubectl exec -n longhorn-system svc/longhorn-frontend -- bash -c "${CURL_CMD}" | jq -r '.links.self + "/" + .name'` + +SUPPORT_BUNDLE_READY=false +while [[ ${SUPPORT_BUNDLE_READY} == false ]]; do + PERCENT=`kubectl exec -n longhorn-system svc/longhorn-frontend -- curl -H 'Accept: application/json' ${SUPPORT_BUNDLE_URL} | jq -r '.progressPercentage' || true` + echo ${PERCENT} + + if [[ ${PERCENT} == 100 ]]; then SUPPORT_BUNDLE_READY=true; fi +done + +kubectl exec -n longhorn-system svc/longhorn-frontend -- curl -H 'Accept-Encoding: gzip, deflate' ${SUPPORT_BUNDLE_URL}/download > ${SUPPORT_BUNDLE_FILE_NAME} diff --git a/pipelines/storage_network/scripts/longhorn-setup.sh b/pipelines/storage_network/scripts/longhorn-setup.sh new file mode 100755 index 0000000000..115d6ffbd1 --- /dev/null +++ b/pipelines/storage_network/scripts/longhorn-setup.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash + +set -x + +source pipelines/utilities/selinux_workaround.sh +source pipelines/utilities/install_csi_snapshotter.sh +source pipelines/utilities/create_aws_secret.sh +source pipelines/utilities/install_backupstores.sh +source pipelines/utilities/storage_network.sh +source pipelines/utilities/create_longhorn_namespace.sh +source pipelines/utilities/longhorn_manifest.sh +source pipelines/utilities/run_longhorn_test.sh + +# create and clean tmpdir +TMPDIR="/tmp/longhorn" +mkdir -p ${TMPDIR} +rm -rf "${TMPDIR}/" + +export LONGHORN_NAMESPACE="longhorn-system" +export LONGHORN_INSTALL_METHOD="manifest" + +set_kubeconfig_envvar(){ + export KUBECONFIG="${PWD}/pipelines/storage_network/terraform/k3s.yaml" +} + +main(){ + set_kubeconfig_envvar + + if [[ ${DISTRO} == "rhel" ]] || [[ ${DISTRO} == "rockylinux" ]] || [[ ${DISTRO} == "oracle" ]]; then + apply_selinux_workaround + fi + + # set debugging mode off to avoid leaking aws secrets to the 
logs. + # DON'T REMOVE! + set +x + create_aws_secret + set -x + + if [[ "${TF_VAR_thick_plugin}" == true ]]; then + deploy_multus_thick_plugin_daemonset + else + deploy_multus_thin_plugin_daemonset + fi + deploy_network_attachment_definition + + create_longhorn_namespace + install_backupstores + install_csi_snapshotter + + generate_longhorn_yaml_manifest + install_longhorn_by_manifest + + update_storage_network_setting + + run_longhorn_test +} + +main diff --git a/pipelines/storage_network/scripts/terraform-setup.sh b/pipelines/storage_network/scripts/terraform-setup.sh new file mode 100755 index 0000000000..0b7f9b3a8f --- /dev/null +++ b/pipelines/storage_network/scripts/terraform-setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -x + +terraform -chdir=pipelines/storage_network/terraform init +terraform -chdir=pipelines/storage_network/terraform apply -auto-approve -no-color + +NETWORK_INTERFACE_IDS=$(terraform -chdir=pipelines/storage_network/terraform output -json network_interface_ids | tr -d '"') +for id in ${NETWORK_INTERFACE_IDS}; do + aws ec2 modify-network-interface-attribute --network-interface-id "${id}" --no-source-dest-check +done + +exit $? diff --git a/pipelines/storage_network/terraform/data.tf b/pipelines/storage_network/terraform/data.tf new file mode 100644 index 0000000000..aeee2eae84 --- /dev/null +++ b/pipelines/storage_network/terraform/data.tf @@ -0,0 +1,52 @@ +locals { + aws_ami_sles_arch = var.arch == "amd64" ? "x86_64" : var.arch +} + +data "aws_ami" "aws_ami_sles" { + most_recent = true + owners = [var.aws_ami_sles_account_number] + name_regex = "^suse-sles-${var.os_distro_version}-v\\d+-hvm-ssd-${local.aws_ami_sles_arch}" +} + +# Generate template file for k3s server +data "template_file" "provision_k3s_server" { + template = var.k8s_distro_name == "k3s" ? file("${path.module}/user-data-scripts/provision_k3s_server.sh.tpl") : null + vars = { + k3s_cluster_secret = random_password.cluster_secret.result + k3s_server_public_ip = aws_eip.aws_eip[0].public_ip + k3s_version = var.k8s_distro_version + thick_plugin = var.thick_plugin + } +} + +# Generate template file for k3s agent +data "template_file" "provision_k3s_agent" { + template = var.k8s_distro_name == "k3s" ? file("${path.module}/user-data-scripts/provision_k3s_agent.sh.tpl") : null + vars = { + k3s_server_url = "https://${aws_eip.aws_eip[0].public_ip}:6443" + k3s_cluster_secret = random_password.cluster_secret.result + k3s_version = var.k8s_distro_version + thick_plugin = var.thick_plugin + } +} + +# Generate template file for flannel +data "template_file" "flannel" { + template = var.k8s_distro_name == "k3s" ? file("${path.module}/user-data-scripts/flannel.sh.tpl") : null + vars = { + N1 = aws_network_interface.instance_eth1[0].private_ip + N2 = aws_network_interface.instance_eth1[1].private_ip + N3 = aws_network_interface.instance_eth1[2].private_ip + mtu = var.mtu + } +} + +# Generate template file for routes +data "template_file" "routes" { + template = var.k8s_distro_name == "k3s" ? 
file("${path.module}/user-data-scripts/routes.sh.tpl") : null + vars = { + N1 = aws_network_interface.instance_eth1[0].private_ip + N2 = aws_network_interface.instance_eth1[1].private_ip + N3 = aws_network_interface.instance_eth1[2].private_ip + } +} diff --git a/pipelines/storage_network/terraform/main.tf b/pipelines/storage_network/terraform/main.tf new file mode 100644 index 0000000000..ce8a22ebeb --- /dev/null +++ b/pipelines/storage_network/terraform/main.tf @@ -0,0 +1,308 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 3.0" + } + } +} + +provider "aws" { + region = var.aws_region + access_key = var.aws_access_key + secret_key = var.aws_secret_key +} + +resource "random_string" "random_suffix" { + length = 8 + special = false + lower = true + upper = false +} + +resource "random_password" "cluster_secret" { + length = 64 + special = false +} + +resource "aws_vpc" "aws_vpc" { + cidr_block = "10.0.0.0/16" + + tags = { + Name = "${var.aws_vpc_name}-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_internet_gateway" "aws_igw" { + vpc_id = aws_vpc.aws_vpc.id + + tags = { + Name = "lh_igw-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_route_table" "aws_public_rt" { + depends_on = [ + aws_internet_gateway.aws_igw, + ] + + vpc_id = aws_vpc.aws_vpc.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.aws_igw.id + } + + tags = { + Name = "lh_aws_public_rt-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_subnet" "aws_subnet_1" { + vpc_id = aws_vpc.aws_vpc.id + availability_zone = "us-east-1c" + cidr_block = "10.0.1.0/24" + + tags = { + Name = "lh_subnet_1-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_subnet" "aws_subnet_2" { + vpc_id = aws_vpc.aws_vpc.id + availability_zone = "us-east-1c" + cidr_block = "10.0.2.0/24" + + tags = { + Name = "lh_subnet_2-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_route_table_association" "aws_subnet_1_rt_association" { + depends_on = [ + aws_subnet.aws_subnet_1, + aws_route_table.aws_public_rt + ] + + subnet_id = aws_subnet.aws_subnet_1.id + route_table_id = aws_route_table.aws_public_rt.id +} + +resource "aws_route_table_association" "aws_subnet_2_rt_association" { + depends_on = [ + aws_subnet.aws_subnet_2, + aws_route_table.aws_public_rt + ] + + subnet_id = aws_subnet.aws_subnet_2.id + route_table_id = aws_route_table.aws_public_rt.id +} + +resource "aws_security_group" "aws_secgrp" { + name = "lh_aws_secgrp" + description = "Allow all inbound traffic" + vpc_id = aws_vpc.aws_vpc.id + + ingress { + description = "Allow SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Allow all ports" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + tags = { + Name = "lh_aws_secgrp-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_key_pair" "aws_pair_key" { + key_name = format("%s_%s", "aws_key_pair", random_string.random_suffix.id) + public_key = file(var.aws_ssh_public_key_file_path) +} + +resource "aws_network_interface" "instance_eth0" { + subnet_id = aws_subnet.aws_subnet_1.id + security_groups = [aws_security_group.aws_secgrp.id] + + count = 
var.aws_instance_count + + tags = { + Name = "instance_eth0-${count.index}-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_instance" "aws_instance" { + depends_on = [ + aws_subnet.aws_subnet_1, + aws_subnet.aws_subnet_2, + aws_network_interface.instance_eth0 + ] + + ami = data.aws_ami.aws_ami_sles.id + instance_type = var.aws_instance_type + + count = var.aws_instance_count + + network_interface { + network_interface_id = aws_network_interface.instance_eth0[count.index].id + device_index = 0 + } + + root_block_device { + delete_on_termination = true + volume_size = var.aws_instance_root_block_device_size + } + + key_name = aws_key_pair.aws_pair_key.key_name + user_data = count.index == 0 ? data.template_file.provision_k3s_server.rendered : data.template_file.provision_k3s_agent.rendered + + tags = { + Name = "${var.aws_instance_name}-${count.index}-${random_string.random_suffix.id}" + DoNotDelete = "true" + Owner = var.resources_owner + } +} + +resource "aws_network_interface" "instance_eth1" { + depends_on = [ + aws_subnet.aws_subnet_1, + aws_subnet.aws_subnet_2, + aws_instance.aws_instance + ] + + subnet_id = aws_subnet.aws_subnet_2.id + security_groups = [aws_security_group.aws_secgrp.id] + + count = var.aws_instance_count + + attachment { + instance = aws_instance.aws_instance[count.index].id + device_index = 1 + } + + tags = { + Name = "instance_eth1-${count.index}-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_eip" "aws_eip" { + vpc = true + + count = var.aws_instance_count + + tags = { + Name = "aws_eip-${count.index}-${random_string.random_suffix.id}" + Owner = var.resources_owner + } +} + +resource "aws_eip_association" "aws_eip_assoc" { + depends_on = [ + aws_instance.aws_instance, + aws_eip.aws_eip + ] + + count = var.aws_instance_count + + network_interface_id = aws_network_interface.instance_eth0[count.index].id + allocation_id = aws_eip.aws_eip[count.index].id +} + +resource "null_resource" "rsync_kubeconfig_file" { + + depends_on = [ + aws_instance.aws_instance, + aws_eip.aws_eip, + aws_eip_association.aws_eip_assoc + ] + + provisioner "remote-exec" { + + inline = [ + "cloud-init status --wait", + "if [ \"`cloud-init status | grep error`\" ]; then sudo cat /var/log/cloud-init-output.log; fi", + "RETRY=0; MAX_RETRY=450; until([ -f /etc/rancher/k3s/k3s.yaml ] && [ `sudo /usr/local/bin/kubectl get node -o jsonpath='{.items[*].status.conditions}' | jq '.[] | select(.type == \"Ready\").status' | grep -ci true` -eq ${var.aws_instance_count} ]); do echo \"waiting for k3s cluster nodes to be running\"; sleep 2; if [ $RETRY -eq $MAX_RETRY ]; then break; fi; RETRY=$((RETRY+1)); done" + ] + + connection { + type = "ssh" + user = "ec2-user" + host = aws_eip.aws_eip[0].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + + provisioner "local-exec" { + command = "rsync -aPvz --rsync-path=\"sudo rsync\" -e \"ssh -o StrictHostKeyChecking=no -l ec2-user -i ${var.aws_ssh_private_key_file_path}\" ${aws_eip.aws_eip[0].public_ip}:/etc/rancher/k3s/k3s.yaml . 
&& sed -i 's#https://127.0.0.1:6443#https://${aws_eip.aws_eip[0].public_ip}:6443#' k3s.yaml" + } +} + +# setup flannel +resource "null_resource" "cluster_setup_flannel" { + count = var.aws_instance_count + + depends_on = [ + aws_instance.aws_instance, + null_resource.rsync_kubeconfig_file + ] + + provisioner "remote-exec" { + + inline = [data.template_file.flannel.rendered] + + connection { + type = "ssh" + user = "ec2-user" + host = aws_eip.aws_eip[count.index].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} + +# setup routes +resource "null_resource" "cluster_setup_routes" { + count = var.aws_instance_count + + depends_on = [ + aws_instance.aws_instance, + null_resource.cluster_setup_flannel + ] + + provisioner "remote-exec" { + + inline = [data.template_file.routes.rendered] + + connection { + type = "ssh" + user = "ec2-user" + host = aws_eip.aws_eip[count.index].public_ip + private_key = file(var.aws_ssh_private_key_file_path) + } + } + +} \ No newline at end of file diff --git a/pipelines/storage_network/terraform/output.tf b/pipelines/storage_network/terraform/output.tf new file mode 100644 index 0000000000..e5b811e065 --- /dev/null +++ b/pipelines/storage_network/terraform/output.tf @@ -0,0 +1,7 @@ +output "network_interface_ids" { + depends_on = [ + aws_network_interface.instance_eth0, + aws_network_interface.instance_eth1 + ] + value = join(" ", concat(aws_network_interface.instance_eth0[*].id, aws_network_interface.instance_eth1[*].id)) +} \ No newline at end of file diff --git a/pipelines/storage_network/terraform/user-data-scripts/flannel.sh.tpl b/pipelines/storage_network/terraform/user-data-scripts/flannel.sh.tpl new file mode 100644 index 0000000000..0e28086680 --- /dev/null +++ b/pipelines/storage_network/terraform/user-data-scripts/flannel.sh.tpl @@ -0,0 +1,19 @@ +#!/bin/bash +STORAGE_NETWORK_PREFIX="192.168" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +count=1 +for n in ${N1} ${N2} ${N3}; do + [[ $ETH1_IP != $n ]] && ((count=count+1)) && continue + + NET=$count + break +done + +cat << EOF | sudo tee -a /run/flannel/multus-subnet-$STORAGE_NETWORK_PREFIX.0.0.env +FLANNEL_NETWORK=$STORAGE_NETWORK_PREFIX.0.0/16 +FLANNEL_SUBNET=$STORAGE_NETWORK_PREFIX.$NET.0/24 +FLANNEL_MTU=${mtu} +FLANNEL_IPMASQ=true +EOF \ No newline at end of file diff --git a/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_agent.sh.tpl b/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_agent.sh.tpl new file mode 100755 index 0000000000..35b22c900b --- /dev/null +++ b/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_agent.sh.tpl @@ -0,0 +1,45 @@ +#!/bin/bash + +set -e + +sudo systemctl restart guestregister # Sometimes registration fails on first boot. 
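+# The packages below are Longhorn prerequisites: open-iscsi provides iscsid
+# for volume attachment and nfs-client is needed for RWX volumes and NFS
+# backup stores; the block after that formats the extra data disk and
+# bind-mounts it for /var/lib/longhorn and local-path-provisioner.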
+sudo zypper ref +sudo zypper install -y -t pattern devel_basis +sudo zypper install -y open-iscsi nfs-client +sudo systemctl -q enable iscsid +sudo systemctl start iscsid + +if [ -b "/dev/nvme1n1" ]; then + mkfs.ext4 -E nodiscard /dev/nvme1n1 + mkdir /mnt/sda1 + mount /dev/nvme1n1 /mnt/sda1 + + mkdir /mnt/sda1/local + mkdir /opt/local-path-provisioner + mount --bind /mnt/sda1/local /opt/local-path-provisioner + + mkdir /mnt/sda1/longhorn + mkdir /var/lib/longhorn + mount --bind /mnt/sda1/longhorn /var/lib/longhorn +elif [ -b "/dev/xvdh" ]; then + mkfs.ext4 -E nodiscard /dev/xvdh + mkdir /var/lib/longhorn + mount /dev/xvdh /var/lib/longhorn +fi + +# TODO: It looks like "set -e" will break the intended functionality of the remaining code. Consider a refactor. +set +e + +until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="agent --token ${k3s_cluster_secret}" K3S_URL="${k3s_server_url}" INSTALL_K3S_VERSION="${k3s_version}" sh -); do + echo 'k3s agent did not install correctly' + sleep 2 +done + +if [[ "${thick_plugin}" == true ]]; then + ln -s /var/lib/rancher/k3s/agent/etc/cni/net.d /etc/cni + ln -s /var/lib/rancher/k3s/data/current/bin /opt/cni +fi + +curl -OL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz +tar -zxvf cni-plugins-linux-amd64-v1.3.0.tgz +cp ipvlan /var/lib/rancher/k3s/data/current/bin/ \ No newline at end of file diff --git a/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_server.sh.tpl b/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_server.sh.tpl new file mode 100755 index 0000000000..f3533c959d --- /dev/null +++ b/pipelines/storage_network/terraform/user-data-scripts/provision_k3s_server.sh.tpl @@ -0,0 +1,53 @@ +#!/bin/bash + +set -e + +sudo systemctl restart guestregister # Sometimes registration fails on first boot. 
+sudo zypper ref +sudo zypper install -y -t pattern devel_basis +sudo zypper install -y open-iscsi nfs-client jq +sudo systemctl -q enable iscsid +sudo systemctl start iscsid + +if [ -b "/dev/nvme1n1" ]; then + mkfs.ext4 -E nodiscard /dev/nvme1n1 + mkdir /mnt/sda1 + mount /dev/nvme1n1 /mnt/sda1 + + mkdir /mnt/sda1/local + mkdir /opt/local-path-provisioner + mount --bind /mnt/sda1/local /opt/local-path-provisioner + + mkdir /mnt/sda1/longhorn + mkdir /var/lib/longhorn + mount --bind /mnt/sda1/longhorn /var/lib/longhorn +elif [ -b "/dev/xvdh" ]; then + mkfs.ext4 -E nodiscard /dev/xvdh + mkdir /var/lib/longhorn + mount /dev/xvdh /var/lib/longhorn +fi + +until (curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --tls-san ${k3s_server_public_ip} --write-kubeconfig-mode 644 --token ${k3s_cluster_secret}" INSTALL_K3S_VERSION="${k3s_version}" sh -); do + echo 'k3s server did not install correctly' + sleep 2 +done + +RETRY=0 +MAX_RETRY=180 +until (kubectl get pods -A | grep 'Running'); do + echo 'Waiting for k3s startup' + sleep 5 + if [ $RETRY -eq $MAX_RETRY ]; then + break + fi + RETRY=$((RETRY+1)) +done + +if [[ "${thick_plugin}" == true ]]; then + ln -s /var/lib/rancher/k3s/agent/etc/cni/net.d /etc/cni + ln -s /var/lib/rancher/k3s/data/current/bin /opt/cni +fi + +curl -OL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz +tar -zxvf cni-plugins-linux-amd64-v1.3.0.tgz +cp ipvlan /var/lib/rancher/k3s/data/current/bin/ \ No newline at end of file diff --git a/pipelines/storage_network/terraform/user-data-scripts/routes.sh.tpl b/pipelines/storage_network/terraform/user-data-scripts/routes.sh.tpl new file mode 100644 index 0000000000..f6585ede8e --- /dev/null +++ b/pipelines/storage_network/terraform/user-data-scripts/routes.sh.tpl @@ -0,0 +1,18 @@ +#!/bin/bash + +STORAGE_NETWORK_PREFIX="192.168" +ACTION="add" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +if [[ $ETH1_IP != ${N1} ]]; then + sudo ip r $ACTION $STORAGE_NETWORK_PREFIX.1.0/24 via ${N1} dev eth1 +fi + +if [[ $ETH1_IP != ${N2} ]]; then + sudo ip r $ACTION $STORAGE_NETWORK_PREFIX.2.0/24 via ${N2} dev eth1 +fi + +if [[ $ETH1_IP != ${N3} ]]; then + sudo ip r $ACTION $STORAGE_NETWORK_PREFIX.3.0/24 via ${N3} dev eth1 +fi \ No newline at end of file diff --git a/pipelines/storage_network/terraform/variables.tf b/pipelines/storage_network/terraform/variables.tf new file mode 100644 index 0000000000..3d807d04dc --- /dev/null +++ b/pipelines/storage_network/terraform/variables.tf @@ -0,0 +1,113 @@ +variable "aws_access_key" { + type = string + description = "AWS ACCESS_KEY" +} + +variable "aws_secret_key" { + type = string + description = "AWS SECRET_KEY" +} + +variable "aws_region" { + type = string + default = "us-east-1" +} + +variable "aws_availability_zone" { + type = string + default = "us-east-1a" +} + +variable "aws_vpc_name" { + type = string + default = "vpc-lh-storage-network-tests" +} + +variable "arch" { + type = string + description = "available values (amd64, arm64)" + default = "amd64" +} + +variable "os_distro_version" { + type = string + default = "15-sp5" +} + +variable "aws_ami_sles_account_number" { + type = string + default = "amazon" +} + +variable "aws_instance_count" { + type = number + default = 3 +} + +variable "aws_instance_type" { + type = string + description = "Recommended instance types t2.xlarge for amd64 & a1.xlarge for arm64" + default = "t2.xlarge" +} + +variable "aws_ssh_public_key_file_path" { + type 
= string + default = "~/.ssh/id_rsa.pub" +} + +variable "aws_ssh_private_key_file_path" { + type = string + default = "~/.ssh/id_rsa" +} + +variable "aws_instance_name" { + type = string + default = "lh-storage-network-tests" +} + +variable "aws_instance_root_block_device_size" { + type = number + default = 40 +} + +variable "k8s_distro_name" { + type = string + default = "k3s" + description = "kubernetes distro version to install [rke, k3s, rke2] (default: k3s)" +} + +variable "k8s_distro_version" { + type = string + default = "v1.27.1+k3s1" + description = <<-EOT + kubernetes version that will be deployed + rke: (default: v1.22.5-rancher1-1) + k3s: (default: v1.27.1+k3s1) + rke2: (default: v1.27.2+rke2r1) + EOT +} + +variable "resources_owner" { + type = string + default = "longhorn-infra" +} + +variable "cis_hardening" { + type = bool + default = false +} + +variable "mtu" { + type = string + default = "8951" +} + +variable "multus_version" { + type = string + default = "v4.0.2" +} + +variable "thick_plugin" { + type = bool + default = true +} \ No newline at end of file diff --git a/pipelines/utilities/storage_network.sh b/pipelines/utilities/storage_network.sh new file mode 100755 index 0000000000..c9e36b481d --- /dev/null +++ b/pipelines/utilities/storage_network.sh @@ -0,0 +1,52 @@ +deploy_multus_thin_plugin_daemonset(){ + curl -O "https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/${TF_VAR_multus_version}/deployments/multus-daemonset.yml" + sed -Ei 's@"kubeconfig":.+@"kubeconfig": "/var/lib/rancher/k3s/agent/etc/cni/net.d/multus.d/multus.kubeconfig"@g' multus-daemonset.yml + yq e -i 'select(.kind == "DaemonSet" and .metadata.name == "kube-multus-ds").spec.template.spec.containers[0].args += "--multus-kubeconfig-file-host=/var/lib/rancher/k3s/agent/etc/cni/net.d/multus.d/multus.kubeconfig"' multus-daemonset.yml + yq e -i "select(.kind == \"DaemonSet\" and .metadata.name == \"kube-multus-ds\").spec.template.spec.containers[0].image=\"ghcr.io/k8snetworkplumbingwg/multus-cni:${TF_VAR_multus_version}\"" multus-daemonset.yml + yq e -i "select(.kind == \"DaemonSet\" and .metadata.name == \"kube-multus-ds\").spec.template.spec.initContainers[0].image=\"ghcr.io/k8snetworkplumbingwg/multus-cni:${TF_VAR_multus_version}\"" multus-daemonset.yml + sed -Ei 's@path: /etc/cni/net.d@path: /var/lib/rancher/k3s/agent/etc/cni/net.d@g' multus-daemonset.yml + sed -Ei 's@path: /opt/cni/bin@path: /var/lib/rancher/k3s/data/current/bin@g' multus-daemonset.yml + kubectl apply -f multus-daemonset.yml +} + +deploy_multus_thick_plugin_daemonset(){ + curl -O https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/v4.0.2/deployments/multus-daemonset-thick.yml + yq e -i 'select(.kind == "DaemonSet" and .metadata.name == "kube-multus-ds").spec.template.spec.containers[0].volumeMounts += {"name": "cnibin", "mountPath": "/opt/cni/bin"}' multus-daemonset-thick.yml + yq e -i "select(.kind == \"DaemonSet\" and .metadata.name == \"kube-multus-ds\").spec.template.spec.containers[0].image=\"ghcr.io/k8snetworkplumbingwg/multus-cni:${TF_VAR_multus_version}-thick\"" multus-daemonset-thick.yml + yq e -i "select(.kind == \"DaemonSet\" and .metadata.name == \"kube-multus-ds\").spec.template.spec.initContainers[0].image=\"ghcr.io/k8snetworkplumbingwg/multus-cni:${TF_VAR_multus_version}-thick\"" multus-daemonset-thick.yml + kubectl apply -f multus-daemonset-thick.yml +} + +deploy_network_attachment_definition(){ +cat << EOF > nad-192-168-0-0.yaml +apiVersion: "k8s.cni.cncf.io/v1" +kind: 
NetworkAttachmentDefinition +metadata: + name: demo-192-168-0-0 + namespace: kube-system +spec: + config: '{ + "cniVersion": "0.3.1", + "type": "flannel", + "subnetFile": "/run/flannel/multus-subnet-192.168.0.0.env", + "dataDir": "/var/lib/cni/multus-subnet-192.168.0.0", + "delegate": { + "type": "ipvlan", + "master": "eth1", + "mode": "l3", + "capabilities": { + "ips": true + } + }, + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" + } + }' +EOF +kubectl apply -f nad-192-168-0-0.yaml +} + + +update_storage_network_setting(){ + kubectl -n longhorn-system patch -p '{"value": "kube-system/demo-192-168-0-0"}' --type=merge setting.longhorn.io/storage-network +} \ No newline at end of file From 66540f628136c2aad26288ad268a9272639a414f Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 15 Feb 2024 16:51:42 +0800 Subject: [PATCH 27/42] test: find backup should also wait until backup volume is synced Signed-off-by: Yang Chiu --- manager/integration/tests/common.py | 35 ++++++++++------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/manager/integration/tests/common.py b/manager/integration/tests/common.py index e47f12136d..47dd4daee9 100644 --- a/manager/integration/tests/common.py +++ b/manager/integration/tests/common.py @@ -3114,17 +3114,20 @@ def check_volume_endpoint(v): return endpoint +def find_backup_volume(client, volume_name): + bvs = client.list_backupVolume() + for bv in bvs: + if bv.name == volume_name and bv.created != "": + return bv + return None + + def wait_for_backup_volume_backing_image_synced( client, volume_name, backing_image, retry_count=RETRY_BACKUP_COUNTS): - def find_backup_volume(): - bvs = client.list_backupVolume() - for bv in bvs: - if bv.name == volume_name: - return bv - return None + completed = False for _ in range(retry_count): - bv = find_backup_volume() + bv = find_backup_volume(client, volume_name) assert bv is not None if bv.backingImageName == backing_image: completed = True @@ -3757,17 +3760,10 @@ def find_backup(client, vol_name, snap_name): been completed successfully """ - def find_backup_volume(): - bvs = client.list_backupVolume() - for bv in bvs: - if bv.name == vol_name and bv.created != "": - return bv - return None - bv = None for i in range(120): if bv is None: - bv = find_backup_volume() + bv = find_backup_volume(client, vol_name) if bv is not None: backups = bv.backupList().data for b in backups: @@ -5162,15 +5158,8 @@ def wait_for_instance_manager_desire_state(client, core_api, im_name, def wait_for_backup_delete(client, volume_name, backup_name): - def find_backup_volume(): - bvs = client.list_backupVolume() - for bv in bvs: - if bv.name == volume_name: - return bv - return None - def backup_exists(): - bv = find_backup_volume() + bv = find_backup_volume(client, volume_name) if bv is not None: backups = bv.backupList() for b in backups: From c80a8c01ad55dbb7c227377cb104109b70b0f134 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 6 Feb 2024 16:10:28 +0800 Subject: [PATCH 28/42] Add skeleton for test_engine_crash_during_live_upgrade ref: 7859 Signed-off-by: Chris --- manager/integration/tests/test_engine_upgrade.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/manager/integration/tests/test_engine_upgrade.py b/manager/integration/tests/test_engine_upgrade.py index 310b31e20e..31fc22859a 100644 --- a/manager/integration/tests/test_engine_upgrade.py +++ b/manager/integration/tests/test_engine_upgrade.py @@ -1196,3 +1196,19 @@ def 
test_engine_live_upgrade_while_replica_concurrent_rebuild(client, # NOQA for replica in volume2.replicas: assert replica.image == engine_upgrade_image assert replica.currentImage == engine_upgrade_image + +@pytest.mark.skip(reason="TODO") # NOQA +def test_engine_crash_during_live_upgrade(): + """ + 1. Create and attach a volume to a workload, then write data into the + volume. + 2. Deploy an extra engine image. + 3. Send live upgrade request then immediately delete the related engine + manager pod/engine process (The new replicas are not in active in this + case). + 4. Verify the workload will be restarted and the volume will be reattached + automatically. + 5. Verify the upgrade is done during the reattachment. + (It actually becomes offline upgrade.) + 6. Verify volume healthy and the data is correct. + """ From 4a3ac3c23f5aa2556467d3dd0f07d3d91772a4d1 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Tue, 27 Feb 2024 14:34:02 +0800 Subject: [PATCH 29/42] ci: fix build test image failed Signed-off-by: Yang Chiu --- manager/integration/Dockerfile | 2 +- manager/integration/tests/requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/manager/integration/Dockerfile b/manager/integration/Dockerfile index 8ce1837528..a6f5a218d1 100644 --- a/manager/integration/Dockerfile +++ b/manager/integration/Dockerfile @@ -6,7 +6,7 @@ ARG TERRAFORM_VERSION=1.3.5 ARG ARCH=amd64 RUN zypper ref -f -RUN zypper in -y vim-small nfs-client xfsprogs e2fsprogs util-linux-systemd gcc python39-devel gawk java-11-openjdk tar awk gzip wget unzip && \ +RUN zypper in -y vim-small nfs-client xfsprogs e2fsprogs util-linux-systemd gcc python311-devel gawk java-11-openjdk tar awk gzip wget unzip && \ rm -rf /var/cache/zypp/* RUN curl -sO https://storage.googleapis.com/kubernetes-release/release/$KUBECTL_VERSION/bin/linux/${ARCH}/kubectl && \ diff --git a/manager/integration/tests/requirements.txt b/manager/integration/tests/requirements.txt index 51fdbdb828..b0bb8357b5 100644 --- a/manager/integration/tests/requirements.txt +++ b/manager/integration/tests/requirements.txt @@ -4,11 +4,11 @@ argcomplete==1.10.0 directio==1.2 flake8 kubernetes==25.3.0 -pytest==5.3.1 +pytest==6.2.4 pytest-repeat==0.9.1 pytest-order==1.0.1 six==1.12.0 minio==5.0.10 -pyyaml==5.4.1 +pyyaml==6.0 pandas prometheus_client From 621023b783ea756dc7f26ed856cbbbf4063285db Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:38:31 +0000 Subject: [PATCH 30/42] chore(deps): update dependency boto3 to v1.34.50 --- e2e/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/requirements.txt b/e2e/requirements.txt index 2963efe495..346967acac 100644 --- a/e2e/requirements.txt +++ b/e2e/requirements.txt @@ -4,5 +4,5 @@ directio==1.3 flake8 kubernetes==27.2.0 requests==2.31.0 -boto3==1.34.49 +boto3==1.34.50 pyyaml==6.0.1 From f907ec81b4448c5267dbfe4181f2aeb279925ded Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 27 Feb 2024 22:20:57 +0800 Subject: [PATCH 31/42] fix(integration): flaky test_backuptarget_available_during_engine_image_not_ready Handles patch failure. 
Signed-off-by: Chin-Ya Huang --- manager/integration/tests/test_basic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index 619e5ee420..d6acfd7faa 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -101,6 +101,7 @@ from common import create_and_wait_deployment from common import get_custom_object_api_client from common import RETRY_COUNTS_SHORT +from common import scale_up_engine_image_daemonset from backupstore import backupstore_delete_volume_cfg_file from backupstore import backupstore_cleanup @@ -4426,10 +4427,7 @@ def test_backuptarget_available_during_engine_image_not_ready(client, apps_api): common.wait_for_backup_target_available(client, False) # Scale up the engine image DaemonSet - body = [{"op": "remove", - "path": "/spec/template/spec/nodeSelector/foo"}] - apps_api.patch_namespaced_daemon_set( - name=ds_name, namespace='longhorn-system', body=body) + scale_up_engine_image_daemonset(client) common.wait_for_backup_target_available(client, True) # Sleep 1 second to prevent the same time From feed006656a40dfe4dc61fd630e06504dd8f872b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Wed, 28 Feb 2024 14:11:28 +0000 Subject: [PATCH 32/42] chore(deps): update dependency boto3 to v1.34.51 --- e2e/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/requirements.txt b/e2e/requirements.txt index 346967acac..dc39de7348 100644 --- a/e2e/requirements.txt +++ b/e2e/requirements.txt @@ -4,5 +4,5 @@ directio==1.3 flake8 kubernetes==27.2.0 requests==2.31.0 -boto3==1.34.50 +boto3==1.34.51 pyyaml==6.0.1 From 4de0c00322813c619a519db23f0a202cee516d27 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 29 Feb 2024 15:07:58 +0800 Subject: [PATCH 33/42] ci: delete upgrade test pod Signed-off-by: Chris --- test_framework/scripts/longhorn-setup.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index 8f7e3aa873..db4519ad27 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -341,6 +341,9 @@ run_longhorn_upgrade_test(){ # get upgrade test junit xml report kubectl cp ${LONGHORN_UPGRADE_TEST_POD_NAME}:${LONGHORN_JUNIT_REPORT_PATH} "${TF_VAR_tf_workspace}/${LONGHORN_UPGRADE_TEST_POD_NAME}-junit-report.xml" -c longhorn-test-report + + # delete upgrade test pod + kubectl delete -f ${LONGHORN_UPGRADE_TESTS_MANIFEST_FILE_PATH} } From 59a7b889d90b9fca29f534982a3792fb37ea72f6 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 27 Feb 2024 16:21:58 +0800 Subject: [PATCH 34/42] Fix flaky test_volume_metrics ref: 7626 Signed-off-by: Chris --- manager/integration/tests/test_metric.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/manager/integration/tests/test_metric.py b/manager/integration/tests/test_metric.py index 5223c70abe..217fc24486 100644 --- a/manager/integration/tests/test_metric.py +++ b/manager/integration/tests/test_metric.py @@ -8,6 +8,7 @@ from common import client, core_api, volume_name # NOQA from common import delete_replica_processes +from common import check_volume_data from common import create_pv_for_volume from common import create_pvc_for_volume from common import create_snapshot @@ -194,6 +195,21 @@ def filter_metric_by_labels(metrics, labels): assert total_metrics["value"] 
>= 0.0 +def wait_for_metric_volume_actual_size(core_api, metric_name, metric_labels, actual_size): # NOQA + for _ in range(RETRY_COUNTS): + time.sleep(RETRY_INTERVAL) + + try: + check_metric(core_api, metric_name, + metric_labels, actual_size) + return + except AssertionError: + continue + + check_metric(core_api, metric_name, + metric_labels, actual_size) + + def wait_for_metric_count_all_nodes(client, core_api, metric_name, metric_labels, expected_count): # NOQA for _ in range(RETRY_COUNTS): time.sleep(RETRY_INTERVAL) @@ -271,7 +287,8 @@ def test_volume_metrics(client, core_api, volume_name, pvc_namespace): # NOQA volume = client.by_id_volume(volume_name) volume.attach(hostId=lht_hostId) volume = wait_for_volume_healthy(client, volume_name) - write_volume_random_data(volume) + data = write_volume_random_data(volume) + check_volume_data(volume, data) volume = client.by_id_volume(volume_name) actual_size = float(volume.controllers[0].actualSize) capacity_size = float(volume.size) @@ -284,8 +301,9 @@ def test_volume_metrics(client, core_api, volume_name, pvc_namespace): # NOQA } # check volume metric basic - check_metric(core_api, "longhorn_volume_actual_size_bytes", - metric_labels, actual_size) + wait_for_metric_volume_actual_size(core_api, + "longhorn_volume_actual_size_bytes", + metric_labels, actual_size) check_metric(core_api, "longhorn_volume_capacity_bytes", metric_labels, capacity_size) check_metric(core_api, "longhorn_volume_read_throughput", From 76f9406a3e50dd655e9b5e4a0bad5b4a3502b607 Mon Sep 17 00:00:00 2001 From: Roger Yao Date: Thu, 29 Feb 2024 17:45:09 +0800 Subject: [PATCH 35/42] test: fix flaky test case `test_replica_rebuild_per_volume_limit` Add wait_for_volume_healthy before deleting replicas to wait for volume rebuilding to complete. 
longhorn/longhorn#8011 Signed-off-by: Roger Yao --- manager/integration/tests/test_scheduling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/manager/integration/tests/test_scheduling.py b/manager/integration/tests/test_scheduling.py index e6ffacd7c5..c95ce6101e 100644 --- a/manager/integration/tests/test_scheduling.py +++ b/manager/integration/tests/test_scheduling.py @@ -463,6 +463,7 @@ def test_replica_rebuild_per_volume_limit(client, core_api, storage_class, sts_n vol = common.wait_for_volume_replicas_mode(client, vol_name, 'RW', replica_count=r_count) + wait_for_volume_healthy(client, vol_name) # Delete 4 volume replicas del vol.replicas[0] From 842a1d07d3a26c954a661337ede66a915a2f0bd8 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 1 Mar 2024 13:20:29 +0000 Subject: [PATCH 36/42] chore(deps): update docker/buildx-bin docker tag to v0.13 --- mirror_csi_images/Dockerfile.setup | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mirror_csi_images/Dockerfile.setup b/mirror_csi_images/Dockerfile.setup index b872981e90..23aa8e287a 100644 --- a/mirror_csi_images/Dockerfile.setup +++ b/mirror_csi_images/Dockerfile.setup @@ -6,6 +6,6 @@ WORKDIR $WORKSPACE RUN apk add --no-cache skopeo docker jq bash grep -COPY --from=docker/buildx-bin:v0.12 /buildx /usr/libexec/docker/cli-plugins/docker-buildx +COPY --from=docker/buildx-bin:v0.13 /buildx /usr/libexec/docker/cli-plugins/docker-buildx COPY [".", "$WORKSPACE"] From 9894979f9b6fe2d671b946704d03b640c69dad6c Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Fri, 1 Mar 2024 13:20:34 +0000 Subject: [PATCH 37/42] chore(deps): update terraform azurerm to v3.94.0 --- test_framework/terraform/azure/aks/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_framework/terraform/azure/aks/main.tf b/test_framework/terraform/azure/aks/main.tf index 368f1dd43e..748f0f6b06 100644 --- a/test_framework/terraform/azure/aks/main.tf +++ b/test_framework/terraform/azure/aks/main.tf @@ -2,7 +2,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.93.0" + version = "3.94.0" } } } From 922f7398e24ebbf3a52f3cb63c432c78975493fa Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Thu, 29 Feb 2024 15:50:21 +0800 Subject: [PATCH 38/42] test: fix flaky test case test_csi_volumesnapshot_restore_pre_provision_backing_image Signed-off-by: Yang Chiu --- manager/integration/tests/test_csi_snapshotter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/manager/integration/tests/test_csi_snapshotter.py b/manager/integration/tests/test_csi_snapshotter.py index 05493e5ce9..a6cc4f33bd 100644 --- a/manager/integration/tests/test_csi_snapshotter.py +++ b/manager/integration/tests/test_csi_snapshotter.py @@ -29,7 +29,6 @@ from common import make_deployment_with_pvc, apps_api # NOQA from common import check_pvc_in_specific_status # NOQA from common import wait_for_pvc_phase -from common import RETRY_COMMAND_COUNT from common import BACKING_IMAGE_QCOW2_URL, BACKING_IMAGE_QCOW2_CHECKSUM from common import BACKING_IMAGE_RAW_URL, BACKING_IMAGE_RAW_CHECKSUM from common import BACKING_IMAGE_SOURCE_TYPE_DOWNLOAD, RETRY_COUNTS_SHORT @@ -280,7 +279,7 @@ def get_volumesnapshotcontent(volumesnapshot_uid): def wait_volumesnapshot_deleted(name, namespace, - retry_counts=RETRY_COMMAND_COUNT, + retry_counts=RETRY_COUNTS, can_be_deleted=True): api = get_custom_object_api_client() api_group = 
"snapshot.storage.k8s.io" @@ -1349,8 +1348,7 @@ def finalizer(): delete_and_wait_pvc(core_api, restore_pvc_name) delete_volumesnapshot(csivolsnap_name, "default") wait_volumesnapshot_deleted(csivolsnap_name, - "default", - retry_counts=RETRY_COUNTS_SHORT) + "default") request.addfinalizer(finalizer) @@ -1661,7 +1659,6 @@ def finalizer(): delete_and_wait_pvc(core_api, pvc['metadata']['name']) delete_volumesnapshot(csivolsnap_name, "default") wait_volumesnapshot_deleted(csivolsnap_name, - "default", - retry_counts=RETRY_COUNTS_SHORT) + "default") request.addfinalizer(finalizer) From 50dfa923b971bb3e175b6a6fe9fdd0ad05f7e736 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Fri, 1 Mar 2024 14:14:03 +0800 Subject: [PATCH 39/42] test(manual): add v1.7.0 _index.md Signed-off-by: Chin-Ya Huang --- docs/content/manual/release-specific/v1.7.0/_index.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 docs/content/manual/release-specific/v1.7.0/_index.md diff --git a/docs/content/manual/release-specific/v1.7.0/_index.md b/docs/content/manual/release-specific/v1.7.0/_index.md new file mode 100644 index 0000000000..f4fc9aeb62 --- /dev/null +++ b/docs/content/manual/release-specific/v1.7.0/_index.md @@ -0,0 +1,3 @@ +--- +title: v1.7.0 +--- From f8f93b8bca7be5df113601ec44aaeb67d68c047c Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Fri, 1 Mar 2024 14:32:30 +0800 Subject: [PATCH 40/42] test(manual): restart Kubelet should not result in repeated event longhorn/longhorn-8072 Signed-off-by: Chin-Ya Huang --- ...st-kubelet-restart-no-pending-pod-event.md | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md diff --git a/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md b/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md new file mode 100644 index 0000000000..5021d855e0 --- /dev/null +++ b/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md @@ -0,0 +1,22 @@ +--- +Test: restarting Kubelet should not result in repeated "no Pending workload pods ..." event for the workload pod. +--- + +## Related issues + +- https://github.com/longhorn/longhorn/issues/8072 + +## Test step + +**Given** A [deployment](https://github.com/longhorn/longhorn/blob/master/examples/deployment.yaml) is created. + +**When** Kubelet on the node with attached volume of the deployment is restarted. +```bash +systemctl restart k3s-agent.service +``` + +**Then** Observe the events of the deployment pod. +``` +kubectl get events --field-selector involvedObject.name=${POD_NAME} -w +``` +**And** There are no recurring `no Pending workload pods for volume xxx to be mounted` events. From 9f08d73c13565738e9e15fac335e50db0356f5bc Mon Sep 17 00:00:00 2001 From: khushboo-rancher <60111667+khushboo-rancher@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:35:48 -0800 Subject: [PATCH 41/42] Update title test-kubelet-restart-no-pending-pod-event.md Updating the heading as title to have consistently across the test cases. Also, using different text breaks our automation to sync the test cases in Qase. 
Signed-off-by: khushboo-rancher <60111667+khushboo-rancher@users.noreply.github.com> --- .../v1.7.0/test-kubelet-restart-no-pending-pod-event.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md b/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md index 5021d855e0..2758cd8b67 100644 --- a/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md +++ b/docs/content/manual/release-specific/v1.7.0/test-kubelet-restart-no-pending-pod-event.md @@ -1,5 +1,5 @@ --- -Test: restarting Kubelet should not result in repeated "no Pending workload pods ..." event for the workload pod. +title: restarting Kubelet should not result in repeated "no Pending workload pods ..." event for the workload pod. --- ## Related issues From 1dbade38338d16e4686171a5a3d90a3ca9db4915 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 31 Jan 2024 11:22:48 +0800 Subject: [PATCH 42/42] ci: add mtls test ref: 3978 Signed-off-by: Chris --- test_framework/Jenkinsfile | 4 ++++ test_framework/scripts/longhorn-setup.sh | 9 ++++++++- test_framework/templates/longhorn-grpc-tls.yml | 10 ++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test_framework/templates/longhorn-grpc-tls.yml diff --git a/test_framework/Jenkinsfile b/test_framework/Jenkinsfile index 3f2c7bd508..3b1de30efa 100644 --- a/test_framework/Jenkinsfile +++ b/test_framework/Jenkinsfile @@ -23,6 +23,9 @@ def REGISTRY_URL def REGISTRY_USERNAME def REGISTRY_PASSWORD +// parameter for mTls +def ENABLE_MTLS = params.ENABLE_MTLS ? params.ENABLE_MTLS : false + // parameter for hdd test def USE_HDD = params.USE_HDD ? params.USE_HDD : false @@ -115,6 +118,7 @@ node { --env PYTEST_CUSTOM_OPTIONS="${PYTEST_CUSTOM_OPTIONS}" \ --env BACKUP_STORE_TYPE="${BACKUP_STORE_TYPE}" \ --env TF_VAR_use_hdd=${USE_HDD} \ + --env TF_VAR_enable_mtls=${ENABLE_MTLS} \ --env TF_VAR_arch=${ARCH} \ --env TF_VAR_k8s_distro_name=${K8S_DISTRO_NAME} \ --env TF_VAR_k8s_distro_version=${K8S_DISTRO_VERSION} \ diff --git a/test_framework/scripts/longhorn-setup.sh b/test_framework/scripts/longhorn-setup.sh index db4519ad27..99f2dcd29c 100755 --- a/test_framework/scripts/longhorn-setup.sh +++ b/test_framework/scripts/longhorn-setup.sh @@ -54,6 +54,11 @@ install_cluster_autoscaler(){ } +enable_mtls(){ + kubectl apply -f "${TF_VAR_tf_workspace}/templates/longhorn-grpc-tls.yml" -n ${LONGHORN_NAMESPACE} +} + + install_csi_snapshotter_crds(){ CSI_SNAPSHOTTER_REPO_URL="https://github.com/kubernetes-csi/external-snapshotter.git" CSI_SNAPSHOTTER_REPO_DIR="${TMPDIR}/k8s-csi-external-snapshotter" @@ -441,7 +446,9 @@ main(){ install_backupstores fi install_csi_snapshotter_crds - + if [[ "${TF_VAR_enable_mtls}" == true ]]; then + enable_mtls + fi if [[ "${AIR_GAP_INSTALLATION}" == true ]]; then if [[ "${LONGHORN_INSTALL_METHOD}" == "manifest-file" ]]; then create_registry_secret diff --git a/test_framework/templates/longhorn-grpc-tls.yml b/test_framework/templates/longhorn-grpc-tls.yml new file mode 100644 index 0000000000..cf612e29f9 --- /dev/null +++ b/test_framework/templates/longhorn-grpc-tls.yml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: longhorn-grpc-tls + namespace: longhorn-system +type: kubernetes.io/tls +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUREekNDQWZlZ0F3SUJBZ0lVTHN6YnlpMVN6dSs5UlJmZkl3TDJlNFkzSytzd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0ZqRVVNQklHQTFVRUF3d0xiRzl1WjJodmNtNHRZMkV3SUJjTk1qUXdNVEkyTVRjMU5EUTVXaGdQTXpBeQpNekExTWpreE56VTBORGxhTUJZeEZEQVNCZ05WQkFNTUMyeHZibWRvYjNKdUxXTmhNSUlCSWpBTkJna3Foa2lHCjl3MEJBUUVGQUFPQ0FROEFNSUlCQ2dLQ0FRRUFucGFFbG4xRjFhT2hzekZTcVJ0TUliNWdLTkNZSUhzRml4WGEKZTJiZ2hKRThQdUZ0bUdzekhlQVpBeWNueHM1L1J3cU1ieVBPbDFuL3FlU2RJMUg5QnMvNUQwZk1tUEFQMm94aQozZ3B2cXhRbzZwdE5PMGwxUnVBcmZmKytQKzNqd2RNdWpDMDdWVW9HZUpsbWoxNUpLbTZRQWJ1cURnejEyaDNjCmYvUzg5bWJWeXowZXMwMktTQnRqVm5RRTBlSVdGakg1SnVyVEU0bEJpT1hWbktHSUZnQXYzZ3pxeXZsdUo3VVgKUml5TC9UaVp1VS9aSnFtQlJpQyttWGpiUndlVTRvMW1mNGlrN1dPQXIzY2FNOUUzQVgvaDlMbzhYTXhDM1hqVAphdkZta2NnWXZhSlhicWhqak9VWVhlNmo1UmN6dnNUVk8wOXBsL3RlTld3Mkx4ZmFsUUlEQVFBQm8xTXdVVEFkCkJnTlZIUTRFRmdRVTNybmVhNVBoVFVzMVJCSG1ZT0lSdmdpR0p1Z3dId1lEVlIwakJCZ3dGb0FVM3JuZWE1UGgKVFVzMVJCSG1ZT0lSdmdpR0p1Z3dEd1lEVlIwVEFRSC9CQVV3QXdFQi96QU5CZ2txaGtpRzl3MEJBUXNGQUFPQwpBUUVBR3F4emxpdHVrVlFhUFZENmFZSC9xQ0IyRnh5WW1WU3pBVGJncGlwU3JrZHFBWmJRVUNQdXUvSnVMdGlSCjBuYXFicHkzZ2J6K0IzY1VPZlJjQWpxd2VQclpRMUVOTVF4TUZGZEJ2MG51Tko2TllFWWlKUEVhSFlhdE1IZlgKaXVndTZwcXNmZW56dlROMG1MeGx0eDBycVdXNnFiT1k4OGdVKzA1bXl2c0dTUjdWUldsQ2Yyc1FnQmtteWJHbgozSTBuaFFMVHd1N2Y2VkUrd21GeEhlUDl3OWN1Mk8wbFdMV1ZHTno1ZExybGdDcCsrdWttZDlMOFlPbW1tT3lVCkhVVm5rOGY5Ykk2NG9ENjNNS0M2UU83Kzk0ZnRETFBSRFZxVHBReE5pV25QOWl2M0lIVlQvUS93TkN5OVNYQUIKRzJ3Qm1nLzJ0eFY0S09HSHRCamxlb1BxcUE9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg== + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUZBekNDQSt1Z0F3SUJBZ0lCQVRBTkJna3Foa2lHOXcwQkFRc0ZBREFXTVJRd0VnWURWUVFEREF0c2IyNW4KYUc5eWJpMWpZVEFnRncweU5EQXhNall4TnpVNU16aGFHQTh6TURJek1EVXlPVEUzTlRrek9Gb3dHekVaTUJjRwpBMVVFQXd3UWJHOXVaMmh2Y200dFltRmphMlZ1WkRDQ0FTSXdEUVlKS29aSWh2Y05BUUVCQlFBRGdnRVBBRENDCkFRb0NnZ0VCQUpOcXpVVnlHUG8wK3ZoNHl1d0lHYkkydXZNRm4wc3ZaMmlBZjlGRmdHRmFTU25PbnAycElWajkKcVl0UjluM1JwT0lDdUhvS1hucmQ5OWJxZlpTRXBwam1tTTIvYXFMcWZPVkRLWkQ2eURkZ2FhQ3U3NWo4UHBoQgpKOGFHKzlUMGJzaEhHbDRRamZHb0wrK3ZtemxQdk9vWGcwMW5uN29IRjVWcmtjNmNMRm1qazRGM0J4Z0lGL25aCmNiVVRlMFV0anBMazRvV1RHNll6aUVzbTY0cEJHWkc0TzZidkZpWnZzeFlqSy83RFVHUEdHOS9GTUw2SC9RNXEKQ3NKMjNsbU5MdnJtOUNCb3pUTWcvbUpPcDVyOVVkdDdHbGExM1BEcG0rUEpwMVpreENmdlZOUzd2ZWtmUXM4ZAovbXlkQ2xRLzQ4RUZHTm0vVkluM1NXeUhZOUhXTEowQ0F3RUFBYU9DQWxNd2dnSlBNSUlDQ3dZRFZSMFJCSUlDCkFqQ0NBZjZDRUd4dmJtZG9iM0p1TFdKaFkydGxibVNDSUd4dmJtZG9iM0p1TFdKaFkydGxibVF1Ykc5dVoyaHYKY200dGMzbHpkR1Z0Z2lSc2IyNW5hRzl5YmkxaVlXTnJaVzVrTG14dmJtZG9iM0p1TFhONWMzUmxiUzV6ZG1PQwpFV3h2Ym1kb2IzSnVMV1p5YjI1MFpXNWtnaUZzYjI1bmFHOXliaTFtY205dWRHVnVaQzVzYjI1bmFHOXliaTF6CmVYTjBaVzJDSld4dmJtZG9iM0p1TFdaeWIyNTBaVzVrTG14dmJtZG9iM0p1TFhONWMzUmxiUzV6ZG1PQ0YyeHYKYm1kb2IzSnVMV1Z1WjJsdVpTMXRZVzVoWjJWeWdpZHNiMjVuYUc5eWJpMWxibWRwYm1VdGJXRnVZV2RsY2k1cwpiMjVuYUc5eWJpMXplWE4wWlcyQ0syeHZibWRvYjNKdUxXVnVaMmx1WlMxdFlXNWhaMlZ5TG14dmJtZG9iM0p1CkxYTjVjM1JsYlM1emRtT0NHR3h2Ym1kb2IzSnVMWEpsY0d4cFkyRXRiV0Z1WVdkbGNvSW9iRzl1WjJodmNtNHQKY21Wd2JHbGpZUzF0WVc1aFoyVnlMbXh2Ym1kb2IzSnVMWE41YzNSbGJZSXNiRzl1WjJodmNtNHRjbVZ3YkdsagpZUzF0WVc1aFoyVnlMbXh2Ym1kb2IzSnVMWE41YzNSbGJTNXpkbU9DREd4dmJtZG9iM0p1TFdOemFZSWNiRzl1CloyaHZjbTR0WTNOcExteHZibWRvYjNKdUxYTjVjM1JsYllJZ2JHOXVaMmh2Y200dFkzTnBMbXh2Ym1kb2IzSnUKTFhONWMzUmxiUzV6ZG1PQ0VHeHZibWRvYjNKdUxXSmhZMnRsYm1TSEJIOEFBQUV3SFFZRFZSME9CQllFRklkdwpxZlQ5WmxUcVQrYkk5QnhuYnJtS3V1R1BNQjhHQTFVZEl3UVlNQmFBRk42NTNtdVQ0VTFMTlVRUjVtRGlFYjRJCmhpYm9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUJBUUJ5UFdBTzlZbjI3Ym84QmgwbGVKWGNieUpDWjV1aGczREUKdzZuRU9rT3ZndCtkUXNYSDdqL0F1K3F0V0I1b0owY01aOVJjUkhEczZ4ZVp3S2Q3c1FxZE92dVJGUUZ3SW9tdgpDTGd4L1F6TzJ
ucDlQZnNGV253ODNILzM5N3pyNnpSd2thWXRSYlZISGNSbGd4c1orLzhjc2FVZVhXdEZvQWdkCnNMckpWR2IwTWdkc0s4RlJFa2JpUWJLZDd6YXg0RDdzQVFWaUVYMmw2NUpBOG5WcUx1U2ZsWENZNDZGUWs4RXEKT3hWdGFWeE00bS9hWW1tQkxOVklrakMvVVZzL1NadGxrRFNOQjFqaFlkVWkralZvYlZFZURNS0Jhakl1bzAxUwpVWDZXUCt2dEFWZEVVb1Nqc0dqZzRMTVlNWGhpUDlRMnZlK1dDOExCeGZBaHZIRUUzaGo3Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2Z0lCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktnd2dnU2tBZ0VBQW9JQkFRQ1RhczFGY2hqNk5QcjQKZU1yc0NCbXlOcnJ6Qlo5TEwyZG9nSC9SUllCaFdra3B6cDZkcVNGWS9hbUxVZlo5MGFUaUFyaDZDbDU2M2ZmVwo2bjJVaEthWTVwak52MnFpNm56bFF5bVErc2czWUdtZ3J1K1kvRDZZUVNmR2h2dlU5RzdJUnhwZUVJM3hxQy92CnI1czVUN3pxRjROTlo1KzZCeGVWYTVIT25DeFpvNU9CZHdjWUNCZjUyWEcxRTN0RkxZNlM1T0tGa3h1bU00aEwKSnV1S1FSbVJ1RHVtN3hZbWI3TVdJeXYrdzFCanhodmZ4VEMraC8wT2FnckNkdDVaalM3NjV2UWdhTTB6SVA1aQpUcWVhL1ZIYmV4cFd0ZHp3Nlp2anlhZFdaTVFuNzFUVXU3M3BIMExQSGY1c25RcFVQK1BCQlJqWnYxU0o5MGxzCmgyUFIxaXlkQWdNQkFBRUNnZ0VBUjVYTzdXQ3RHVjg5MmdmS1Bsam1wWUJuUXhqaFZDREdYZHc4ZFZLRE40TWcKMFNEcExGVlRnTVBaSDNMak9CRGtPTzRxUi9VZUVSc1Z2WExzSFlGVzV4dmZhdFgvZ2ZKTlNRVld1M1RVWWZPNwpCMUM3djdZSjdXU0NYS2p5eEdRWUljQkpZUkUzNUhnUUl4dkt6RWRZelBJekRCVDhYdGtQempySXVLUms4dmU3CnVNNkY0TE9tNEhtL0xIWlZteVNpNGhxQkhtSWEzS1diVEhGRGk5ODBqZm0vQjVORWNzV0sxSk96TW1DeS9lV0gKSU9jK0p4Nmk5dFk3YTliQ3ladlBzVFFOazV3dXlTaUQvMFloTVhBalBUVGNnRDlYL2xSRGtKRjVzejd5UXk4Ngpyemw0UU9QMXpSWG04Ykt6WUxCcFpxc2M1em4wcEdrTXJzd2ZXYmxXbndLQmdRREZhQWZQWXExRVdpMmd4WFE2ClFHZkRWQk1UK0pNSGIyZE4wL3k2NkVzS2huc2dEN2tFOVFqdm4vSnVCd3haRXZveDhPcHhzdFU0UjM2YmNHYnQKYUUyOTFyU3BDckpwK1R3OVVmamg0SlB3c1R0bjZvZXNjaVZHcDZGMzVlaFZTQlNnaVJ2L2hEdXhQaThwVWFRagpGS1FDbFhFTkliU3MwNy9oNFdYdzFjVmNYd0tCZ1FDL0xGVUdSSGl4bnNYelJsRkhITHZ1Znk1eSswSmNTcFVnCmFncFN3MFFNRE04VWpybnVIc3lxWXlBbkk4c1UyVXdJUVlFNkU5cEwrWWNVRVVrYUJsU2wyYkNibWVFVkpLZVkKOWlpUmwvejZ5T2Y1WlJ0Y09MdVhBRUtabTU4WVd0bFRaWGJvYnd3RVZNa1N0KzJNWml1SjZrQjlyMnRoOWxySwpMNG16SVRFWWd3S0JnUUNmTVA5clhGWUIwdjhNc1c3RE13RDZZYWhvNklJWTh0dkp4WFAvZmloVnVwRThEN0hTCnI0K2ZQY3NRczVwZmtwQTFDZVRsLzZNMm1XRWVGSXpNVXRxdWhxQjEyV3g3VFVRbzV4dmZlMjJTSWpxWDJHZkUKeHVBTWxFNEFGR1ZCc0xrQnBNL3hSRCttOVZDdTcybC82THRDWWlVaXc5V2hzYmtCZlBUcVBGbkYzUUtCZ0RidwpkSmJTZ3FUNDdnWlZ4UEhjemgxaUsyVWIxQnhWeXJsLy8rdDg5a2RJUHhLM1diT1c0bFp0R2tabFFPMkM3UmpLClNtcjRYWm5MNGdmZ1Y5UEUwZnEvcnNObzI0aUoraWc1UmJ0aHBIQWw0SlNKZSsxcTJHNHl3dkVHQ2hpanN5VUcKV2IrK2VnT2NvaFJoQzBGMzh6YzFQTWRoN0VoQTFpS1l1c2ZoMkF3bEFvR0JBTDBtOW9od1lhK3N0aytQTUI0SgpSaE1WeHNGUzlBRENXQ01jVHFrVktHQnRseVc3S1Q0ZVh4NGRFVUpYQktnaVJURVI1VCtyMzI4OVdEd05HWTIzCmFuN0dHTThCSHJ4WVdKdGtpOEFnNE1scHkvbS9YN1c4bkFjUjZpSGVVWEpPL21pa21ydjR4M0ZKODNJK2RUZlAKLy9QaU4rOFkyR1VHMGNYSzlsbFFaT0dKCi0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K \ No newline at end of file
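The longhorn-grpc-tls.yml template above carries a fixed sample CA and certificate pair. Where site-specific certificates are wanted instead, an equivalent secret can be assembled from local PEM files; the sketch below is only an illustration and assumes ca.crt, tls.crt, and tls.key already exist and already carry the service names Longhorn's gRPC endpoints expect. It mirrors the apply that enable_mtls() performs against the Longhorn namespace.

```bash
#!/bin/bash
# Sketch only: regenerate the longhorn-grpc-tls template from local PEM files
# instead of the checked-in sample certificate. ca.crt, tls.crt and tls.key
# are assumed to exist already; they are not created here.
kubectl create secret generic longhorn-grpc-tls \
  --namespace longhorn-system \
  --type kubernetes.io/tls \
  --from-file=ca.crt=ca.crt \
  --from-file=tls.crt=tls.crt \
  --from-file=tls.key=tls.key \
  --dry-run=client -o yaml > test_framework/templates/longhorn-grpc-tls.yml

# enable_mtls() then applies the template into the Longhorn namespace:
kubectl apply -f test_framework/templates/longhorn-grpc-tls.yml -n longhorn-system
```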