diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
index f9faa348dc..71aecd8a61 100644
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v4
 
     - name: Fetch gh-pages branch
       run: |
diff --git a/.github/workflows/qase-sync.yaml b/.github/workflows/qase-sync.yaml
new file mode 100644
index 0000000000..18ad5e9c48
--- /dev/null
+++ b/.github/workflows/qase-sync.yaml
@@ -0,0 +1,110 @@
+name: Qase Sync
+
+on:
+  push:
+    branches:
+    - master
+
+jobs:
+  qase-sync:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    - name: Fetch changed files
+      run: |
+        echo $(pwd)
+        files=($(git show --name-only --pretty=""))
+        echo ${files[@]}
+        # Each step runs in its own shell, so persist the list for later steps.
+        echo "FILES=${files[*]}" >> "$GITHUB_ENV"
+
+    - name: Filter changed test cases
+      run: |
+        files=(${FILES})
+        test_cases=()
+        for file in "${files[@]}"; do
+          if [[ "$file" == *"/manual/"* ]]; then
+            test_cases+=("$file")
+          fi
+        done
+        echo ${test_cases[@]}
+        echo "TEST_CASES=${test_cases[*]}" >> "$GITHUB_ENV"
+
+    - name: Create missing test suites
+      env:
+        token: ${{ secrets.QASE_TOKEN }}
+      run: |
+        for file_path in ${TEST_CASES}; do
+          test_case_path=$(echo "${file_path#*manual/}")
+          IFS='/' read -ra arr <<< "$test_case_path"
+          parent_suite_id=""
+          for str in "${arr[@]}"; do
+            echo $str
+            if [[ $str == *".md"* ]]; then
+              break
+            fi
+            str=${str//-/ } # replace - with whitespace
+            echo $str
+            if [[ $str =~ ^v.+\..+\..+$ ]]; then
+              : # skip v*.*.*
+            else
+              str=($str)
+              str="${str[@]^}" # capitalize every word
+            fi
+            # check if the test suite already exists
+            echo "check if ${str} exists"
+            res=$(curl --get --url "https://api.qase.io/v1/suite/LH" --data-urlencode "search=${str}" --header "Token: ${token}" --header "accept: application/json")
+            # if not, create new test suite
+            if [[ $(echo "$res" | jq .result.count) == "0" ]]; then
+              echo "create new test suite ${str} with parent id ${parent_suite_id}"
+              curl --request POST \
+                --url https://api.qase.io/v1/suite/LH \
+                --header "Token: ${token}" \
+                --header "accept: application/json" \
+                --header "content-type: application/json" \
+                --data "{ \"title\": \"${str}\", \"parent_id\": \"${parent_suite_id}\" }"
+            fi
+            # get parent suite id
+            res=$(curl --get --url "https://api.qase.io/v1/suite/LH" --data-urlencode "search=${str}" --header "Token: ${token}" --header "accept: application/json")
+            parent_suite_id=$(echo "$res" | jq '.result.entities[0].id')
+          done
+        done
+
+    - name: Create or update test cases
+      env:
+        token: ${{ secrets.QASE_TOKEN }}
+      run: |
+        # file paths from git show are relative to the repository root
+        for file_path in ${TEST_CASES}; do
+          title=$(grep '^title:' ${file_path} | sed 's/^title: *//; s/^"\(.*\)"$/\1/')
+          echo "title = ${title}"
+          description=$(sed -z 's/\n/\\n/g' ${file_path} | sed 's/ \\/ \\\\/g')
+          echo "description = ${description}"
+
+          res=$(curl --get --url "https://api.qase.io/v1/case/LH" --data-urlencode "search=${title}" --header "Token: ${token}" --header "accept: application/json")
+          if [[ "$(echo $res | jq .result.count)" == "1" ]]; then
+            # update existing test case
+            test_case_id=$(echo $res | jq '.result.entities[0].id')
+            curl --request PATCH \
+              --url "https://api.qase.io/v1/case/LH/${test_case_id}" \
+              --header "Token: ${token}" \
+              --header "accept: application/json" \
+              --header "content-type: application/json" \
+              --data "{ \"description\": \"${description}\", \"title\": \"${title}\" }"
+          else
+            # create new test case
+            parent_suite_name=$(basename $(dirname ${file_path}))
+            echo "parent_suite_name = ${parent_suite_name}"
+            if [[ "${parent_suite_name}" == "manual" ]]; then
+              parent_suite_id=""
+            else
+              # look up the suite created from the directory name (dashes were replaced with spaces)
+              suite_title=${parent_suite_name//-/ }
+              res=$(curl --get --url "https://api.qase.io/v1/suite/LH" --data-urlencode "search=${suite_title}" --header "Token: ${token}" --header "accept: application/json")
+              parent_suite_id=$(echo "$res" | jq '.result.entities[0].id')
+            fi
+            curl --request POST \
+              --url https://api.qase.io/v1/case/LH/ \
+              --header "Token: ${token}" \
+              --header "accept: application/json" \
+              --header "content-type: application/json" \
+              --data "{ \"description\": \"${description}\", \"title\": \"${title}\", \"suite_id\": \"${parent_suite_id}\" }"
+          fi
+        done
diff --git a/docs/content/manual/pre-release/backup-and-restore/concurrent-backup.md b/docs/content/manual/pre-release/backup-and-restore/concurrent-backup.md
index ec8661dc64..1adac16e81 100644
--- a/docs/content/manual/pre-release/backup-and-restore/concurrent-backup.md
+++ b/docs/content/manual/pre-release/backup-and-restore/concurrent-backup.md
@@ -3,4 +3,4 @@ title: "[#1341](https://github.com/longhorn/longhorn/issues/1341) concurrent bac
 ---
 - Take a manual backup of the volume `bak` while a recurring backup is running
 - verify that backup got created
-- verify that backup sticks around even when recurring backups are cleaned up
+- verify that backup sticks around even when recurring backups are cleaned up
diff --git a/docs/content/manual/release-specific/v1.8.0/fast-failover/test-rwx-fast-failover.md b/docs/content/manual/release-specific/v1.8.0/fast-failover/test-rwx-fast-failover.md
new file mode 100644
index 0000000000..5e4876a389
--- /dev/null
+++ b/docs/content/manual/release-specific/v1.8.0/fast-failover/test-rwx-fast-failover.md
@@ -0,0 +1,184 @@
+---
+title: RWX Fast Failover
+---
+
+## Related issues
+
+- https://github.com/longhorn/longhorn/issues/6205
+
+## LEP
+
+- https://github.com/longhorn/longhorn/pull/9069
+
+## Test Failover with I/O
+
+**Given** A Longhorn cluster with 3 worker nodes.
+
+**And** Enable the feature by setting `rwx-enable-fast-failover` to true.
+  Ensure that the setting `auto-delete-pod-when-volume-detached-unexpectedly` is set to its default value of true.
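+
+One way to toggle these settings from the command line is sketched below. The setting names come from this document; the `kubectl patch` approach assumes it is acceptable to edit the Longhorn `Setting` resources in `longhorn-system` directly (the Longhorn UI works just as well).
+
+```shell
+# Enable RWX fast failover.
+kubectl -n longhorn-system patch settings.longhorn.io rwx-enable-fast-failover \
+  --type merge -p '{"value": "true"}'
+
+# Confirm the unexpected-detach pod deletion setting is still at its default (true).
+kubectl -n longhorn-system get settings.longhorn.io auto-delete-pod-when-volume-detached-unexpectedly \
+  -o jsonpath='{.value}'
+```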
+
+**And** Deploy an RWX volume with the default storage class. Run an app pod with the RWX volume on each worker node, and execute the following command in each app pod
+
+  `( exec 7<>/data/testfile-${i}; flock -x 7; while date | dd conv=fsync >&7 ; do sleep 1; done )`
+
+  where `${i}` is the node number.
+
+**Then** Turn off or restart the node where the share-manager pod is running.
+
+**Verify** The share-manager pod is recreated on a different node.
+  - On the client side, I/O to the RWX volume hangs until a replacement share-manager pod is successfully created on another node.
+  - During the outage, the server rejects READ and WRITE operations and non-reclaim locking requests (i.e., other LOCK and OPEN operations) with the error NFS4ERR_GRACE.
+  - The new share-manager pod is created in under 20 seconds.
+  - The outage, including the grace period, should be less than 60 seconds.
+
+## Test Mount Options
+
+**Given** A Longhorn cluster with 3 worker nodes.
+
+**And** Enable the feature by setting `rwx-enable-fast-failover` to true.
+  Ensure that the setting `auto-delete-pod-when-volume-detached-unexpectedly` is set to its default value of true.
+
+**And** Create a custom storage class with the setting `nfsOptions: "hard,timeo=50,retrans=1"`
+
+```yaml
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: longhorn-test-hard
+provisioner: driver.longhorn.io
+allowVolumeExpansion: true
+reclaimPolicy: Delete
+volumeBindingMode: Immediate
+parameters:
+  numberOfReplicas: "3"
+  staleReplicaTimeout: "2880"
+  fromBackup: ""
+  fsType: "ext4"
+  nfsOptions: "hard,timeo=50,retrans=1"
+```
+
+**And** Use the deployment in the [example](https://github.com/longhorn/longhorn/blob/master/examples/rwx/rwx-nginx-deployment.yaml) with the custom storage class.
+
+**Then** Turn off the node where the share-manager pod is running.
+
+**Verify** The share-manager pod is recreated on a different node.
+  - The other active clients should not run into stale file handle errors after the failover.
+  - The new share-manager pod is created in under 20 seconds.
+  - The outage, including the grace period, should be less than 60 seconds.
+
+**Repeat** Using a different storage class with a soft NFS mount
+
+```yaml
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: longhorn-test-soft
+provisioner: driver.longhorn.io
+allowVolumeExpansion: true
+reclaimPolicy: Delete
+volumeBindingMode: Immediate
+parameters:
+  numberOfReplicas: "3"
+  staleReplicaTimeout: "2880"
+  fromBackup: ""
+  fsType: "ext4"
+  nfsOptions: "soft,timeo=250,retrans=5"
+```
+
+**Repeat** The mount option cases with `Automatically Delete Workload Pod when The Volume Is Detached Unexpectedly` disabled.
+
+## Test Resource Usage
+
+**Given** A Longhorn cluster with 3 worker nodes.
+
+**And** The default Longhorn storage class (with normal mount options; results should be independent of mount options).
+
+**And** `Enable RWX Fast Failover` set to true. `Automatically Delete Workload Pod when The Volume Is Detached Unexpectedly` also set to true.
+
+Make multiple deployments with a script such as the following (the manifest inside the here-document is truncated here; a complete per-iteration sketch follows the block):
+
+```shell
+#!/bin/bash
+
+for i in {1..60}; do
+  cat </data/testfile-$(hostname -s); flock -x 7; while date | dd conv=fsync >&7 ; do sleep 1; done"]
+    volumeMounts:
+    - mountPath: /data
+      name: rwx-volume-pvc-$i
+  volumes:
+  - name: rwx-volume-pvc-$i
+    persistentVolumeClaim:
+      claimName: rwx-volume-pvc-$i
+EOF
+done
+```
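+
+A minimal sketch of what each loop iteration could apply is shown below. The PVC name `rwx-volume-pvc-$i`, the mount path, and the writer command come from the fragments in this document; the Deployment name, labels, and `ubuntu:22.04` image are assumptions. Note that inside an unquoted here-document, `$(hostname -s)` has to be escaped as `\$(hostname -s)` so that it expands in the pod rather than on the host.
+
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: rwx-volume-pvc-$i
+spec:
+  accessModes:
+  - ReadWriteMany
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: rwx-writer-$i
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: rwx-writer-$i
+  template:
+    metadata:
+      labels:
+        app: rwx-writer-$i
+    spec:
+      containers:
+      - name: writer
+        image: ubuntu:22.04
+        command: ["/bin/bash", "-c", "exec 7<>/data/testfile-$(hostname -s); flock -x 7; while date | dd conv=fsync >&7 ; do sleep 1; done"]
+        volumeMounts:
+        - mountPath: /data
+          name: rwx-volume-pvc-$i
+      volumes:
+      - name: rwx-volume-pvc-$i
+        persistentVolumeClaim:
+          claimName: rwx-volume-pvc-$i
+```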
+
+**Then** With the `rwx-enable-fast-failover` setting off, check the total CPU and memory use for the longhorn-manager and longhorn-share-manager pods, using something like `kubectl -n longhorn-system top pod`. Establish baseline values.
+
+**Then** Set `rwx-enable-fast-failover` to true, and scale the deployments down and up so that they start updating and monitoring the leases.
+
+**Verify** that the CPU and memory use grows only by a small amount.
+
+Here is the expected outcome:
+
+| **Metric**                      | **Fast Failover Enabled** | **Fast Failover Disabled** | **Difference**                        |
+|---------------------------------|---------------------------|----------------------------|---------------------------------------|
+| **1. Number of API Requests**   | 59 req/s                  | 37.5 req/s                 | **+57.3%**                            |
+| **2. RPC Rate**                 | 57 ops/s                  | 37 ops/s                   | **+54.1%**                            |
+| **3. Memory Usage**             | Higher Peaks/Minima       | Lower Peaks/Minima         | More usage with Fast Failover Enabled |
+| **4. Longhorn Manager CPU/RAM** | 417MB / 0.13 CPU          | 405MB / 0.1 CPU            | **+3% RAM** / **+30% CPU**            |
+| **5. Share Manager CPU/RAM**    | 2.25GB / 0.26 CPU         | 2.2GB / 0.235 CPU          | **+2.3% RAM** / **+10.6% CPU**        |
+
+Ref. https://github.com/longhorn/longhorn/issues/6205#issuecomment-2262625965
+
+If a newer Longhorn version consumes noticeably more resources than these reference values, the test is considered failed.
+
+**If possible** Monitor the API server requests, similar to the method in the report https://github.com/longhorn/longhorn/blob/master/scalability/reference-setup-performance-scalability-and-sizing-guidelines/public-cloud/medium-node-spec.md#longhorn-control-plane-performance
+
+**Verify** that the API request rate remains low.
+
+**Reference** How to set up a Grafana Testing Environment on Rancher:
+https://github.com/longhorn/longhorn/issues/6205#issuecomment-2264430975
+
+## Test Multiple Simultaneous Failovers
+
+**Given** A Longhorn cluster with 3 worker nodes.
+
+**With** The same deployments as in `Test Resource Usage` (but perhaps only 20-30 of them), and fast failover enabled,
+
+**Then** Pick a node and restart it.
+
+**Verify** The share-manager pods on that node are recreated on the remaining nodes. One way to watch the failover timing is sketched after this list.
+  - Every RWX volume whose share-manager pod was on the failed node has its share-manager pod recreated on another node, and I/O resumes on its own after the shortened grace period.
+  - RWX volumes whose share-manager pods were not on the failed node should continue to operate without any disruption.
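+
+A small helper for the timing checks in the failover cases above (a sketch; it assumes the share-manager pods run in `longhorn-system` and are named `share-manager-<volume-name>`):
+
+```shell
+#!/bin/bash
+# Record which node hosts each share-manager pod before the restart.
+kubectl -n longhorn-system get pods -o wide | grep '^share-manager-'
+
+# During and after the restart, stream pod status changes with timestamps so the
+# "recreated in under 20 seconds" and "outage under 60 seconds" checks can be measured.
+kubectl -n longhorn-system get pods --watch \
+  | grep --line-buffered '^share-manager-' \
+  | while read -r line; do echo "$(date +%T) ${line}"; done
+```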