From f13566c63d2d96f0380aeef507e71ea00f0ed2eb Mon Sep 17 00:00:00 2001
From: Khushboo
Date: Wed, 4 Oct 2023 17:13:02 -0700
Subject: [PATCH 01/24] Add test cases for attach-detach issues

Signed-off-by: Khushboo
---
 .../_index.md                                 |  3 +
 ...hment-detachment-issues-reproducibility.md | 79 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/_index.md
 create mode 100644 docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/attachment-detachment-issues-reproducibility.md

diff --git a/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/_index.md b/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/_index.md
new file mode 100644
index 0000000000..9c33ce8ecc
--- /dev/null
+++ b/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/_index.md
@@ -0,0 +1,3 @@
+---
+title: Test cases to reproduce issues related to attach detach
+---
\ No newline at end of file
diff --git a/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/attachment-detachment-issues-reproducibility.md b/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/attachment-detachment-issues-reproducibility.md
new file mode 100644
index 0000000000..9216bc61f0
--- /dev/null
+++ b/docs/content/manual/Test-cases-to-reproduce-attach-detach-issues/attachment-detachment-issues-reproducibility.md
@@ -0,0 +1,79 @@
+---
+title: Test cases to reproduce attachment-detachment issues
+---
+**Prerequisite:** Have an environment with just 2 worker nodes, or taint 1 out of 3 worker nodes with `NoExecute` & `NoSchedule`.
+This constrains failover options and limits the room for recovery in the event of a failure.
+
+
+#### 1. Kill the engines and instance manager repeatedly
+**Given** 1 RWO and 1 RWX volume are attached to a pod.
+And Both volumes have 2 replicas.
+And Random data is continuously written to the volumes using the command `dd if=/dev/urandom of=file1 count=100 bs=1M conv=fsync status=progress oflag=direct,sync`
+
+**When** Rebuilding of one replica is triggered by crashing its instance manager (IM)
+AND Immediately afterwards, the IM associated with another replica is crashed
+AND After crashing the IMs, detaching the volume is attempted either by deleting the pod or via the Longhorn UI
+
+**Then** The volume should not get stuck in an attaching-detaching loop
+
+**When** The volume is detached and manually attached again.
+And The engine running on the node where the volume is attached is killed
+
+**Then** The volume should recover once the engine is back online.
+
+#### 2. Illegal values in Volume/Snap.meta
+**Given** 1 RWO and 1 RWX volume are attached to a pod.
+And Both volumes have 2 replicas.
+
+**When** Some random values are set in the volume/snapshot meta files
+And Replica rebuilding is triggered and the IM associated with another replica is also crashed
+
+**Then** The volume should not get stuck in an attaching-detaching loop
+
+
+#### 3. Deletion of Volume/Snap.meta
+**Given** 1 RWO and 1 RWX volume are attached to a pod.
+And Both volumes have 2 replicas.
+
+**When** The volume & snapshot meta files are deleted one by one.
+And Replica rebuilding is triggered and the IM associated with another replica is also crashed
+
+**Then** The volume should not get stuck in an attaching-detaching loop
+
+#### 4. Failed replica tries to rebuild from another just-crashed replica - https://github.com/longhorn/longhorn/issues/4212
+**Given** 1 RWO and 1 RWX volume are attached to a pod.
+And Both volumes have 2 replicas.
+And Random data is continuously written to the volumes using the command `dd if=/dev/urandom of=file1 count=100 bs=1M conv=fsync status=progress oflag=direct,sync`
+
+**When** Rebuilding of one replica is triggered by crashing its IM
+AND Immediately afterwards, the IM associated with another replica is crashed
+
+**Then** The volume should not get stuck in an attaching-detaching loop.
+
+#### 5. Volume attachment modification/deletion
+
+**Given** A Deployment and a StatefulSet are created with the same name and attached to Longhorn volumes.
+AND Some data is written and its md5sum is computed
+
+**When** The StatefulSet and Deployment are deleted without deleting the volumes
+And A new StatefulSet and Deployment with the same names are created with new PVCs.
+And Before these newly deployed workloads can attach to the volumes, the attached node is rebooted
+
+**Then** After the node reboot completes, the volumes should reflect the right status.
+And The newly created Deployment and StatefulSet should get attached to the volumes.
+
+**When** The volume attachments of the above workloads are deleted.
+And The above workloads are deleted and recreated immediately.
+
+**Then** No multi-attach or other errors should be observed.
+
+#### 6. Use monitoring/WordPress/db workloads
+**Given** Monitoring, WordPress, and other database-related workloads are deployed in the system
+And All the volumes have 2 replicas.
+And Random data is continuously written to the volumes using the command `dd if=/dev/urandom of=file1 count=100 bs=1M conv=fsync status=progress oflag=direct,sync`
+
+**When** Rebuilding of one replica is triggered by crashing its IM
+AND Immediately afterwards, the IM associated with another replica is crashed
+
+**Then** The volume should not get stuck in an attaching-detaching loop.
+
\ No newline at end of file
From 03c6a92bfc763aadc4e3c961f030beeadf15a848 Mon Sep 17 00:00:00 2001
From: Chin-Ya Huang
Date: Fri, 13 Oct 2023 18:15:14 +0800
Subject: [PATCH 02/24] test: fix `Rebuild` in volume.meta blocks engine start

ref: 6626

Signed-off-by: Chin-Ya Huang
---
 ...est-rebuild-in-meta-blocks-engine-start.md | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md

diff --git a/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md b/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md
new file mode 100644
index 0000000000..f81a56c604
--- /dev/null
+++ b/docs/content/manual/release-specific/v1.6.0/test-rebuild-in-meta-blocks-engine-start.md
@@ -0,0 +1,47 @@
+---
+title: Test `Rebuild` in volume.meta blocks engine start
+---
+
+## Related issue
+https://github.com/longhorn/longhorn/issues/6626
+
+## Test with patched image
+
+**Given** a patched longhorn-engine image with the following code change.
+```diff
+diff --git a/pkg/sync/sync.go b/pkg/sync/sync.go
+index b48ddd46..c4523f11 100644
+--- a/pkg/sync/sync.go
++++ b/pkg/sync/sync.go
+@@ -534,9 +534,9 @@ func (t *Task) reloadAndVerify(address, instanceName string, repClient *replicaC
+ 		return err
+ 	}
+
+-	if err := repClient.SetRebuilding(false); err != nil {
+-		return err
+-	}
++	// if err := repClient.SetRebuilding(false); err != nil {
++	// 	return err
++	// }
+ 	return nil
+ }
+```
+**And** a patched longhorn-instance-manager image with the longhorn-engine vendor updated.
+**And** Longhorn is installed with the patched images.
+**And** the `data-locality` setting is set to `disabled`.
+**And** the `auto-salvage` setting is set to `true`.
+**And** a new StorageClass is created with `NumberOfReplica` set to `1`. +**And** a StatefulSet is created with `Replica` set to `1`. +**And** the node of the StatefulSet Pod and the node of its volume Replica are different. This is necessary to trigger the rebuilding in reponse to the data locality setting update later. +**And** Volume have 1 running Replica. +**And** data exists in the volume. + +**When** the `data-locality` setting is set to `best-effort`. +**And** the replica rebuilding is completed. +**And** the `Rebuilding` in the replicas's `volume.meta` file is `true`. +**And** Delete the instance manager Pod of the Replica. + +**Then** the Replica should be running. +**And** the StatefulSet Pod should restart. +**And** the `Rebuilding` in replicas's `volume.meta` file should be `false`. +**And** the data should remain intact. From b21a0dac738798cdbf0125e5a5483c6cfb070ee3 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 23 Oct 2023 11:47:06 +0800 Subject: [PATCH 03/24] test: fix flaky test case test_default_storage_class_syncup since storage_api.read_storage_class is flaky, it should be wrapped in try-catch and for-loop Signed-off-by: Yang Chiu --- manager/integration/tests/test_basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/manager/integration/tests/test_basic.py b/manager/integration/tests/test_basic.py index de192f47e4..9e689bc3fb 100644 --- a/manager/integration/tests/test_basic.py +++ b/manager/integration/tests/test_basic.py @@ -4512,7 +4512,6 @@ def edit_configmap_allow_vol_exp(allow_exp): print(e) finally: time.sleep(RETRY_INTERVAL) - longhorn_storage_class = storage_api.read_storage_class("longhorn") assert longhorn_storage_class.allow_volume_expansion is allow_exp def finalizer(): From fcaeff7b3b13209ca599bd031e72f2aab351c435 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 24 Oct 2023 13:01:13 +0800 Subject: [PATCH 04/24] style(negative): cleanup imports Signed-off-by: Chin-Ya Huang --- e2e/libs/engine/crd.py | 2 ++ e2e/libs/engine/engine.py | 1 + e2e/libs/keywords/common_keywords.py | 4 +++- e2e/libs/keywords/node_keywords.py | 8 +++++--- e2e/libs/keywords/recurring_job_keywords.py | 8 +++----- e2e/libs/keywords/volume_keywords.py | 7 ++++--- e2e/libs/longhorn.py | 12 +++++++----- e2e/libs/node/node.py | 14 ++++++++------ e2e/libs/node_exec/node_exec.py | 7 +++++-- e2e/libs/recurring_job/crd.py | 5 +++-- e2e/libs/recurring_job/recurring_job.py | 1 + e2e/libs/recurring_job/rest.py | 9 +++++++-- e2e/libs/replica/crd.py | 7 +++++-- e2e/libs/replica/replica.py | 1 + e2e/libs/replica/rest.py | 2 ++ e2e/libs/utility/utility.py | 2 ++ e2e/libs/volume/crd.py | 10 ++++++---- e2e/libs/volume/rest.py | 9 ++++++--- e2e/libs/volume/volume.py | 7 +++++-- e2e/libs/workload/workload.py | 9 ++++++--- 20 files changed, 82 insertions(+), 43 deletions(-) diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py index 827c50248b..5b4ddef80b 100644 --- a/e2e/libs/engine/crd.py +++ b/e2e/libs/engine/crd.py @@ -1,8 +1,10 @@ import logging from engine.base import Base + from utils.common_utils import k8s_cr_api + class CRD(Base): def __init__(self): self.cr_api = k8s_cr_api() diff --git a/e2e/libs/engine/engine.py b/e2e/libs/engine/engine.py index 9908a07b5a..2003712917 100644 --- a/e2e/libs/engine/engine.py +++ b/e2e/libs/engine/engine.py @@ -1,5 +1,6 @@ from engine.base import Base from engine.crd import CRD + from strategy import LonghornOperationStrategy diff --git a/e2e/libs/keywords/common_keywords.py b/e2e/libs/keywords/common_keywords.py index 
3c9d55fa71..2dafabd997 100644 --- a/e2e/libs/keywords/common_keywords.py +++ b/e2e/libs/keywords/common_keywords.py @@ -1,6 +1,8 @@ -from utility.utility import init_k8s_api_client from node_exec import NodeExec +from utility.utility import init_k8s_api_client + + class common_keywords: def __init__(self): diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index 4fe06a41a1..ca3d9c40bb 100644 --- a/e2e/libs/keywords/node_keywords.py +++ b/e2e/libs/keywords/node_keywords.py @@ -1,9 +1,11 @@ -from utility.utility import get_test_pod_running_node -from utility.utility import get_node -from utility.utility import wait_for_all_instance_manager_running from robot.libraries.BuiltIn import BuiltIn + from node import Node +from utility.utility import get_node +from utility.utility import wait_for_all_instance_manager_running + + class node_keywords: def __init__(self): diff --git a/e2e/libs/keywords/recurring_job_keywords.py b/e2e/libs/keywords/recurring_job_keywords.py index f6e417aece..999ee40794 100644 --- a/e2e/libs/keywords/recurring_job_keywords.py +++ b/e2e/libs/keywords/recurring_job_keywords.py @@ -1,10 +1,8 @@ -from utility.utility import get_test_case_namespace, generate_volume_name -from utility.utility import get_node, list_nodes -from utility.utility import get_test_pod_running_node, get_test_pod_not_running_node -from utility.utility import logging -from robot.libraries.BuiltIn import BuiltIn from recurring_job import RecurringJob +from utility.utility import logging + + class recurring_job_keywords: def __init__(self): diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index a7b73281fc..192b1a303f 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -1,10 +1,11 @@ -from utility.utility import get_test_case_namespace, generate_volume_name +from utility.utility import logging +from utility.utility import generate_volume_name from utility.utility import get_node, list_nodes from utility.utility import get_test_pod_running_node, get_test_pod_not_running_node -from utility.utility import logging -from robot.libraries.BuiltIn import BuiltIn + from volume import Volume + class volume_keywords: def __init__(self): diff --git a/e2e/libs/longhorn.py b/e2e/libs/longhorn.py index 916a6ec252..aea50f2dd8 100644 --- a/e2e/libs/longhorn.py +++ b/e2e/libs/longhorn.py @@ -1,15 +1,17 @@ #!/usr/bin/env python from __future__ import print_function -import six -import re -import requests + import collections import hashlib -import os import json -import time import operator +import os +import re +import requests +import six +import time + from functools import reduce try: diff --git a/e2e/libs/node/node.py b/e2e/libs/node/node.py index 1d5e408607..98935bffcd 100644 --- a/e2e/libs/node/node.py +++ b/e2e/libs/node/node.py @@ -1,11 +1,13 @@ -from kubernetes import client -import yaml +import boto3 import time +import yaml + +from kubernetes import client + +from utility.utility import list_nodes from utility.utility import logging -from utility.utility import apply_cr_from_yaml, get_cr from utility.utility import wait_for_cluster_ready -from utility.utility import list_nodes -import boto3 + class Node: @@ -90,4 +92,4 @@ def wait_all_pods_evicted(self, node_name): time.sleep(RETRY_INTERVAL) - assert evicted, 'failed to evict pods' \ No newline at end of file + assert evicted, 'failed to evict pods' diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index 
2f27d82906..8d0ab27ae8 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -1,9 +1,12 @@ +import time + from kubernetes import client from kubernetes.stream import stream -import time + +from utility.utility import logging from utility.utility import wait_delete_pod from utility.utility import wait_delete_ns -from utility.utility import logging + DEFAULT_POD_TIMEOUT = 180 DEFAULT_POD_INTERVAL = 1 diff --git a/e2e/libs/recurring_job/crd.py b/e2e/libs/recurring_job/crd.py index b026bdf9c0..1b10fc8ce5 100644 --- a/e2e/libs/recurring_job/crd.py +++ b/e2e/libs/recurring_job/crd.py @@ -1,7 +1,8 @@ -from utility.utility import logging from recurring_job.base import Base from recurring_job.rest import Rest -from kubernetes import client + +from utility.utility import logging + class CRD(Base): diff --git a/e2e/libs/recurring_job/recurring_job.py b/e2e/libs/recurring_job/recurring_job.py index 96c799dbfe..9ee52f2347 100644 --- a/e2e/libs/recurring_job/recurring_job.py +++ b/e2e/libs/recurring_job/recurring_job.py @@ -1,6 +1,7 @@ from recurring_job.base import Base from recurring_job.crd import CRD from recurring_job.rest import Rest + from strategy import LonghornOperationStrategy diff --git a/e2e/libs/recurring_job/rest.py b/e2e/libs/recurring_job/rest.py index 05d257cda6..5d54ed2ac8 100644 --- a/e2e/libs/recurring_job/rest.py +++ b/e2e/libs/recurring_job/rest.py @@ -1,10 +1,15 @@ import time + +from datetime import datetime + from kubernetes import client + from recurring_job.base import Base -from utility.utility import get_longhorn_client + from utility.utility import filter_cr +from utility.utility import get_longhorn_client from utility.utility import logging -from datetime import datetime + RETRY_COUNTS = 180 RETRY_INTERVAL = 1 diff --git a/e2e/libs/replica/crd.py b/e2e/libs/replica/crd.py index c2024d2b8c..1d7c02937c 100644 --- a/e2e/libs/replica/crd.py +++ b/e2e/libs/replica/crd.py @@ -1,7 +1,10 @@ -from utility.utility import logging +from utils.common_utils import k8s_cr_api + from replica.base import Base from replica.rest import Rest -from utils.common_utils import k8s_cr_api + +from utility.utility import logging + class CRD(Base): def __init__(self, node_exec): diff --git a/e2e/libs/replica/replica.py b/e2e/libs/replica/replica.py index 5927d83e39..140b6443ee 100644 --- a/e2e/libs/replica/replica.py +++ b/e2e/libs/replica/replica.py @@ -1,5 +1,6 @@ from replica.base import Base from replica.crd import CRD + from strategy import LonghornOperationStrategy diff --git a/e2e/libs/replica/rest.py b/e2e/libs/replica/rest.py index 1eb2c391ca..6ace292c03 100644 --- a/e2e/libs/replica/rest.py +++ b/e2e/libs/replica/rest.py @@ -1,8 +1,10 @@ import time from replica.base import Base + from utils import common_utils + RETRY_COUNTS = 150 RETRY_INTERVAL = 1 diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 39125df9ad..52d75c7c4c 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -8,9 +8,11 @@ import socket import time import yaml + from robot.api import logger from robot.libraries.BuiltIn import BuiltIn + def logging(msg, also_report=False): if also_report: logger.info(msg, also_console=True) diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 2e45ca3ee9..1c9e4a155d 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -1,11 +1,13 @@ -import os import time -import warnings -from utility.utility import logging + +from kubernetes import client + from utility.utility import 
get_retry_count_and_interval +from utility.utility import logging + from volume.base import Base from volume.rest import Rest -from kubernetes import client + Ki = 2**10 Mi = 2**20 diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py index d81b29a979..9443bf9961 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -1,8 +1,11 @@ -from volume.base import Base +import os +import time + from utility.utility import get_longhorn_client from utility.utility import logging -import time -import os + +from volume.base import Base + RETRY_COUNTS = 150 RETRY_INTERVAL = 1 diff --git a/e2e/libs/volume/volume.py b/e2e/libs/volume/volume.py index 81be48edf8..bf88009cd6 100644 --- a/e2e/libs/volume/volume.py +++ b/e2e/libs/volume/volume.py @@ -1,8 +1,11 @@ +from node_exec import NodeExec + +from strategy import LonghornOperationStrategy + from volume.base import Base from volume.crd import CRD from volume.rest import Rest -from node_exec import NodeExec -from strategy import LonghornOperationStrategy + class Volume(Base): diff --git a/e2e/libs/workload/workload.py b/e2e/libs/workload/workload.py index 3a3370a65b..232aa961fd 100644 --- a/e2e/libs/workload/workload.py +++ b/e2e/libs/workload/workload.py @@ -1,10 +1,13 @@ +import time +import yaml + from kubernetes import client from kubernetes.client.rest import ApiException from kubernetes.stream import stream -import time -import yaml -from utility.utility import logging + from utility.utility import get_retry_count_and_interval +from utility.utility import logging + WAIT_FOR_POD_STABLE_MAX_RETRY = 90 From 4a1f06afbcf9d08fb144dbf851e9239d571079c6 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 24 Oct 2023 14:07:39 +0800 Subject: [PATCH 05/24] style(negative): adopt BDD to existing cases Signed-off-by: Chin-Ya Huang --- e2e/tests/cluster_restart.robot | 43 ++-- .../heavy_writing_and_recurring_jobs.robot | 48 ++-- e2e/tests/node_reboot.robot | 212 ++++++++++-------- e2e/tests/replica_rebuilding.robot | 30 +-- e2e/tests/replica_resiliency_test.robot | 18 +- 5 files changed, 191 insertions(+), 160 deletions(-) diff --git a/e2e/tests/cluster_restart.robot b/e2e/tests/cluster_restart.robot index d431db70c3..1fffb04c86 100644 --- a/e2e/tests/cluster_restart.robot +++ b/e2e/tests/cluster_restart.robot @@ -14,24 +14,27 @@ ${RETRY_INTERVAL} 1 *** Test Cases *** Restart Cluster While Workload Heavy Writing - Create deployment 0 with rwo volume - Create deployment 1 with rwx volume - Create deployment 2 with rwo and strict-local volume - Create statefulset 0 with rwo volume - Create statefulset 1 with rwx volume - Create statefulset 2 with rwo and strict-local volume + Given Create deployment 0 with rwo volume + And Create deployment 1 with rwx volume + And Create deployment 2 with rwo and strict-local volume + And Create statefulset 0 with rwo volume + And Create statefulset 1 with rwx volume + And Create statefulset 2 with rwo and strict-local volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to deployment 0 - Keep writing data to deployment 1 - Keep writing data to deployment 2 - Keep writing data to statefulset 0 - Keep writing data to statefulset 1 - Keep writing data to statefulset 2 - Restart cluster - Check deployment 0 works - Check deployment 1 works - Check deployment 2 works - Check statefulset 0 works - Check statefulset 1 works - Check statefulset 2 works - END \ No newline at end of file + And Keep writing data to deployment 0 + And Keep writing data to deployment 1 + And Keep writing data to deployment 2 + 
And Keep writing data to statefulset 0 + And Keep writing data to statefulset 1 + And Keep writing data to statefulset 2 + + When Restart cluster + + Then Check deployment 0 works + And Check deployment 1 works + And Check deployment 2 works + And Check statefulset 0 works + And Check statefulset 1 works + And Check statefulset 2 works + END diff --git a/e2e/tests/heavy_writing_and_recurring_jobs.robot b/e2e/tests/heavy_writing_and_recurring_jobs.robot index dc5aa28d3f..df21dacba5 100644 --- a/e2e/tests/heavy_writing_and_recurring_jobs.robot +++ b/e2e/tests/heavy_writing_and_recurring_jobs.robot @@ -15,31 +15,35 @@ ${RETRY_INTERVAL} 1 *** Test Cases *** Reboot Volume Node While Heavy Writing And Recurring Jobs Exist - Create volume 0 with 2 GB and 1 replicas - Create volume 1 with 2 GB and 3 replicas - Keep writing data to volume 0 - Keep Writing data to volume 1 - Create snapshot and backup recurring job for volume 0 - Create snapshot and backup recurring job for volume 1 + Given Create volume 0 with 2 GB and 1 replicas + And Create volume 1 with 2 GB and 3 replicas + And Keep writing data to volume 0 + And Keep Writing data to volume 1 + And Create snapshot and backup recurring job for volume 0 + And Create snapshot and backup recurring job for volume 1 + FOR ${i} IN RANGE ${LOOP_COUNT} - Reboot volume 0 volume node - Check recurring jobs for volume 0 work - Check recurring jobs for volume 1 work - Check volume 0 works - Check volume 1 works + When Reboot volume 0 volume node + + Then Check recurring jobs for volume 0 work + And Check recurring jobs for volume 1 work + And Check volume 0 works + And Check volume 1 works END Reboot Replica Node While Heavy Writing And Recurring Jobs Exist - Create volume 0 with 2 GB and 1 replicas - Create volume 1 with 2 GB and 3 replicas - Keep Writing data to volume 0 - Keep Writing data to volume 1 - Create snapshot and backup recurring job for volume 0 - Create snapshot and backup recurring job for volume 1 + Given Create volume 0 with 2 GB and 1 replicas + And Create volume 1 with 2 GB and 3 replicas + And Keep Writing data to volume 0 + And Keep Writing data to volume 1 + And Create snapshot and backup recurring job for volume 0 + And Create snapshot and backup recurring job for volume 1 + FOR ${i} IN RANGE ${LOOP_COUNT} - Reboot volume 1 replica node - Check recurring jobs for volume 0 work - Check recurring jobs for volume 1 work - Check volume 0 works - Check volume 1 works + When Reboot volume 1 replica node + + Then Check recurring jobs for volume 0 work + And Check recurring jobs for volume 1 work + And Check volume 0 works + And Check volume 1 works END diff --git a/e2e/tests/node_reboot.robot b/e2e/tests/node_reboot.robot index 912b10cea7..a22694a7c0 100644 --- a/e2e/tests/node_reboot.robot +++ b/e2e/tests/node_reboot.robot @@ -15,121 +15,139 @@ ${RETRY_INTERVAL} 1 *** Test Cases *** Reboot Node One By One While Workload Heavy Writing - Create deployment 0 with rwo volume - Create deployment 1 with rwx volume - Create deployment 2 with rwo and strict-local volume - Create statefulset 0 with rwo volume - Create statefulset 1 with rwx volume - Create statefulset 2 with rwo and strict-local volume + Given Create deployment 0 with rwo volume + And Create deployment 1 with rwx volume + And Create deployment 2 with rwo and strict-local volume + And Create statefulset 0 with rwo volume + And Create statefulset 1 with rwx volume + And Create statefulset 2 with rwo and strict-local volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to 
deployment 0 - Keep writing data to deployment 1 - Keep writing data to deployment 2 - Keep writing data to statefulset 0 - Keep writing data to statefulset 1 - Keep writing data to statefulset 2 - Reboot node 0 - Reboot node 1 - Reboot node 2 - Wait for longhorn ready - Check deployment 0 works - Check deployment 1 works - Check deployment 2 works - Check statefulset 0 works - Check statefulset 1 works - Check statefulset 2 works + And Keep writing data to deployment 0 + And Keep writing data to deployment 1 + And Keep writing data to deployment 2 + And Keep writing data to statefulset 0 + And Keep writing data to statefulset 1 + And Keep writing data to statefulset 2 + + When Reboot node 0 + And Reboot node 1 + And Reboot node 2 + And Wait for longhorn ready + + Then Check deployment 0 works + And Check deployment 1 works + And Check deployment 2 works + And Check statefulset 0 works + And Check statefulset 1 works + And Check statefulset 2 works END Power Off Node One By Once For More Than Pod Eviction Timeout While Workload Heavy Writing - Create deployment 0 with rwo volume - Create deployment 1 with rwx volume - Create deployment 2 with rwo and strict-local volume - Create statefulset 0 with rwo volume - Create statefulset 1 with rwx volume - Create statefulset 2 with rwo and strict-local volume + Given Create deployment 0 with rwo volume + And Create deployment 1 with rwx volume + And Create deployment 2 with rwo and strict-local volume + And Create statefulset 0 with rwo volume + And Create statefulset 1 with rwx volume + And Create statefulset 2 with rwo and strict-local volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to deployment 0 - Keep writing data to deployment 1 - Keep writing data to deployment 2 - Keep writing data to statefulset 0 - Keep writing data to statefulset 1 - Keep writing data to statefulset 2 - Power off node 0 for 6 mins - Power off node 1 for 6 mins - Power off node 2 for 6 mins - Wait for longhorn ready - Check deployment 0 works - Check deployment 1 works - Check deployment 2 works - Check statefulset 0 works - Check statefulset 1 works - Check statefulset 2 works + And Keep writing data to deployment 0 + And Keep writing data to deployment 1 + And Keep writing data to deployment 2 + And Keep writing data to statefulset 0 + And Keep writing data to statefulset 1 + And Keep writing data to statefulset 2 + + When Power off node 0 for 6 mins + And Power off node 1 for 6 mins + And Power off node 2 for 6 mins + And Wait for longhorn ready + + Then Check deployment 0 works + And Check deployment 1 works + And Check deployment 2 works + And Check statefulset 0 works + And Check statefulset 1 works + And Check statefulset 2 works END Reboot All Worker Nodes While Workload Heavy Writing - Create deployment 0 with rwo volume - Create deployment 1 with rwx volume - Create deployment 2 with rwo and strict-local volume - Create statefulset 0 with rwo volume - Create statefulset 1 with rwx volume - Create statefulset 2 with rwo and strict-local volume + Given Create deployment 0 with rwo volume + And Create deployment 1 with rwx volume + And Create deployment 2 with rwo and strict-local volume + And Create statefulset 0 with rwo volume + And Create statefulset 1 with rwx volume + And Create statefulset 2 with rwo and strict-local volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to deployment 0 - Keep writing data to deployment 1 - Keep writing data to deployment 2 - Keep writing data to statefulset 0 - Keep writing data to statefulset 1 - Keep 
writing data to statefulset 2 - Restart all worker nodes - Wait for longhorn ready - Check deployment 0 works - Check deployment 1 works - Check deployment 2 works - Check statefulset 0 works - Check statefulset 1 works - Check statefulset 2 works + And Keep writing data to deployment 0 + And Keep writing data to deployment 1 + And Keep writing data to deployment 2 + And Keep writing data to statefulset 0 + And Keep writing data to statefulset 1 + And Keep writing data to statefulset 2 + + When Restart all worker nodes + And Wait for longhorn ready + + Then Check deployment 0 works + And Check deployment 1 works + And Check deployment 2 works + And Check statefulset 0 works + And Check statefulset 1 works + And Check statefulset 2 works END Power Off All Worker Nodes For More Than Pod Eviction Timeout While Workload Heavy Writing - Create deployment 0 with rwo volume - Create deployment 1 with rwx volume - Create deployment 2 with rwo and strict-local volume - Create statefulset 0 with rwo volume - Create statefulset 1 with rwx volume - Create statefulset 2 with rwo and strict-local volume + Given Create deployment 0 with rwo volume + And Create deployment 1 with rwx volume + And Create deployment 2 with rwo and strict-local volume + And Create statefulset 0 with rwo volume + And Create statefulset 1 with rwx volume + And Create statefulset 2 with rwo and strict-local volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to deployment 0 - Keep writing data to deployment 1 - Keep writing data to deployment 2 - Keep writing data to statefulset 0 - Keep writing data to statefulset 1 - Keep writing data to statefulset 2 - Power off all worker nodes for 6 mins - Wait for longhorn ready - Check deployment 0 works - Check deployment 1 works - Check deployment 2 works - Check statefulset 0 works - Check statefulset 1 works - Check statefulset 2 works + And Keep writing data to deployment 0 + And Keep writing data to deployment 1 + And Keep writing data to deployment 2 + And Keep writing data to statefulset 0 + And Keep writing data to statefulset 1 + And Keep writing data to statefulset 2 + + When Power off all worker nodes for 6 mins + And Wait for longhorn ready + + Then Check deployment 0 works + And Check deployment 1 works + And Check deployment 2 works + And Check statefulset 0 works + And Check statefulset 1 works + And Check statefulset 2 works END Reboot Volume Node While Workload Heavy Writing - Create statefulset 0 with rwo volume + Given Create statefulset 0 with rwo volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to statefulset 0 - Reboot volume node of statefulset 0 - Wait for volume of statefulset 0 healthy - Wait for statefulset 0 stable - Check statefulset 0 works + And Keep writing data to statefulset 0 + + When Reboot volume node of statefulset 0 + And Wait for volume of statefulset 0 healthy + And Wait for statefulset 0 stable + + Then Check statefulset 0 works END Power Off Volume Node For More Than Pod Eviction Timeout While Workload Heavy Writing - Create statefulset 0 with rwo volume + Given Create statefulset 0 with rwo volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Keep writing data to statefulset 0 - Power off volume node of statefulset 0 for 6 mins - Wait for volume of statefulset 0 healthy - Wait for statefulset 0 stable - Check statefulset 0 works - END \ No newline at end of file + And Keep writing data to statefulset 0 + + When Power off volume node of statefulset 0 for 6 mins + And Wait for volume of statefulset 0 healthy + And Wait for statefulset 
0 stable + + Then Check statefulset 0 works + END diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index 821d0bca82..830b674d5b 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -14,21 +14,25 @@ ${RETRY_INTERVAL} 1 *** Test Cases *** Reboot Volume Node While Replica Rebuilding - Create a volume with 5 GB and 3 replicas - Write data to the volume + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Delete replica on volume node to trigger replica rebuilding - During replica rebuilding, reboot volume node - Wait until replica on volume node rebuilt - Check data is intact + When Delete replica on volume node to trigger replica rebuilding + And During replica rebuilding, reboot volume node + + Then Wait until replica on volume node rebuilt + And Check data is intact END Reboot Replica Node While Replica Rebuilding - Create a volume with 5 GB and 3 replicas - Write data to the volume + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Delete replica on replica node to trigger replica rebuilding - During replica rebuilding, reboot replica node - Wait until replica on replica node rebuilt - Check data is intact - END \ No newline at end of file + When Delete replica on replica node to trigger replica rebuilding + And During replica rebuilding, reboot replica node + + Then Wait until replica on replica node rebuilt + And Check data is intact + END diff --git a/e2e/tests/replica_resiliency_test.robot b/e2e/tests/replica_resiliency_test.robot index 88f3ceaeb2..26dc8ce5eb 100644 --- a/e2e/tests/replica_resiliency_test.robot +++ b/e2e/tests/replica_resiliency_test.robot @@ -13,12 +13,14 @@ ${RETRY_INTERVAL} 1 *** Test Cases *** Delete Replica While Replica Rebuilding - Create a volume with 2 GB and 3 replicas - Write data to the volume + Given Create a volume with 2 GB and 3 replicas + And Write data to the volume + FOR ${i} IN RANGE ${LOOP_COUNT} - Delete replica 0 to trigger replica 0 rebuilding - During replica 0 rebuilding, delete replica 1 - Wait until replica 0 rebuilt, delete replica 2 - Check data is intact - Wait until all replicas rebuilt - END \ No newline at end of file + When Delete replica 0 to trigger replica 0 rebuilding + And During replica 0 rebuilding, delete replica 1 + And Wait until replica 0 rebuilt, delete replica 2 + + Then Check data is intact + And Wait until all replicas rebuilt + END From 152077ad7f03a06cb221d65a813de4211c79ce9c Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 24 Oct 2023 12:37:11 +0800 Subject: [PATCH 06/24] test(negative): base work for CPU/memory stress cases ref: 6705 Signed-off-by: Chin-Ya Huang --- e2e/keywords/common.resource | 2 ++ e2e/libs/keywords/node_keywords.py | 5 +++++ e2e/libs/node/__init__.py | 3 ++- e2e/libs/node/stress.py | 12 ++++++++++++ e2e/libs/workload/pod.py | 6 ++++++ 5 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 e2e/libs/node/stress.py create mode 100644 e2e/libs/workload/pod.py diff --git a/e2e/keywords/common.resource b/e2e/keywords/common.resource index 078991263a..e978ddd0fc 100644 --- a/e2e/keywords/common.resource +++ b/e2e/keywords/common.resource @@ -2,6 +2,7 @@ Documentation Common keywords Library ../libs/keywords/common_keywords.py +Library ../libs/keywords/node_keywords.py Library ../libs/keywords/volume_keywords.py Library ../libs/keywords/recurring_job_keywords.py Library 
../libs/keywords/workload_keywords.py @@ -24,6 +25,7 @@ Set test environment Cleanup test resources cleanup_node_exec + cleanup_stress_helper cleanup_recurring_jobs ${volume_list} cleanup_volumes ${volume_list} cleanup_deployments ${deployment_list} diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index ca3d9c40bb..898c055e6c 100644 --- a/e2e/libs/keywords/node_keywords.py +++ b/e2e/libs/keywords/node_keywords.py @@ -1,6 +1,7 @@ from robot.libraries.BuiltIn import BuiltIn from node import Node +from node import Stress from utility.utility import get_node from utility.utility import wait_for_all_instance_manager_running @@ -10,6 +11,7 @@ class node_keywords: def __init__(self): self.node = Node() + self.stress = Stress() def reboot_volume_node(self, volume_name): volume_keywords = BuiltIn().get_library_instance('volume_keywords') @@ -36,3 +38,6 @@ def reboot_node_by_name(self, node_name, power_off_time_in_min=1): def wait_for_all_instance_manager_running(self): wait_for_all_instance_manager_running() + + def cleanup_stress_helper(self): + self.stress.cleanup() diff --git a/e2e/libs/node/__init__.py b/e2e/libs/node/__init__.py index 667816bd05..6b60b2f5a4 100644 --- a/e2e/libs/node/__init__.py +++ b/e2e/libs/node/__init__.py @@ -1 +1,2 @@ -from node.node import Node \ No newline at end of file +from node.node import Node +from node.stress import Stress diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py new file mode 100644 index 0000000000..6d599c7230 --- /dev/null +++ b/e2e/libs/node/stress.py @@ -0,0 +1,12 @@ +from utility.utility import logging + +from workload.pod import delete_pod +from workload.workload import get_workload_pods + +LABEL_STRESS_HELPER = "longhorn-stress-helper" + +class Stress: + def cleanup(self): + for pod in get_workload_pods(LABEL_STRESS_HELPER): + logging(f"Cleaning up stress pod {pod.metadata.name}") + delete_pod(pod.metadata.name, pod.metadata.namespace) diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py new file mode 100644 index 0000000000..48e218e8de --- /dev/null +++ b/e2e/libs/workload/pod.py @@ -0,0 +1,6 @@ +from kubernetes import client + + +def delete_pod(name, namespace='default'): + core_api = client.CoreV1Api() + core_api.delete_namespaced_pod(name=name, namespace=namespace) From 2077f14756db0d786eebad14313209fcadd5d49a Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 24 Oct 2023 12:25:20 +0800 Subject: [PATCH 07/24] test(negative): stress volume node CPU while replica rebuilding ref: 6705 Signed-off-by: Chin-Ya Huang --- e2e/keywords/node.resource | 3 + e2e/libs/keywords/node_keywords.py | 5 ++ e2e/libs/node/stress.py | 27 +++++++++ e2e/libs/node/utility.py | 9 +++ e2e/libs/utility/utility.py | 8 ++- e2e/libs/workload/pod.py | 91 ++++++++++++++++++++++++++++++ e2e/tests/replica_rebuilding.robot | 12 ++++ 7 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 e2e/libs/node/utility.py diff --git a/e2e/keywords/node.resource b/e2e/keywords/node.resource index 048b4459ef..98d40aef01 100644 --- a/e2e/keywords/node.resource +++ b/e2e/keywords/node.resource @@ -64,3 +64,6 @@ Restart cluster FOR ${statefulset} IN @{statefulset_list} wait_for_workload_pod_stable ${statefulset} END + +During replica rebuilding, stress volume node cpu + stress_node_cpu_by_volume ${volume_name} diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index 898c055e6c..8a8c161ae6 100644 --- a/e2e/libs/keywords/node_keywords.py +++ b/e2e/libs/keywords/node_keywords.py @@ 
-41,3 +41,8 @@ def wait_for_all_instance_manager_running(self): def cleanup_stress_helper(self): self.stress.cleanup() + + def stress_node_cpu_by_volume(self, volume_name): + volume_keywords = BuiltIn().get_library_instance('volume_keywords') + volume_node = volume_keywords.get_volume_node(volume_name) + self.stress.cpu([volume_node]) diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index 6d599c7230..08e6ee10d8 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -1,8 +1,19 @@ +from kubernetes import client + +from node.utility import get_node_cpu_cores + from utility.utility import logging +from workload.pod import create_pod from workload.pod import delete_pod +from workload.pod import new_pod_manifest from workload.workload import get_workload_pods +from workload.pod import IMAGE_LITMUX + +NODE_CPU_LOAD_PERCENTAGE = 100 +NODE_STRESS_TIMEOUT_SECOND = 300 + LABEL_STRESS_HELPER = "longhorn-stress-helper" class Stress: @@ -10,3 +21,19 @@ def cleanup(self): for pod in get_workload_pods(LABEL_STRESS_HELPER): logging(f"Cleaning up stress pod {pod.metadata.name}") delete_pod(pod.metadata.name, pod.metadata.namespace) + + def cpu(self, node_names): + for node_name in node_names: + manifest = new_pod_manifest( + image=IMAGE_LITMUX, + command=["stress-ng"], + args=['--cpu', str(get_node_cpu_cores(node_name)), + '--cpu-load', str(NODE_CPU_LOAD_PERCENTAGE), + '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], + node_name=node_name, + labels={'app': LABEL_STRESS_HELPER} + ) + + pod_name = manifest['metadata']['name'] + logging(f"Creating cpu stress pod {pod_name} on {node_name}") + create_pod(manifest, is_wait_for_pod_running=True) diff --git a/e2e/libs/node/utility.py b/e2e/libs/node/utility.py new file mode 100644 index 0000000000..571b983b6f --- /dev/null +++ b/e2e/libs/node/utility.py @@ -0,0 +1,9 @@ +from kubernetes import client + +def get_node_by_name(node_name): + core_api = client.CoreV1Api() + return core_api.read_node(node_name) + +def get_node_cpu_cores(node_name): + node = get_node_by_name(node_name) + return node.status.capacity['cpu'] diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 52d75c7c4c..f05085f959 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -24,11 +24,15 @@ def get_retry_count_and_interval(): retry_interval = int(BuiltIn().get_variable_value("${RETRY_INTERVAL}")) return retry_count, retry_interval -def generate_volume_name(): - return "vol-" + \ +def generate_name(name_prefix="test-"): + return name_prefix + \ ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6)) +def generate_volume_name(): + return generate_name("vol-") + + def init_k8s_api_client(): if os.getenv('LONGHORN_CLIENT_URL'): # for develop or debug, run test in local environment diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index 48e218e8de..274067441f 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -1,6 +1,97 @@ +import time + from kubernetes import client +from robot.libraries.BuiltIn import BuiltIn + +from utility.utility import logging +from utility.utility import generate_name + + +IMAGE_BUSYBOX = 'busybox:1.34.0' +IMAGE_LITMUX = 'litmuschaos/go-runner:latest' + +def new_pod_manifest(image="", command=[], args=[], + claim_name="", node_name="", labels={}): + # Set default image and args + if image is None: + image = IMAGE_BUSYBOX + args = [ + '/bin/sh', '-c', + 'while true; do date; sleep 5; done' + ] + + manifest = { + 'apiVersion': 'v1', + 'kind': 
'Pod', + 'metadata': { + 'name': generate_name(), + 'namespace': 'default', + 'labels': labels + }, + 'spec': { + 'nodeName': node_name, + 'containers': [{ + 'image': image, + 'imagePullPolicy': 'IfNotPresent', + 'name': 'run', + 'command': command, + 'args': args, + 'volumeMounts': [], + }], + 'volumes': [] + } + } + + if claim_name != "": + manifest['spec']['volumes'].append({ + 'name': 'pod-data', + 'persistentVolumeClaim': { + 'claimName': claim_name + } + }) + + manifest['spec']['containers'][0]['volumeMounts'].append({ + 'name': 'pod-data', + 'mountPath': '/data' + }) + + return manifest + +def create_pod(manifest, is_wait_for_pod_running=False): + core_api = client.CoreV1Api() + + name = manifest['metadata']['name'] + namespace = manifest['metadata']['namespace'] + + core_api.create_namespaced_pod(body=manifest, namespace=namespace) + + if is_wait_for_pod_running: + wait_for_pod_status(name, 'Running', namespace=namespace) + + return get_pod(name, namespace=namespace) def delete_pod(name, namespace='default'): core_api = client.CoreV1Api() core_api.delete_namespaced_pod(name=name, namespace=namespace) + +def get_pod(name, namespace='default'): + core_api = client.CoreV1Api() + return core_api.read_namespaced_pod(name=name, namespace=namespace) + +def wait_for_pod_status(name, status, namespace='default', + retry_count=int(BuiltIn().get_variable_value("${RETRY_COUNT}")), + retry_interval=int(BuiltIn().get_variable_value("${RETRY_INTERVAL}"))): + is_running = False + for i in range(retry_count): + pod = get_pod(name, namespace) + + logging(f"Waiting for pod {name} status {status}, current status {pod.status.phase} ({i}) ...") + + if pod.status.phase == status: + is_running = True + break + + time.sleep(retry_interval) + + assert is_running diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index 830b674d5b..f42ee8079b 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -36,3 +36,15 @@ Reboot Replica Node While Replica Rebuilding Then Wait until replica on replica node rebuilt And Check data is intact END + +Stress Volume Node CPU While Replica Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And During replica rebuilding, stress volume node cpu + + Then Wait until replica on volume node rebuilt + And Check data is intact + END From ac0f6ca2df7e4ad6a41a63a4f2959b6b65759a9f Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Tue, 24 Oct 2023 09:03:43 +0800 Subject: [PATCH 08/24] test(negative): stress volume node memory while replica rebuilding ref: 6705 Signed-off-by: Chin-Ya Huang --- e2e/keywords/node.resource | 3 +++ e2e/libs/keywords/node_keywords.py | 5 +++++ e2e/libs/node/stress.py | 18 ++++++++++++++++++ e2e/tests/replica_rebuilding.robot | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/e2e/keywords/node.resource b/e2e/keywords/node.resource index 98d40aef01..59f2b3313f 100644 --- a/e2e/keywords/node.resource +++ b/e2e/keywords/node.resource @@ -67,3 +67,6 @@ Restart cluster During replica rebuilding, stress volume node cpu stress_node_cpu_by_volume ${volume_name} + +During replica rebuilding, stress volume node memory + stress_node_memory_by_volume ${volume_name} diff --git a/e2e/libs/keywords/node_keywords.py b/e2e/libs/keywords/node_keywords.py index 8a8c161ae6..c41977bbeb 100644 --- a/e2e/libs/keywords/node_keywords.py +++ 
b/e2e/libs/keywords/node_keywords.py @@ -46,3 +46,8 @@ def stress_node_cpu_by_volume(self, volume_name): volume_keywords = BuiltIn().get_library_instance('volume_keywords') volume_node = volume_keywords.get_volume_node(volume_name) self.stress.cpu([volume_node]) + + def stress_node_memory_by_volume(self, volume_name): + volume_keywords = BuiltIn().get_library_instance('volume_keywords') + volume_node = volume_keywords.get_volume_node(volume_name) + self.stress.memory([volume_node]) diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index 08e6ee10d8..f142fca7c5 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -12,6 +12,8 @@ from workload.pod import IMAGE_LITMUX NODE_CPU_LOAD_PERCENTAGE = 100 +NODE_MEM_LOAD_PERCENTAGE = 100 +NODE_MEM_VM_WORKERS = 1 NODE_STRESS_TIMEOUT_SECOND = 300 LABEL_STRESS_HELPER = "longhorn-stress-helper" @@ -37,3 +39,19 @@ def cpu(self, node_names): pod_name = manifest['metadata']['name'] logging(f"Creating cpu stress pod {pod_name} on {node_name}") create_pod(manifest, is_wait_for_pod_running=True) + + def memory(self, node_names): + for node_name in node_names: + manifest = new_pod_manifest( + image=IMAGE_LITMUX, + command=["stress-ng"], + args=['--vm', str(NODE_MEM_VM_WORKERS), + '--vm-bytes', f"{NODE_MEM_LOAD_PERCENTAGE}%", + '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], + node_name=node_name, + labels={'app': LABEL_STRESS_HELPER} + ) + + pod_name = manifest['metadata']['name'] + logging(f"Creating memory stress pod {pod_name} on {node_name}") + create_pod(manifest, is_wait_for_pod_running=True) diff --git a/e2e/tests/replica_rebuilding.robot b/e2e/tests/replica_rebuilding.robot index f42ee8079b..6664b4d6eb 100644 --- a/e2e/tests/replica_rebuilding.robot +++ b/e2e/tests/replica_rebuilding.robot @@ -48,3 +48,15 @@ Stress Volume Node CPU While Replica Rebuilding Then Wait until replica on volume node rebuilt And Check data is intact END + +Stress Volume Node Memory While Replica Rebuilding + Given Create a volume with 5 GB and 3 replicas + And Write data to the volume + + FOR ${i} IN RANGE ${LOOP_COUNT} + When Delete replica on volume node to trigger replica rebuilding + And During replica rebuilding, stress volume node memory + + Then Wait until replica on volume node rebuilt + And Check data is intact + END From 5f6deb694d291b831cb8e1a08eebb9e4e04feab5 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Wed, 25 Oct 2023 08:13:54 +0800 Subject: [PATCH 09/24] test: make volume type configurable for reboot volume node test case to test both rwo and rwx volume Signed-off-by: Yang Chiu --- e2e/tests/node_reboot.robot | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/tests/node_reboot.robot b/e2e/tests/node_reboot.robot index a22694a7c0..780146415f 100644 --- a/e2e/tests/node_reboot.robot +++ b/e2e/tests/node_reboot.robot @@ -12,6 +12,7 @@ Test Teardown Cleanup test resources ${LOOP_COUNT} 1 ${RETRY_COUNT} 300 ${RETRY_INTERVAL} 1 +${VOLUME_TYPE} rwo *** Test Cases *** Reboot Node One By One While Workload Heavy Writing @@ -127,8 +128,7 @@ Power Off All Worker Nodes For More Than Pod Eviction Timeout While Workload Hea END Reboot Volume Node While Workload Heavy Writing - Given Create statefulset 0 with rwo volume - + Given Create statefulset 0 with ${VOLUME_TYPE} volume FOR ${i} IN RANGE ${LOOP_COUNT} And Keep writing data to statefulset 0 From 3d5f5ef52b5676337dbece94f4e161a7a6b15455 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Wed, 25 Oct 2023 16:36:34 +0800 Subject: [PATCH 10/24] test(manual): storage-network 
on Multus version above v4.0.0 ref: 6953 Signed-off-by: Chin-Ya Huang --- .../basic-operations/storage-network.md | 12 +- .../v1.6.0/test-storage-network.md | 216 ++++++++++++++++++ 2 files changed, 226 insertions(+), 2 deletions(-) create mode 100644 docs/content/manual/release-specific/v1.6.0/test-storage-network.md diff --git a/docs/content/manual/pre-release/basic-operations/storage-network.md b/docs/content/manual/pre-release/basic-operations/storage-network.md index 74567abbec..f1f59642c5 100644 --- a/docs/content/manual/pre-release/basic-operations/storage-network.md +++ b/docs/content/manual/pre-release/basic-operations/storage-network.md @@ -4,7 +4,15 @@ title: Storage Network Test ## Related issue: https://github.com/longhorn/longhorn/issues/2285 -## Test Steps +## Test Multus version below v4.0.0 **Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.3.0/test-storage-network/) **When** Run Longhorn core tests on the environment. -**Then** All the tests should pass. +**Then** All the tests should pass. + +## Related issue: +https://github.com/longhorn/longhorn/issues/6953 + +## Test Multus version above v4.0.0 +**Given** Set up the Longhorn environment as mentioned [here](https://longhorn.github.io/longhorn-tests/manual/release-specific/v1.6.0/test-storage-network/) +**When** Run Longhorn core tests on the environment. +**Then** All the tests should pass. diff --git a/docs/content/manual/release-specific/v1.6.0/test-storage-network.md b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md new file mode 100644 index 0000000000..546d0c8410 --- /dev/null +++ b/docs/content/manual/release-specific/v1.6.0/test-storage-network.md @@ -0,0 +1,216 @@ +--- +title: Setup and test storage network when Multus version is above v4.0.0 +--- + +## Related issue +https://github.com/longhorn/longhorn/issues/6953 + +## Test storage network + +### Create AWS instances +**Given** Create VPC. +- VPC only +- IPv4 CIDR 10.0.0.0/16 + +*And* Create an internet gateway. +- Attach to VPC + +*And* Add the internet gateway to the VPC `Main route table`, `Routes`. +- Destination 0.0.0.0/0 + +*And* Create 2 subnets in the VPC. +- Subnet-1: 10.0.1.0/24 +- Subnet-2: 10.0.2.0/24 + +*And* Launch 3 EC2 instances. +- Use the created VPC +- Use subnet-1 for network interface 1 +- Use subnet-2 for network interface 2 +- Disable `Auto-assign public IP` +- Add security group inbound rule to allow `All traffic` from `Anywhere-IPv4` +- Stop `Source/destination check` + +*And* Create 3 elastic IPs. + +*And* Associate one of the elastic IP to one of the EC2 instance network interface 1. +- Repeat for the other 2 EC2 instances with the remain elastic IPs. + + +### Setup instances + +**Given** K3s K8s cluster installed on EC2 instances. + +*And* Deploy Multus DaemonSet on the control-plane node. +- Download YAML. + ``` + curl -O https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/v4.0.2/deployments/multus-daemonset.yml + ``` +- Edit YAML. 
+ ``` + diff --git a/deployments/multus-daemonset.yml b/deployments/multus-daemonset.yml + index ab626a66..a7228942 100644 + --- a/deployments/multus-daemonset.yml + +++ b/deployments/multus-daemonset.yml + @@ -145,7 +145,7 @@ data: + ] + } + ], + - "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" + + "kubeconfig": "/var/lib/rancher/k3s/agent/etc/cni/net.d/multus.d/multus.kubeconfig" + } + --- + apiVersion: apps/v1 + @@ -179,12 +179,13 @@ spec: + serviceAccountName: multus + containers: + - name: kube-multus + - image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot + + image: ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2 + command: ["/thin_entrypoint"] + args: + - "--multus-conf-file=auto" + - "--multus-autoconfig-dir=/host/etc/cni/net.d" + - "--cni-conf-dir=/host/etc/cni/net.d" + + - "--multus-kubeconfig-file-host=/var/lib/rancher/k3s/agent/etc/cni/net.d/multus.d/multus.kubeconfig" + resources: + requests: + cpu: "100m" + @@ -222,10 +223,10 @@ spec: + volumes: + - name: cni + hostPath: + - path: /etc/cni/net.d + + path: /var/lib/rancher/k3s/agent/etc/cni/net.d + - name: cnibin + hostPath: + - path: /opt/cni/bin + + path: /var/lib/rancher/k3s/data/current/bin + - name: multus-cfg + configMap: + name: multus-cni-config + ``` +- Apply YAML to K8s cluster. + ``` + kubectl apply -f multus-daemonset.yml.new + ``` + +*And* Download `ipvlan` and put to K3s binaries path to all cluster nodes. +``` +curl -OL https://github.com/containernetworking/plugins/releases/download/v1.3.0/cni-plugins-linux-amd64-v1.3.0.tgz +tar -zxvf cni-plugins-linux-amd64-v1.3.0.tgz +cp ipvlan /var/lib/rancher/k3s/data/current/bin/ +``` + +*And* Setup flannels on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" +NODES=(${N1} ${N2} ${N3}) + +STORAGE_NETWORK_PREFIX="192.168" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +count=1 +for n in "${NODES[@]}"; do + [[ ${ETH1_IP} != $n ]] && ((count=count+1)) && continue + + NET=$count + break +done + +cat << EOF > /run/flannel/multus-subnet-${STORAGE_NETWORK_PREFIX}.0.0.env +FLANNEL_NETWORK=${STORAGE_NETWORK_PREFIX}.0.0/16 +FLANNEL_SUBNET=${STORAGE_NETWORK_PREFIX}.${NET}.0/24 +FLANNEL_MTU=1472 +FLANNEL_IPMASQ=true +EOF +``` +*And* Setup routes on all cluster nodes. +``` +# Update nodes eth1 IP to N1, N2, N3 +N1="10.0.2.95" +N2="10.0.2.139" +N3="10.0.2.158" + +STORAGE_NETWORK_PREFIX="192.168" +ACTION="add" + +ETH1_IP=`ip a | grep eth1 | grep -Eo 'inet (addr:)?([0-9]*\.){3}[0-9]*' | awk '{print $2}'` + +[[ ${ETH1_IP} != ${N1} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.1.0/24 via ${N1} dev eth1 +[[ ${ETH1_IP} != ${N2} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.2.0/24 via ${N2} dev eth1 +[[ ${ETH1_IP} != ${N3} ]] && ip r ${ACTION} ${STORAGE_NETWORK_PREFIX}.3.0/24 via ${N3} dev eth1 +``` + +*And* Deploy `NetworkAttachmentDefinition`. 
+``` +cat << EOF > nad-192-168-0-0.yaml +apiVersion: "k8s.cni.cncf.io/v1" +kind: NetworkAttachmentDefinition +metadata: + name: demo-192-168-0-0 + namespace: kube-system + #namespace: longhorn-system +spec: + config: '{ + "cniVersion": "0.3.1", + "type": "flannel", + "subnetFile": "/run/flannel/multus-subnet-192.168.0.0.env", + "dataDir": "/var/lib/cni/multus-subnet-192.168.0.0", + "delegate": { + "type": "ipvlan", + "master": "eth1", + "mode": "l3", + "capabilities": { + "ips": true + } + }, + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" + } + }' +EOF +kubectl apply -f nad-192-168-0-0.yaml +``` + + +### Test storage network +**Given** Longhorn deployed. + +**When** Update storage network setting value to `kube-system/demo-192-168-0-0`. + +**Then** Instance manager pods should restart. + +*And* Should have storage network in `k8s.v1.cni.cncf.io/network-status` instance manager pods annotations. +- Should have 2 network in `k8s.v1.cni.cncf.io/network-status` annotation +- `kube-system/demo-192-168-0-0` should exist in `k8s.v1.cni.cncf.io/network-status` annotation +- `kube-system/demo-192-168-0-0` should use `lhnet1` interface. +- `kube-system/demo-192-168-0-0` should be in `192.168.0.0/16` subnet. +*And* Should be able to create/attach/detach/delete volumes successfully. +- Example: + ``` + Annotations: k8s.v1.cni.cncf.io/network-status: + [{ + "name": "cbr0", + "interface": "eth0", + "ips": [ + "10.42.2.35" + ], + "mac": "26:a7:d3:0d:af:68", + "default": true, + "dns": {} + },{ + "name": "kube-system/demo-192-168-0-0", + "interface": "lhnet1", + "ips": [ + "192.168.2.230" + ], + "mac": "02:d3:d9:0b:2e:50", + "dns": {} + }] + k8s.v1.cni.cncf.io/networks: [{"namespace": "kube-system", "name": "demo-192-168-0-0", "interface": "lhnet1"}] + ``` +- Should see engine/replica `storageIP` in `192.168.0.0` subnet. 
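The verification steps above (the `k8s.v1.cni.cncf.io/network-status` annotation on the instance manager pods and the engine/replica `storageIP`) can be spot-checked from the command line. Below is a minimal sketch, assuming the default `longhorn-system` namespace, the `longhorn.io/component=instance-manager` pod label, and the `status.storageIP` field on the engine/replica objects; adjust the NetworkAttachmentDefinition name and subnet to your setup:

```bash
#!/bin/bash
# Sketch: spot-check that the storage network is wired into Longhorn.
NAMESPACE="longhorn-system"
SUBNET_PREFIX="192.168."   # storage network subnet prefix used in this test

# 1. Each instance manager pod should list the NetworkAttachmentDefinition on
#    the lhnet1 interface in its k8s.v1.cni.cncf.io/network-status annotation.
for pod in $(kubectl -n "${NAMESPACE}" get pods \
               -l longhorn.io/component=instance-manager -o name); do
  echo "### ${pod}"
  kubectl -n "${NAMESPACE}" get "${pod}" -o yaml \
    | grep -A 30 'k8s.v1.cni.cncf.io/network-status' \
    | grep -E 'demo-192-168-0-0|lhnet1|'"${SUBNET_PREFIX}"
done

# 2. Engine and replica objects should report a storageIP inside the storage
#    network subnet once a volume is attached.
for kind in engines.longhorn.io replicas.longhorn.io; do
  kubectl -n "${NAMESPACE}" get "${kind}" \
    -o custom-columns=NAME:.metadata.name,STORAGE_IP:.status.storageIP
done
```

Any pod whose annotation lacks the `lhnet1` entry, or any engine/replica whose `storageIP` falls outside `192.168.0.0/16`, suggests the storage network setting was not applied.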
From d038a55f28d27ecc456da3b705f8af28a7891632 Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Fri, 6 Oct 2023 13:54:12 +0800 Subject: [PATCH 11/24] test: implement kubelet restart currently only support k3s Signed-off-by: Yang Chiu --- e2e/keywords/kubelet.resource | 15 ++++++++ e2e/libs/keywords/kubelet_keywords.py | 6 +++ e2e/libs/kubelet/kubelet.py | 22 +++++++++++ e2e/libs/node_exec/node_exec.py | 47 ++++++++++++++++------- e2e/libs/utility/utility.py | 15 -------- e2e/libs/workload/pod.py | 54 +++++++++++++++++++++++---- e2e/tests/kubelet_restart.robot | 36 ++++++++++++++++++ 7 files changed, 159 insertions(+), 36 deletions(-) create mode 100644 e2e/keywords/kubelet.resource create mode 100644 e2e/libs/keywords/kubelet_keywords.py create mode 100644 e2e/libs/kubelet/kubelet.py create mode 100644 e2e/tests/kubelet_restart.robot diff --git a/e2e/keywords/kubelet.resource b/e2e/keywords/kubelet.resource new file mode 100644 index 0000000000..da5abe52ee --- /dev/null +++ b/e2e/keywords/kubelet.resource @@ -0,0 +1,15 @@ +*** Settings *** +Documentation Kubelet keywords + +Library ../libs/keywords/kubelet_keywords.py +Library ../libs/keywords/workload_keywords.py +Library ../libs/keywords/volume_keywords.py + +*** Variables *** + + +*** Keywords *** +Stop volume node kubelet of statefulset ${idx} for ${stop_time_in_sec} seconds + ${volume_name} = get_workload_volume_name ${statefulset_list}[${idx}] + ${node_name} = get_volume_node ${volume_name} + restart_kubelet ${node_name} ${stop_time_in_sec} diff --git a/e2e/libs/keywords/kubelet_keywords.py b/e2e/libs/keywords/kubelet_keywords.py new file mode 100644 index 0000000000..55c8e6cef6 --- /dev/null +++ b/e2e/libs/keywords/kubelet_keywords.py @@ -0,0 +1,6 @@ +from kubelet.kubelet import restart_kubelet + +class kubelet_keywords: + + def restart_kubelet(self, node_name, stop_time_in_sec): + restart_kubelet(node_name, int(stop_time_in_sec)) diff --git a/e2e/libs/kubelet/kubelet.py b/e2e/libs/kubelet/kubelet.py new file mode 100644 index 0000000000..c9c5180050 --- /dev/null +++ b/e2e/libs/kubelet/kubelet.py @@ -0,0 +1,22 @@ +from utility.utility import logging +import time + +from workload.pod import new_pod_manifest +from workload.pod import create_pod +from workload.pod import wait_for_pod_status +from workload.pod import delete_pod +from workload.pod import IMAGE_UBUNTU + +def restart_kubelet(node_name, stop_time_in_sec=10): + manifest = new_pod_manifest( + image=IMAGE_UBUNTU, + command=["/bin/bash"], + args=["-c", f"sleep 10 && systemctl stop k3s-agent && sleep {stop_time_in_sec} && systemctl start k3s-agent"], + node_name=node_name + ) + pod_name = manifest['metadata']['name'] + create_pod(manifest, is_wait_for_pod_running=True) + + time.sleep(stop_time_in_sec) + + delete_pod(pod_name) diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index 8d0ab27ae8..76011c29ef 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -4,7 +4,7 @@ from kubernetes.stream import stream from utility.utility import logging -from utility.utility import wait_delete_pod +from workload.pod import wait_delete_pod from utility.utility import wait_delete_ns @@ -52,7 +52,7 @@ def cleanup(self): namespace=self.namespace, body=client.V1DeleteOptions() ) - wait_delete_pod(pod.metadata.uid) + wait_delete_pod(pod.metadata.name) self.core_api.delete_namespace( name=self.namespace ) @@ -61,15 +61,20 @@ def cleanup(self): def issue_cmd(self, node_name, cmd): + logging(f"Issuing command: {cmd} on {node_name}") pod = 
self.launch_pod(node_name) - exec_command = [ - 'nsenter', - '--mount=/rootfs/proc/1/ns/mnt', - '--', - 'sh', - '-c', - cmd - ] + if isinstance(cmd, list): + exec_command = cmd + else: + exec_command = [ + 'nsenter', + '--mount=/rootfs/proc/1/ns/mnt', + '--net=/rootfs/proc/1/ns/net', + '--', + 'sh', + '-c', + cmd + ] res = stream( self.core_api.connect_get_namespaced_pod_exec, pod.metadata.name, @@ -80,6 +85,7 @@ def issue_cmd(self, node_name, cmd): stdout=True, tty=False ) + logging(f"Issued command: {cmd} with result {res}") return res def launch_pod(self, node_name): @@ -109,7 +115,7 @@ def launch_pod(self, node_name): } }, 'containers': [{ - 'image': 'busybox:1.34.0', + 'image': 'ubuntu:16.04', 'imagePullPolicy': 'IfNotPresent', 'securityContext': { 'privileged': True @@ -120,8 +126,13 @@ def launch_pod(self, node_name): ], "volumeMounts": [{ 'name': 'rootfs', - 'mountPath': '/rootfs', - 'readOnly': True + 'mountPath': '/rootfs' + }, { + 'name': 'bus', + 'mountPath': '/var/run' + }, { + 'name': 'rancher', + 'mountPath': '/var/lib/rancher' }], }], 'volumes': [{ @@ -129,6 +140,16 @@ def launch_pod(self, node_name): 'hostPath': { 'path': '/' } + }, { + 'name': 'bus', + 'hostPath': { + 'path': '/var/run' + } + }, { + 'name': 'rancher', + 'hostPath': { + 'path': '/var/lib/rancher' + } }] } } diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index f05085f959..0786cf93bd 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -136,21 +136,6 @@ def filter_cr(group, version, namespace, plural, field_selector="", label_select except ApiException as e: logging(f"Listing namespaced custom object: {e}") -def wait_delete_pod(pod_uid, namespace='default'): - api = client.CoreV1Api() - retry_count, retry_interval = get_retry_count_and_interval() - for i in range(retry_count): - ret = api.list_namespaced_pod(namespace=namespace) - found = False - for item in ret.items: - if item.metadata.uid == pod_uid: - found = True - break - if not found: - break - time.sleep(retry_interval) - assert not found - def wait_delete_ns(name): api = client.CoreV1Api() retry_count, retry_interval = get_retry_count_and_interval() diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index 274067441f..84c2d278cf 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -2,14 +2,14 @@ from kubernetes import client -from robot.libraries.BuiltIn import BuiltIn - from utility.utility import logging from utility.utility import generate_name +from utility.utility import get_retry_count_and_interval IMAGE_BUSYBOX = 'busybox:1.34.0' IMAGE_LITMUX = 'litmuschaos/go-runner:latest' +IMAGE_UBUNTU = 'ubuntu:16.04' def new_pod_manifest(image="", command=[], args=[], claim_name="", node_name="", labels={}): @@ -31,15 +31,35 @@ def new_pod_manifest(image="", command=[], args=[], }, 'spec': { 'nodeName': node_name, + 'restartPolicy': 'Never', 'containers': [{ 'image': image, 'imagePullPolicy': 'IfNotPresent', + 'securityContext': { + 'privileged': True + }, 'name': 'run', 'command': command, 'args': args, - 'volumeMounts': [], + 'volumeMounts': [{ + 'name': 'bus', + 'mountPath': '/var/run' + }, { + 'name': 'rancher', + 'mountPath': '/var/lib/rancher' + }] }], - 'volumes': [] + 'volumes': [{ + 'name': 'bus', + 'hostPath': { + 'path': '/var/run' + } + }, { + 'name': 'rancher', + 'hostPath': { + 'path': '/var/lib/rancher' + } + }] } } @@ -73,15 +93,33 @@ def create_pod(manifest, is_wait_for_pod_running=False): def delete_pod(name, namespace='default'): core_api = 
client.CoreV1Api() - core_api.delete_namespaced_pod(name=name, namespace=namespace) + try: + core_api.delete_namespaced_pod(name=name, namespace=namespace) + wait_delete_pod(name) + except ApiException as e: + assert e.status == 404 + +def wait_delete_pod(name, namespace='default'): + api = client.CoreV1Api() + retry_count, retry_interval = get_retry_count_and_interval() + for i in range(retry_count): + ret = api.list_namespaced_pod(namespace=namespace) + found = False + for item in ret.items: + if item.metadata.name == name: + found = True + break + if not found: + break + time.sleep(retry_interval) + assert not found def get_pod(name, namespace='default'): core_api = client.CoreV1Api() return core_api.read_namespaced_pod(name=name, namespace=namespace) -def wait_for_pod_status(name, status, namespace='default', - retry_count=int(BuiltIn().get_variable_value("${RETRY_COUNT}")), - retry_interval=int(BuiltIn().get_variable_value("${RETRY_INTERVAL}"))): +def wait_for_pod_status(name, status, namespace='default'): + retry_count, retry_interval = get_retry_count_and_interval() is_running = False for i in range(retry_count): pod = get_pod(name, namespace) diff --git a/e2e/tests/kubelet_restart.robot b/e2e/tests/kubelet_restart.robot new file mode 100644 index 0000000000..feb542b144 --- /dev/null +++ b/e2e/tests/kubelet_restart.robot @@ -0,0 +1,36 @@ +*** Settings *** +Documentation Negative Test Cases +Resource ../keywords/workload.resource +Resource ../keywords/volume.resource +Resource ../keywords/node.resource +Resource ../keywords/common.resource +Resource ../keywords/kubelet.resource + +Test Setup Set test environment +Test Teardown Cleanup test resources + +*** Variables *** +${LOOP_COUNT} 1 +${RETRY_COUNT} 300 +${RETRY_INTERVAL} 1 + +*** Test Cases *** +Restart Volume Node Kubelet While Workload Heavy Writing + Given Create statefulset 0 with rwo volume + FOR ${i} IN RANGE ${LOOP_COUNT} + And Keep writing data to statefulset 0 + When Stop volume node kubelet of statefulset 0 for 10 seconds + And Wait for volume of statefulset 0 healthy + And Wait for statefulset 0 stable + Then Check statefulset 0 works + END + +Stop Volume Node Kubelet For More Than Pod Eviction Timeout While Workload Heavy Writing + Given Create statefulset 0 with rwo volume + FOR ${i} IN RANGE ${LOOP_COUNT} + And Keep writing data to statefulset 0 + When Stop volume node kubelet of statefulset 0 for 360 seconds + And Wait for volume of statefulset 0 healthy + And Wait for statefulset 0 stable + Then Check statefulset 0 works + END From a43b84a438938dbe399269d5df7ed2a4995bca0a Mon Sep 17 00:00:00 2001 From: Yang Chiu Date: Mon, 2 Oct 2023 17:03:10 +0800 Subject: [PATCH 12/24] test: inject control plane node network latency Signed-off-by: Yang Chiu --- e2e/keywords/common.resource | 3 +++ e2e/keywords/node.resource | 2 ++ e2e/libs/keywords/network_keywords.py | 10 ++++++++ e2e/libs/network/network.py | 29 +++++++++++++++++++++ e2e/libs/node_exec/node_exec.py | 36 +++++++++++++++++++++++++-- e2e/libs/utility/utility.py | 10 ++++++++ e2e/tests/cluster_restart.robot | 1 + e2e/tests/node_reboot.robot | 1 + 8 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 e2e/libs/keywords/network_keywords.py create mode 100644 e2e/libs/network/network.py diff --git a/e2e/keywords/common.resource b/e2e/keywords/common.resource index e978ddd0fc..c1757e5d99 100644 --- a/e2e/keywords/common.resource +++ b/e2e/keywords/common.resource @@ -6,6 +6,7 @@ Library ../libs/keywords/node_keywords.py Library 
../libs/keywords/volume_keywords.py Library ../libs/keywords/recurring_job_keywords.py Library ../libs/keywords/workload_keywords.py +Library ../libs/keywords/network_keywords.py *** Variables *** @@ -22,8 +23,10 @@ Set test environment Set Test Variable ${deployment_list} @{statefulset_list} = Create List Set Test Variable ${statefulset_list} + setup_control_plane_network_latency Cleanup test resources + cleanup_control_plane_network_latency cleanup_node_exec cleanup_stress_helper cleanup_recurring_jobs ${volume_list} diff --git a/e2e/keywords/node.resource b/e2e/keywords/node.resource index 59f2b3313f..59bb4cb61f 100644 --- a/e2e/keywords/node.resource +++ b/e2e/keywords/node.resource @@ -4,6 +4,7 @@ Documentation Physical Node Keywords Library ../libs/keywords/volume_keywords.py Library ../libs/keywords/node_keywords.py Library ../libs/keywords/workload_keywords.py +Library ../libs/keywords/network_keywords.py *** Keywords *** During replica rebuilding, reboot volume node @@ -57,6 +58,7 @@ Wait for longhorn ready Restart cluster reboot_all_nodes + setup_control_plane_network_latency wait_for_all_instance_manager_running FOR ${deployment} IN @{deployment_list} wait_for_workload_pod_stable ${deployment} diff --git a/e2e/libs/keywords/network_keywords.py b/e2e/libs/keywords/network_keywords.py new file mode 100644 index 0000000000..93a86797d5 --- /dev/null +++ b/e2e/libs/keywords/network_keywords.py @@ -0,0 +1,10 @@ +from network.network import setup_control_plane_network_latency +from network.network import cleanup_control_plane_network_latency + +class network_keywords: + + def setup_control_plane_network_latency(self): + setup_control_plane_network_latency() + + def cleanup_control_plane_network_latency(self): + cleanup_control_plane_network_latency() diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py new file mode 100644 index 0000000000..a9d81b4b3d --- /dev/null +++ b/e2e/libs/network/network.py @@ -0,0 +1,29 @@ +from robot.libraries.BuiltIn import BuiltIn +from utility.utility import get_control_plane_nodes +from node_exec import NodeExec + +def get_control_plane_node_network_latency_in_ms(): + latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}")) + return latency_in_ms + +def setup_control_plane_network_latency(): + latency_in_ms = get_control_plane_node_network_latency_in_ms() + if latency_in_ms != 0: + nodes = get_control_plane_nodes() + for node in nodes: + cmd = f"tc qdisc replace dev eth0 root netem delay {latency_in_ms}ms" + res = NodeExec.get_instance().issue_cmd(node, cmd) + cmd = f"tc qdisc show dev eth0 | grep delay" + res = NodeExec.get_instance().issue_cmd(node, cmd) + assert res, "setup control plane network latency failed" + +def cleanup_control_plane_network_latency(): + latency_in_ms = get_control_plane_node_network_latency_in_ms() + if latency_in_ms != 0: + nodes = get_control_plane_nodes() + for node in nodes: + cmd = "tc qdisc del dev eth0 root" + res = NodeExec.get_instance().issue_cmd(node, cmd) + cmd = f"tc qdisc show dev eth0 | grep -v delay" + res = NodeExec.get_instance().issue_cmd(node, cmd) + assert res, "cleanup control plane network failed" \ No newline at end of file diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index 76011c29ef..d01f39988f 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -85,12 +85,20 @@ def issue_cmd(self, node_name, cmd): stdout=True, tty=False ) - logging(f"Issued command: {cmd} with result {res}") + 
logging(f"Issued command: {cmd} on {node_name} with result {res}") return res def launch_pod(self, node_name): if node_name in self.node_exec_pod: - return self.node_exec_pod[node_name] + for i in range(DEFAULT_POD_TIMEOUT): + pod = self.core_api.read_namespaced_pod( + name=node_name, + namespace=self.namespace + ) + if pod is not None and pod.status.phase == 'Running': + break + time.sleep(DEFAULT_POD_INTERVAL) + return pod else: pod_manifest = { 'apiVersion': 'v1', @@ -114,6 +122,30 @@ def launch_pod(self, node_name): } } }, + "tolerations": [{ + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }, + { + "key": "node-role.kubernetes.io/master", + "operator": "Equal", + "value": "true", + "effect": "NoExecute" + }, + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + }, + { + "key": "node-role.kubernetes.io/control-plane", + "operator": "Equal", + "value": "true", + "effect": "NoExecute" + }], 'containers': [{ 'image': 'ubuntu:16.04', 'imagePullPolicy': 'IfNotPresent', diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 0786cf93bd..9108cf5805 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -53,6 +53,16 @@ def list_nodes(): nodes.append(item.metadata.name) return sorted(nodes) +def get_control_plane_nodes(): + core_api = client.CoreV1Api() + obj = core_api.list_node() + nodes = [] + for item in obj.items: + if 'node-role.kubernetes.io/control-plane' in item.metadata.labels or \ + 'node-role.kubernetes.io/master' in item.metadata.labels: + nodes.append(item.metadata.name) + return sorted(nodes) + def wait_for_cluster_ready(): core_api = client.CoreV1Api() retry_count, retry_interval = get_retry_count_and_interval() diff --git a/e2e/tests/cluster_restart.robot b/e2e/tests/cluster_restart.robot index 1fffb04c86..06c160a093 100644 --- a/e2e/tests/cluster_restart.robot +++ b/e2e/tests/cluster_restart.robot @@ -11,6 +11,7 @@ Test Teardown Cleanup test resources ${LOOP_COUNT} 1 ${RETRY_COUNT} 300 ${RETRY_INTERVAL} 1 +${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0 *** Test Cases *** Restart Cluster While Workload Heavy Writing diff --git a/e2e/tests/node_reboot.robot b/e2e/tests/node_reboot.robot index 780146415f..fbc87ef62e 100644 --- a/e2e/tests/node_reboot.robot +++ b/e2e/tests/node_reboot.robot @@ -13,6 +13,7 @@ ${LOOP_COUNT} 1 ${RETRY_COUNT} 300 ${RETRY_INTERVAL} 1 ${VOLUME_TYPE} rwo +${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS} 0 *** Test Cases *** Reboot Node One By One While Workload Heavy Writing From 7bf0723b6a39c28f581595de2c1b30e91245937a Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Fri, 27 Oct 2023 14:38:50 +0800 Subject: [PATCH 13/24] refactor(negative): remove unused variable assignment Signed-off-by: Chin-Ya Huang --- e2e/libs/keywords/workload_keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/libs/keywords/workload_keywords.py b/e2e/libs/keywords/workload_keywords.py index cee3fb2c94..10c0299876 100644 --- a/e2e/libs/keywords/workload_keywords.py +++ b/e2e/libs/keywords/workload_keywords.py @@ -14,7 +14,7 @@ def cleanup_storageclasses(self): delete_storageclass('longhorn-test-strict-local') def create_deployment(self, volume_type="rwo", option=""): - pvc_name = create_pvc(volume_type, option) + create_pvc(volume_type, option) deployment_name = create_deployment(volume_type, option) return deployment_name From 2a36c2b9de4d173e708e8a86be2d4ef212fcdc09 Mon Sep 17 
00:00:00 2001 From: Chin-Ya Huang Date: Fri, 27 Oct 2023 14:40:42 +0800 Subject: [PATCH 14/24] doc(negative): remove unnecessary comments Signed-off-by: Chin-Ya Huang --- e2e/libs/keywords/volume_keywords.py | 1 - 1 file changed, 1 deletion(-) diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index 192b1a303f..4a3c0fa955 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -28,7 +28,6 @@ def attach_volume(self, volume_name): def get_volume_node(self, volume_name): volume = self.volume.get(volume_name) return volume['spec']['nodeID'] - # return volume.controllers[0].hostId def get_replica_node(self, volume_name): From d562b7eaed249de0d7207d86b963a5379e554d07 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Fri, 27 Oct 2023 14:44:07 +0800 Subject: [PATCH 15/24] style(negative): reorganize imports Signed-off-by: Chin-Ya Huang --- e2e/libs/keywords/volume_keywords.py | 8 +++++--- e2e/libs/kubelet/kubelet.py | 5 ++--- e2e/libs/network/network.py | 3 +++ e2e/libs/node/stress.py | 2 -- e2e/libs/utility/utility.py | 15 +++++++++------ 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index 4a3c0fa955..ce0719a467 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -1,7 +1,9 @@ -from utility.utility import logging from utility.utility import generate_volume_name -from utility.utility import get_node, list_nodes -from utility.utility import get_test_pod_running_node, get_test_pod_not_running_node +from utility.utility import get_node +from utility.utility import get_test_pod_not_running_node +from utility.utility import get_test_pod_running_node +from utility.utility import list_nodes +from utility.utility import logging from volume import Volume diff --git a/e2e/libs/kubelet/kubelet.py b/e2e/libs/kubelet/kubelet.py index c9c5180050..ee306c54c7 100644 --- a/e2e/libs/kubelet/kubelet.py +++ b/e2e/libs/kubelet/kubelet.py @@ -1,10 +1,9 @@ -from utility.utility import logging import time -from workload.pod import new_pod_manifest from workload.pod import create_pod -from workload.pod import wait_for_pod_status from workload.pod import delete_pod +from workload.pod import new_pod_manifest + from workload.pod import IMAGE_UBUNTU def restart_kubelet(node_name, stop_time_in_sec=10): diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py index a9d81b4b3d..838de57bf8 100644 --- a/e2e/libs/network/network.py +++ b/e2e/libs/network/network.py @@ -1,7 +1,10 @@ from robot.libraries.BuiltIn import BuiltIn + from utility.utility import get_control_plane_nodes + from node_exec import NodeExec + def get_control_plane_node_network_latency_in_ms(): latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}")) return latency_in_ms diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index f142fca7c5..efca69856a 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -1,5 +1,3 @@ -from kubernetes import client - from node.utility import get_node_cpu_cores from utility.utility import logging diff --git a/e2e/libs/utility/utility.py b/e2e/libs/utility/utility.py index 9108cf5805..da7f2fca18 100644 --- a/e2e/libs/utility/utility.py +++ b/e2e/libs/utility/utility.py @@ -1,14 +1,17 @@ -from kubernetes import config, client, dynamic -from kubernetes.client.rest import ApiException -from kubernetes.stream import stream -from longhorn import 
from_env -import string -import random import os import socket +import string import time +import random import yaml +from longhorn import from_env + +from kubernetes import client +from kubernetes import config +from kubernetes import dynamic +from kubernetes.client.rest import ApiException + from robot.api import logger from robot.libraries.BuiltIn import BuiltIn From e2cbf7d6089dab12cf332d5fa67e510e0b700886 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Fri, 27 Oct 2023 16:03:18 +0800 Subject: [PATCH 16/24] refactor(negative): improve log Signed-off-by: Chin-Ya Huang --- e2e/libs/keywords/volume_keywords.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/libs/keywords/volume_keywords.py b/e2e/libs/keywords/volume_keywords.py index ce0719a467..6caa9cb4b0 100644 --- a/e2e/libs/keywords/volume_keywords.py +++ b/e2e/libs/keywords/volume_keywords.py @@ -16,14 +16,14 @@ def __init__(self): def create_volume(self, size, replica_count): volume_name = generate_volume_name() + logging(f'Creating volume {volume_name}') self.volume.create(volume_name, size, replica_count) - logging(f'Created volume {volume_name}') return volume_name def attach_volume(self, volume_name): attach_node = get_test_pod_not_running_node() - logging(f'Attached volume {volume_name} to {attach_node}') + logging(f'Attaching volume {volume_name} to {attach_node}') self.volume.attach(volume_name, attach_node) From 2b96c656bc37856be68c70d4419ee2a7bc1809e4 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 13:57:12 +0800 Subject: [PATCH 17/24] fix(negative): import "utils.common_utils" could not be resolved Signed-off-by: Chin-Ya Huang --- e2e/libs/engine/crd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py index 5b4ddef80b..cf027bcc8c 100644 --- a/e2e/libs/engine/crd.py +++ b/e2e/libs/engine/crd.py @@ -1,13 +1,13 @@ import logging -from engine.base import Base +from kubernetes import client -from utils.common_utils import k8s_cr_api +from engine.base import Base class CRD(Base): def __init__(self): - self.cr_api = k8s_cr_api() + self.obj_api = client.CustomObjectsApi() def get_engine(self, volume_name, node_name): if volume_name == "" or node_name == "": @@ -48,7 +48,7 @@ def delete_engine(self, volume_name, node_name): for engine in engines: engine_name = engine['metadata']['name'] - self.cr_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", From d33fbe09b1ea6a88a740b678160554fdd60a74fc Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 14:25:33 +0800 Subject: [PATCH 18/24] fix(negative): get_variable_value "NoneType" error Should return default if the variable does not exist. 
Signed-off-by: Chin-Ya Huang --- e2e/libs/network/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/libs/network/network.py b/e2e/libs/network/network.py index 838de57bf8..fdcfd534de 100644 --- a/e2e/libs/network/network.py +++ b/e2e/libs/network/network.py @@ -6,7 +6,7 @@ def get_control_plane_node_network_latency_in_ms(): - latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}")) + latency_in_ms = int(BuiltIn().get_variable_value("${CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS}", default="0")) return latency_in_ms def setup_control_plane_network_latency(): From 188bf997a0606a939f72007863dbcaf5975596f2 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 11:18:18 +0800 Subject: [PATCH 19/24] refactor(negative): move constants to constant.py Signed-off-by: Chin-Ya Huang --- e2e/libs/kubelet/kubelet.py | 3 ++- e2e/libs/node/constant.py | 6 ++++++ e2e/libs/node/stress.py | 20 ++++++++++---------- e2e/libs/node_exec/constant.py | 2 ++ e2e/libs/node_exec/node_exec.py | 4 ++-- e2e/libs/recurring_job/constant.py | 2 ++ e2e/libs/recurring_job/rest.py | 4 ++-- e2e/libs/replica/constant.py | 2 ++ e2e/libs/replica/rest.py | 4 ++-- e2e/libs/volume/constant.py | 11 +++++++++++ e2e/libs/volume/crd.py | 7 ++----- e2e/libs/volume/rest.py | 12 +++++------- e2e/libs/workload/constant.py | 3 +++ e2e/libs/workload/pod.py | 5 +---- 14 files changed, 52 insertions(+), 33 deletions(-) create mode 100644 e2e/libs/node/constant.py create mode 100644 e2e/libs/node_exec/constant.py create mode 100644 e2e/libs/recurring_job/constant.py create mode 100644 e2e/libs/replica/constant.py create mode 100644 e2e/libs/volume/constant.py create mode 100644 e2e/libs/workload/constant.py diff --git a/e2e/libs/kubelet/kubelet.py b/e2e/libs/kubelet/kubelet.py index ee306c54c7..06beb039da 100644 --- a/e2e/libs/kubelet/kubelet.py +++ b/e2e/libs/kubelet/kubelet.py @@ -4,7 +4,8 @@ from workload.pod import delete_pod from workload.pod import new_pod_manifest -from workload.pod import IMAGE_UBUNTU +from workload.constant import IMAGE_UBUNTU + def restart_kubelet(node_name, stop_time_in_sec=10): manifest = new_pod_manifest( diff --git a/e2e/libs/node/constant.py b/e2e/libs/node/constant.py new file mode 100644 index 0000000000..ae7aec1941 --- /dev/null +++ b/e2e/libs/node/constant.py @@ -0,0 +1,6 @@ +NODE_STRESS_CPU_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_LOAD_PERCENTAGE = 100 +NODE_STRESS_MEM_VM_WORKERS = 1 +NODE_STRESS_TIMEOUT_SECOND = 300 + +LABEL_STRESS_HELPER = "longhorn-stress-helper" diff --git a/e2e/libs/node/stress.py b/e2e/libs/node/stress.py index efca69856a..f5072866f7 100644 --- a/e2e/libs/node/stress.py +++ b/e2e/libs/node/stress.py @@ -1,5 +1,11 @@ from node.utility import get_node_cpu_cores +from node.constant import LABEL_STRESS_HELPER +from node.constant import NODE_STRESS_CPU_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_LOAD_PERCENTAGE +from node.constant import NODE_STRESS_MEM_VM_WORKERS +from node.constant import NODE_STRESS_TIMEOUT_SECOND + from utility.utility import logging from workload.pod import create_pod @@ -7,14 +13,8 @@ from workload.pod import new_pod_manifest from workload.workload import get_workload_pods -from workload.pod import IMAGE_LITMUX - -NODE_CPU_LOAD_PERCENTAGE = 100 -NODE_MEM_LOAD_PERCENTAGE = 100 -NODE_MEM_VM_WORKERS = 1 -NODE_STRESS_TIMEOUT_SECOND = 300 +from workload.constant import IMAGE_LITMUX -LABEL_STRESS_HELPER = "longhorn-stress-helper" class Stress: def cleanup(self): @@ -28,7 +28,7 @@ 
def cpu(self, node_names): image=IMAGE_LITMUX, command=["stress-ng"], args=['--cpu', str(get_node_cpu_cores(node_name)), - '--cpu-load', str(NODE_CPU_LOAD_PERCENTAGE), + '--cpu-load', str(NODE_STRESS_CPU_LOAD_PERCENTAGE), '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, labels={'app': LABEL_STRESS_HELPER} @@ -43,8 +43,8 @@ def memory(self, node_names): manifest = new_pod_manifest( image=IMAGE_LITMUX, command=["stress-ng"], - args=['--vm', str(NODE_MEM_VM_WORKERS), - '--vm-bytes', f"{NODE_MEM_LOAD_PERCENTAGE}%", + args=['--vm', str(NODE_STRESS_MEM_VM_WORKERS), + '--vm-bytes', f"{NODE_STRESS_MEM_LOAD_PERCENTAGE}%", '--timeout', str(NODE_STRESS_TIMEOUT_SECOND)], node_name=node_name, labels={'app': LABEL_STRESS_HELPER} diff --git a/e2e/libs/node_exec/constant.py b/e2e/libs/node_exec/constant.py new file mode 100644 index 0000000000..255c49afc4 --- /dev/null +++ b/e2e/libs/node_exec/constant.py @@ -0,0 +1,2 @@ +DEFAULT_POD_TIMEOUT = 180 +DEFAULT_POD_INTERVAL = 1 diff --git a/e2e/libs/node_exec/node_exec.py b/e2e/libs/node_exec/node_exec.py index d01f39988f..d1c2136076 100644 --- a/e2e/libs/node_exec/node_exec.py +++ b/e2e/libs/node_exec/node_exec.py @@ -7,9 +7,9 @@ from workload.pod import wait_delete_pod from utility.utility import wait_delete_ns +from node_exec.constant import DEFAULT_POD_INTERVAL +from node_exec.constant import DEFAULT_POD_TIMEOUT -DEFAULT_POD_TIMEOUT = 180 -DEFAULT_POD_INTERVAL = 1 class NodeExec: diff --git a/e2e/libs/recurring_job/constant.py b/e2e/libs/recurring_job/constant.py new file mode 100644 index 0000000000..bb5017e701 --- /dev/null +++ b/e2e/libs/recurring_job/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 180 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/recurring_job/rest.py b/e2e/libs/recurring_job/rest.py index 5d54ed2ac8..fa01bceb75 100644 --- a/e2e/libs/recurring_job/rest.py +++ b/e2e/libs/recurring_job/rest.py @@ -10,9 +10,9 @@ from utility.utility import get_longhorn_client from utility.utility import logging +from recurring_job.constant import RETRY_COUNTS +from recurring_job.constant import RETRY_INTERVAL -RETRY_COUNTS = 180 -RETRY_INTERVAL = 1 class Rest(Base): diff --git a/e2e/libs/replica/constant.py b/e2e/libs/replica/constant.py new file mode 100644 index 0000000000..82e875b169 --- /dev/null +++ b/e2e/libs/replica/constant.py @@ -0,0 +1,2 @@ +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 diff --git a/e2e/libs/replica/rest.py b/e2e/libs/replica/rest.py index 6ace292c03..a5d111be6e 100644 --- a/e2e/libs/replica/rest.py +++ b/e2e/libs/replica/rest.py @@ -4,9 +4,9 @@ from utils import common_utils +from replica.constant import RETRY_COUNTS +from replica.constant import RETRY_INTERVAL -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 class Rest(Base): def __init__(self, node_exec): diff --git a/e2e/libs/volume/constant.py b/e2e/libs/volume/constant.py new file mode 100644 index 0000000000..c9d6e4a990 --- /dev/null +++ b/e2e/libs/volume/constant.py @@ -0,0 +1,11 @@ +KIBIBYTE = 1024 +MEBIBYTE = (KIBIBYTE * KIBIBYTE) +GIBIBYTE = (MEBIBYTE * KIBIBYTE) + +RETRY_COUNTS = 150 +RETRY_INTERVAL = 1 + +VOLUME_FRONTEND_BLOCKDEV = "blockdev" +VOLUME_FRONTEND_ISCSI = "iscsi" + +DEV_PATH = "/dev/longhorn/" diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 1c9e4a155d..33d94444a9 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -8,10 +8,7 @@ from volume.base import Base from volume.rest import Rest - -Ki = 2**10 -Mi = 2**20 -Gi = 2**30 +from volume.constant import GIBIBYTE class CRD(Base): @@ -38,7 +35,7 @@ def create(self, volume_name, 
size, replica_count): "spec": { "frontend": "blockdev", "replicaAutoBalance": "ignored", - "size": str(int(size) * Gi), + "size": str(int(size) * GIBIBYTE), "numberOfReplicas": int(replica_count) } } diff --git a/e2e/libs/volume/rest.py b/e2e/libs/volume/rest.py index 9443bf9961..7bb43d36c7 100644 --- a/e2e/libs/volume/rest.py +++ b/e2e/libs/volume/rest.py @@ -6,13 +6,11 @@ from volume.base import Base - -RETRY_COUNTS = 150 -RETRY_INTERVAL = 1 - -VOLUME_FRONTEND_BLOCKDEV = "blockdev" -VOLUME_FRONTEND_ISCSI = "iscsi" -DEV_PATH = "/dev/longhorn/" +from volume.constant import DEV_PATH +from volume.constant import RETRY_COUNTS +from volume.constant import RETRY_INTERVAL +from volume.constant import VOLUME_FRONTEND_BLOCKDEV +from volume.constant import VOLUME_FRONTEND_ISCSI class Rest(Base): diff --git a/e2e/libs/workload/constant.py b/e2e/libs/workload/constant.py new file mode 100644 index 0000000000..cd7aa90153 --- /dev/null +++ b/e2e/libs/workload/constant.py @@ -0,0 +1,3 @@ +IMAGE_BUSYBOX = 'busybox:1.34.0' +IMAGE_LITMUX = 'litmuschaos/go-runner:latest' +IMAGE_UBUNTU = 'ubuntu:16.04' diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index 84c2d278cf..d2ec4c0ba3 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -6,10 +6,7 @@ from utility.utility import generate_name from utility.utility import get_retry_count_and_interval - -IMAGE_BUSYBOX = 'busybox:1.34.0' -IMAGE_LITMUX = 'litmuschaos/go-runner:latest' -IMAGE_UBUNTU = 'ubuntu:16.04' +from workload.constant import IMAGE_BUSYBOX def new_pod_manifest(image="", command=[], args=[], claim_name="", node_name="", labels={}): From f4bab23546c7c8480287b9f36fc5483e597596fb Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 11:19:16 +0800 Subject: [PATCH 20/24] fix(negative): ApiException not defined Signed-off-by: Chin-Ya Huang --- e2e/libs/workload/pod.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/e2e/libs/workload/pod.py b/e2e/libs/workload/pod.py index d2ec4c0ba3..144acee206 100644 --- a/e2e/libs/workload/pod.py +++ b/e2e/libs/workload/pod.py @@ -1,6 +1,7 @@ import time from kubernetes import client +from kubernetes.client import rest from utility.utility import logging from utility.utility import generate_name @@ -93,7 +94,7 @@ def delete_pod(name, namespace='default'): try: core_api.delete_namespaced_pod(name=name, namespace=namespace) wait_delete_pod(name) - except ApiException as e: + except rest.ApiException as e: assert e.status == 404 def wait_delete_pod(name, namespace='default'): From 653379e8cf9b9d88c49a80958333285753e1db95 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 11:57:50 +0800 Subject: [PATCH 21/24] refactor(negative): remove unused variable assignment Signed-off-by: Chin-Ya Huang --- e2e/libs/volume/crd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 33d94444a9..af7dafcf0f 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -95,7 +95,7 @@ def attach(self, volume_name, node_name): def delete(self, volume_name): try: - resp = self.obj_api.delete_namespaced_custom_object( + self.obj_api.delete_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", @@ -109,7 +109,7 @@ def delete(self, volume_name): def wait_for_volume_delete(self, volume_name): for i in range(self.retry_count): try: - resp = self.obj_api.get_namespaced_custom_object( + self.obj_api.get_namespaced_custom_object( group="longhorn.io", 
version="v1beta2", namespace="longhorn-system", From 2d045a01d5504a55cf3e1392b33aaecda501cdb3 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 15:58:59 +0800 Subject: [PATCH 22/24] fix(negative): logging is not defined Signed-off-by: Chin-Ya Huang --- e2e/libs/engine/engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/e2e/libs/engine/engine.py b/e2e/libs/engine/engine.py index 2003712917..3c4bc2cf9f 100644 --- a/e2e/libs/engine/engine.py +++ b/e2e/libs/engine/engine.py @@ -3,6 +3,8 @@ from strategy import LonghornOperationStrategy +from utility.utility import logging + class Engine(Base): From 3d21dda02a7893622347eec91460bb90eb0d23df Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Mon, 30 Oct 2023 16:38:16 +0800 Subject: [PATCH 23/24] fix(negative): 'CRD' object has no attribute 'cr_api' Signed-off-by: Chin-Ya Huang --- e2e/libs/engine/crd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/libs/engine/crd.py b/e2e/libs/engine/crd.py index cf027bcc8c..8ab7f1e372 100644 --- a/e2e/libs/engine/crd.py +++ b/e2e/libs/engine/crd.py @@ -22,7 +22,7 @@ def get_engine(self, volume_name, node_name): if node_name != "": label_selector.append(f"longhornnode={node_name}") - api_response = self.cr_api.list_namespaced_custom_object( + api_response = self.obj_api.list_namespaced_custom_object( group="longhorn.io", version="v1beta2", namespace="longhorn-system", From 2b993492a65c0514fc6a3565c1847fac454dc734 Mon Sep 17 00:00:00 2001 From: Chin-Ya Huang Date: Wed, 1 Nov 2023 15:58:02 +0800 Subject: [PATCH 24/24] refactor: improve logging Signed-off-by: Chin-Ya Huang --- e2e/libs/volume/crd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/libs/volume/crd.py b/e2e/libs/volume/crd.py index 1c9e4a155d..0f55e5719f 100644 --- a/e2e/libs/volume/crd.py +++ b/e2e/libs/volume/crd.py @@ -152,7 +152,7 @@ def wait_for_volume_robustness(self, volume_name, desired_state): def wait_for_volume_robustness_not(self, volume_name, not_desired_state): for i in range(self.retry_count): - logging(f"Waiting for {volume_name} not {not_desired_state} ({i}) ...") + logging(f"Waiting for {volume_name} robustness not {not_desired_state} ({i}) ...") try: if self.get(volume_name)["status"]["robustness"] != not_desired_state: break
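
A minimal sketch of how the negative-test suites added in this series can be invoked, using the variables and file paths introduced above (this assumes the bare Robot Framework CLI with `e2e/libs` on the Python path; the repository may provide its own runner script):

```
cd e2e
# Run the kubelet-restart suite once with the default retry budget.
robot --pythonpath libs -v LOOP_COUNT:1 -v RETRY_COUNT:300 -v RETRY_INTERVAL:1 tests/kubelet_restart.robot

# Run the node-reboot suite while injecting 20 ms of control plane network latency
# (CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS:0 skips the tc netem setup entirely).
robot --pythonpath libs -v CONTROL_PLANE_NODE_NETWORK_LATENCY_IN_MS:20 tests/node_reboot.robot
```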