Commit 6ff3fdd

Merge branch 'master' into backupstore

innobead authored Oct 3, 2023
2 parents 0159bc7 + d16c87d commit 6ff3fdd
Showing 17 changed files with 253 additions and 939 deletions.
94 changes: 94 additions & 0 deletions e2e/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Longhorn e2e tests

### Requirements

1. A Kubernetes cluster with 3 worker nodes.
- And control plane node(s) with the following taints:
- `node-role.kubernetes.io/master=true:NoExecute`
- `node-role.kubernetes.io/master=true:NoSchedule`
2. Longhorn system has already been successfully deployed in the cluster.
3. Run the environment check script to check if each node in the cluster fulfills the requirements:
```
curl -sSfL https://raw.githubusercontent.com/longhorn/longhorn/master/scripts/environment_check.sh | bash
```

### Run the test

1. Deploy all backupstore servers (including an `NFS` server and `MinIO` as the S3 server) for test purposes.
```
kubectl create -f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/minio-backupstore.yaml \
-f https://raw.githubusercontent.com/longhorn/longhorn/master/deploy/backupstores/nfs-backupstore.yaml
```

2. Expose Longhorn API:
```
# for example, using NodePort:
kubectl expose --type=NodePort deployment longhorn-ui -n longhorn-system --port 8000 --name longhorn-ui-nodeport --overrides '{ "apiVersion": "v1","spec":{"ports": [{"port":8000,"protocol":"TCP","targetPort":8000,"nodePort":30000}]}}'
# or using port-forward:
kubectl port-forward services/longhorn-frontend 8080:http -n longhorn-system
```

3. Export environment variable `KUBECONFIG`:
```
export KUBECONFIG=/path/to/your/kubeconfig.yaml
```

4. Export environment variable `LONGHORN_CLIENT_URL`:
```
# for example, if it's exposed via NodePort:
export LONGHORN_CLIENT_URL=http://node-public-ip:30000
# or exposed via port-forward:
export LONGHORN_CLIENT_URL=http://localhost:8080
```

5. Prepare the test environment and run the tests:
```
cd e2e
python -m venv .
source bin/activate
pip install -r requirements.txt
# to run all the test cases, simply execute:
./run.sh
# to specify the test case you'd like to run, use "-t" option:
./run.sh -t "Reboot Volume Node While Workload Heavy Writing"
# to specify the LOOP_COUNT or any other test variables, use "-v" option:
./run.sh -t "Reboot Volume Node While Workload Heavy Writing" -v LOOP_COUNT:100 -v RETRY_COUNT:259200
# to specify which test suite you'd like to run, use "-s" option:
./run.sh -s "replica_rebuilding"
# to modify debug level, use "-L" option:
./run.sh -L DEBUG
```

Once the tests complete, the results can be found in the `/tmp/test-report` folder.

### Architecture

The e2e robot test framework includes 4 layers:

```
---------------------------------------------------------------------
| |
| tests/*.robot: Test Case Definition |
| |
---------------------------------------------------------------------
| |
| keywords/*.resource: Keyword Definition |
| |
---------------------------------------------------------------------
| |
| libs/keywords: Keyword Implementation |
| |
---------------------------------------------------------------------
| |
| libs/COMPONENT_NAME: Basic operations to manipulate each component |
| (volume, replica, workload, node, etc.) |
| |
---------------------------------------------------------------------
```

__* Each layer may only call functions from the same layer or the layer directly below it; skipping layers is strictly forbidden. For example, the Keyword Definition layer may call functions in the Keyword Implementation layer or in the Keyword Definition layer itself, but calling functions in the Basic operations layer directly is strictly forbidden.__
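To make the layering rule concrete, here is a minimal Python sketch of the two lowest layers. This is illustrative only, not actual Longhorn code: the `Volume` and `VolumeKeywords` classes and their methods are hypothetical stand-ins for `libs/COMPONENT_NAME` and `libs/keywords`.

```python
# Hypothetical sketch of the layering rule; names are illustrative,
# not taken from the Longhorn e2e codebase.

class Volume:
    """Layer 4 (libs/COMPONENT_NAME): basic operations on one component."""
    def __init__(self):
        self.volumes = {}

    def create(self, name, size):
        self.volumes[name] = {"size": size, "state": "attached"}

    def get_state(self, name):
        return self.volumes[name]["state"]


class VolumeKeywords:
    """Layer 3 (libs/keywords): keyword implementation, the only layer
    allowed to call the Volume component directly."""
    def __init__(self):
        self.volume = Volume()

    def create_volume(self, name, size="1Gi"):
        self.volume.create(name, size)

    def wait_for_volume_attached(self, name):
        # A real implementation would poll; the sketch checks once.
        assert self.volume.get_state(name) == "attached"
```

A `keywords/*.resource` file would then expose `create_volume` and `wait_for_volume_attached` as Robot keywords, and `tests/*.robot` would use only those keywords, never `Volume` itself.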
17 changes: 0 additions & 17 deletions e2e/doc/condition_table.md

This file was deleted.

53 changes: 0 additions & 53 deletions e2e/keywords/replica.resource

This file was deleted.

4 changes: 0 additions & 4 deletions e2e/keywords/volume.resource
Expand Up @@ -47,10 +47,6 @@ Wait until replica ${replica_0} rebuilt, delete replica ${replica_2}
wait_for_replica_rebuilding_complete ${volume_name} ${replica_0}
delete_replica ${volume_name} ${replica_2}

Wait until replica ${replica_0} rebuilt, delete replica ${replica_2}
wait_for_replica_rebuilding_complete ${volume_name} ${replica_0}
delete_replica ${volume_name} ${replica_2}

Check data is intact
check_data ${volume_name} ${volume_data_checksum}

Expand Down
19 changes: 19 additions & 0 deletions e2e/libs/engine/engine.py
Expand Up @@ -17,3 +17,22 @@ def get_engine(self, volume_name, node_name):
# delete engines, if input parameters are empty then will delete all
def delete_engine(self, volume_name="", node_name=""):
return self.engine.delete_engine(volume_name, node_name)

    def get_engine_state(self, volume_name, node_name):
        logging(f"Getting the state of the volume {volume_name} engine on the node {node_name}")

        resp = self.get_engine(volume_name, node_name)
        if resp is None or resp == "":
            raise Exception(f"failed to get the volume {volume_name} engine")

        engines = resp["items"]
        if len(engines) == 0:
            # use the callable logging helper consistently with the rest of
            # this file, and return an empty dict so callers always get a dict
            logging(f"cannot get the volume {volume_name} engines")
            return {}

        engines_states = {}
        for engine in engines:
            engine_name = engine["metadata"]["name"]
            engine_state = engine["status"]["currentState"]
            engines_states[engine_name] = engine_state
        return engines_states
26 changes: 0 additions & 26 deletions e2e/libs/keywords/engine_keywords.py

This file was deleted.

34 changes: 0 additions & 34 deletions e2e/libs/keywords/pod_keywords.py

This file was deleted.

57 changes: 0 additions & 57 deletions e2e/libs/keywords/replica_keywords.py

This file was deleted.

26 changes: 26 additions & 0 deletions e2e/libs/node/node.py
Expand Up @@ -68,3 +68,29 @@ def reboot_all_worker_nodes(self, shut_down_time_in_sec=60):
waiter = self.aws_client.get_waiter('instance_running')
waiter.wait(InstanceIds=instance_ids)
logging(f"Started instances")

    def get_all_pods_on_node(self, node_name):
        api = client.CoreV1Api()
        all_pods = api.list_namespaced_pod(namespace='longhorn-system', field_selector='spec.nodeName=' + node_name)
        user_pods = [p for p in all_pods.items if (p.metadata.namespace != 'kube-system')]
        return user_pods

    def wait_all_pods_evicted(self, node_name):
        for i in range(RETRY_COUNT):
            pods = self.get_all_pods_on_node(node_name)
            evicted = True
            for pod in pods:
                # check that non-DaemonSet pods are evicted or terminating (deletionTimestamp is set)
                pod_type = pod.metadata.owner_references[0].kind
                pod_delete_timestamp = pod.metadata.deletion_timestamp

                if pod_type != 'DaemonSet' and pod_delete_timestamp is None:
                    evicted = False
                    break

            if evicted:
                break

            time.sleep(RETRY_INTERVAL)

        assert evicted, 'failed to evict pods'
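`wait_all_pods_evicted` follows the poll-until-condition pattern used throughout these libs: retry up to `RETRY_COUNT` times, sleeping `RETRY_INTERVAL` between attempts. A generic sketch of that pattern, with hypothetical names and not part of this commit:

```python
import time

# Illustrative generic form of the RETRY_COUNT/RETRY_INTERVAL loop above.
def wait_for(condition, retry_count=150, retry_interval=1):
    """Poll `condition` until it returns True or retries are exhausted.
    Returns True on success, False on timeout."""
    for _ in range(retry_count):
        if condition():
            return True
        time.sleep(retry_interval)
    return False
```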
2 changes: 0 additions & 2 deletions e2e/libs/utils/__init__.py

This file was deleted.
