test(negative): implement force drain node
Signed-off-by: Yang Chiu <[email protected]>
yangchiu committed May 3, 2024
1 parent 6e3e381 commit 98f3e2c
Showing 9 changed files with 163 additions and 34 deletions.
23 changes: 23 additions & 0 deletions e2e/keywords/k8s.resource
@@ -30,3 +30,26 @@ Delete volume of ${workload_kind} ${workload_id} replica node

Add deleted node back
    reboot_node_by_name    ${deleted_node}

Force drain volume of ${workload_kind} ${workload_id} volume node
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    # for the volume-node variant, the drained node and the last volume node are the same node
    ${drained_node} =    get_volume_node    ${volume_name}
    ${last_volume_node} =    get_volume_node    ${volume_name}
    force_drain_node    ${drained_node}
    wait_for_all_pods_evicted    ${drained_node}
    # promote to test scope so later keywords (e.g. Uncordon the drained node) can reference them
    Set Test Variable    ${drained_node}
    Set Test Variable    ${last_volume_node}

Force drain volume of ${workload_kind} ${workload_id} replica node
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    ${drained_node} =    get_replica_node    ${volume_name}
    ${last_volume_node} =    get_volume_node    ${volume_name}
    force_drain_node    ${drained_node}
    wait_for_all_pods_evicted    ${drained_node}
    Set Test Variable    ${drained_node}
    Set Test Variable    ${last_volume_node}

Uncordon the drained node
    uncordon_node    ${drained_node}
16 changes: 15 additions & 1 deletion e2e/keywords/workload.resource
@@ -46,7 +46,7 @@ Wait for volume of ${workload_kind} ${workload_id} healthy
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    wait_for_workload_volume_healthy    ${workload_name}

-Wait until volume of ${workload_kind} ${workload_id} replica rebuidling started on ${replica_locality}
+Wait until volume of ${workload_kind} ${workload_id} replica rebuilding started on ${replica_locality}
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    wait_for_replica_rebuilding_to_start_on_node    ${volume_name}    ${replica_locality}
@@ -66,6 +66,20 @@ Wait for volume of ${workload_kind} ${workload_id} attached and healthy
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    wait_for_volume_healthy    ${volume_name}

Wait for volume of ${workload_kind} ${workload_id} attached to the original node and degraded
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    wait_for_volume_degraded    ${volume_name}
    ${volume_node} =    get_volume_node    ${volume_name}
    Should Be Equal    ${last_volume_node}    ${volume_node}

Wait for volume of ${workload_kind} ${workload_id} attached to another node and degraded
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
    wait_for_volume_degraded    ${volume_name}
    ${volume_node} =    get_volume_node    ${volume_name}
    Should Not Be Equal    ${last_volume_node}    ${volume_node}

Delete replica of ${workload_kind} ${workload_id} volume on ${replica_locality}
    ${workload_name} =    generate_name_with_suffix    ${workload_kind}    ${workload_id}
    ${volume_name} =    get_workload_volume_name    ${workload_name}
51 changes: 48 additions & 3 deletions e2e/libs/k8s/k8s.py
@@ -1,12 +1,14 @@
import time
import subprocess
import asyncio
from kubernetes import client
from workload.pod import create_pod
from workload.pod import delete_pod
from workload.pod import new_pod_manifest
from workload.constant import IMAGE_UBUNTU

from utility.utility import subprocess_exec_cmd
from utility.utility import logging
from utility.utility import get_retry_count_and_interval

async def restart_kubelet(node_name, downtime_in_sec=10):
    manifest = new_pod_manifest(
@@ -24,5 +26,48 @@ async def restart_kubelet(node_name, downtime_in_sec=10):

def delete_node(node_name):
    exec_cmd = ["kubectl", "delete", "node", node_name]
-    res = subprocess.check_output(exec_cmd)
-    logging(f"Executed command {exec_cmd} with result {res}")
+    res = subprocess_exec_cmd(exec_cmd)

def drain_node(node_name):
    exec_cmd = ["kubectl", "drain", node_name, "--ignore-daemonsets", "--delete-emptydir-data"]
    res = subprocess_exec_cmd(exec_cmd)

def force_drain_node(node_name):
    exec_cmd = ["kubectl", "drain", node_name, "--force", "--ignore-daemonsets", "--delete-emptydir-data"]
    res = subprocess_exec_cmd(exec_cmd)

def cordon_node(node_name):
    exec_cmd = ["kubectl", "cordon", node_name]
    res = subprocess_exec_cmd(exec_cmd)

def uncordon_node(node_name):
    exec_cmd = ["kubectl", "uncordon", node_name]
    res = subprocess_exec_cmd(exec_cmd)

def get_all_pods_on_node(node_name):
    api = client.CoreV1Api()
    all_pods = api.list_namespaced_pod(namespace='longhorn-system', field_selector='spec.nodeName=' + node_name)
    user_pods = [p for p in all_pods.items if p.metadata.namespace != 'kube-system']
    return user_pods

def wait_all_pods_evicted(node_name):
    retry_count, retry_interval = get_retry_count_and_interval()
    for i in range(retry_count):
        pods = get_all_pods_on_node(node_name)
        logging(f"Waiting for pods evicted from {node_name} ... ({i})")
        evicted = True
        for pod in pods:
            # non-DaemonSet, non-BackingImageManager pods must be evicted or terminating (deletion_timestamp set)
            pod_type = pod.metadata.owner_references[0].kind
            pod_delete_timestamp = pod.metadata.deletion_timestamp

            if pod_type != 'DaemonSet' and pod_type != 'BackingImageManager' and pod_delete_timestamp is None:
                evicted = False
                break

        if evicted:
            break

        time.sleep(retry_interval)

    assert evicted, 'failed to evict pods'
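
Taken together, the new helpers implement the drain/restore cycle that the Robot keywords drive. A minimal usage sketch, assuming the e2e/libs modules are on the path; the node name is hypothetical:

    # Sketch only: drain a node, wait for eviction, then restore it.
    from k8s.k8s import force_drain_node, wait_all_pods_evicted, uncordon_node

    node = "worker-1"            # hypothetical node name
    force_drain_node(node)       # kubectl drain --force --ignore-daemonsets --delete-emptydir-data
    wait_all_pods_evicted(node)  # poll until non-DaemonSet pods are gone or terminating
    uncordon_node(node)          # make the node schedulable again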
15 changes: 15 additions & 0 deletions e2e/libs/keywords/k8s_keywords.py
@@ -2,6 +2,9 @@
from robot.libraries.BuiltIn import BuiltIn
from k8s.k8s import restart_kubelet
from k8s.k8s import delete_node
from k8s.k8s import drain_node, force_drain_node
from k8s.k8s import cordon_node, uncordon_node
from k8s.k8s import wait_all_pods_evicted
from utility.utility import logging


@@ -37,3 +40,15 @@ def delete_replica_node(self, volume_name):
        replica_node = volume_keywords.get_replica_node(volume_name)
        delete_node(replica_node)
        return replica_node

    def drain_node(self, node_name):
        drain_node(node_name)

    def force_drain_node(self, node_name):
        force_drain_node(node_name)

    def uncordon_node(self, node_name):
        uncordon_node(node_name)

    def wait_for_all_pods_evicted(self, node_name):
        wait_all_pods_evicted(node_name)
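
These one-line methods exist to surface the module-level functions in k8s.k8s as Robot keywords on the library class. Robot Framework matches keyword names case- and underscore-insensitively, which is why the resource files can call them verbatim, as k8s.resource does above:

    force_drain_node    ${drained_node}
    wait_for_all_pods_evicted    ${drained_node}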
27 changes: 0 additions & 27 deletions e2e/libs/node/node.py
@@ -49,33 +49,6 @@ def reset_disks(self, node_name):
logging(f"Try to remove disk {disk_name} from node {node_name}")
self.update_disks(node_name, disks)

    def get_all_pods_on_node(self, node_name):
        api = client.CoreV1Api()
        all_pods = api.list_namespaced_pod(namespace='longhorn-system', field_selector='spec.nodeName=' + node_name)
        user_pods = [p for p in all_pods.items if (p.metadata.namespace != 'kube-system')]
        return user_pods

    def wait_all_pods_evicted(self, node_name):
        retry_count, retry_interval = get_retry_count_and_interval()
        for _ in range(retry_count):
            pods = self.get_all_pods_on_node(node_name)
            evicted = True
            for pod in pods:
                # check non DaemonSet Pods are evicted or terminating (deletionTimestamp != None)
                pod_type = pod.metadata.owner_references[0].kind
                pod_delete_timestamp = pod.metadata.deletion_timestamp

                if pod_type != 'DaemonSet' and pod_delete_timestamp == None:
                    evicted = False
                    break

            if evicted:
                break

            time.sleep(retry_interval)

        assert evicted, 'failed to evict pods'

    def is_accessing_node_by_index(self, node):
        p = re.compile('node (\d)')
        if m := p.match(node):
7 changes: 7 additions & 0 deletions e2e/libs/utility/utility.py
@@ -7,6 +7,7 @@
import signal
from robot.api import logger
from robot.libraries.BuiltIn import BuiltIn
import subprocess

from longhorn import from_env

@@ -79,6 +80,12 @@ def get_backupstore():
    return os.environ.get('LONGHORN_BACKUPSTORE', "")


def subprocess_exec_cmd(cmd):
    res = subprocess.check_output(cmd)
    logging(f"Executed command {cmd} with result {res}")
    return res
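
Since subprocess.check_output raises subprocess.CalledProcessError on a nonzero exit and returns stdout as bytes, callers of subprocess_exec_cmd get logging plus failure propagation for free. A small sketch; the kubectl invocation is only illustrative:

    from utility.utility import subprocess_exec_cmd

    # Raises subprocess.CalledProcessError if kubectl exits nonzero.
    out = subprocess_exec_cmd(["kubectl", "get", "nodes", "-o", "name"])
    print(out.decode())  # check_output returns bytes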


def wait_for_cluster_ready():
    core_api = client.CoreV1Api()
    retry_count, retry_interval = get_retry_count_and_interval()
4 changes: 2 additions & 2 deletions e2e/tests/node_delete.robot
@@ -26,7 +26,7 @@ Delete Volume Node While Replica Rebuilding

    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
        When Delete replica of deployment 0 volume on volume node
-        And Wait until volume of deployment 0 replica rebuidling started on volume node
+        And Wait until volume of deployment 0 replica rebuilding started on volume node
        And Delete volume of deployment 0 volume node

        Then Wait for volume of deployment 0 attached and unknown
@@ -44,7 +44,7 @@ Delete Replica Node While Replica Rebuilding

    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
        When Delete replica of deployment 0 volume on replica node
-        And Wait until volume of deployment 0 replica rebuidling started on replica node
+        And Wait until volume of deployment 0 replica rebuilding started on replica node
        And Delete volume of deployment 0 replica node

        Then Wait for volume of deployment 0 attached and degraded
53 changes: 53 additions & 0 deletions e2e/tests/node_drain.robot
@@ -0,0 +1,53 @@
*** Settings ***
Documentation    Negative Test Cases
Resource    ../keywords/common.resource
Resource    ../keywords/persistentvolumeclaim.resource
Resource    ../keywords/k8s.resource
Resource    ../keywords/deployment.resource
Resource    ../keywords/workload.resource

Test Setup    Set test environment
Test Teardown    Cleanup test resources

*** Variables ***
${LOOP_COUNT}    1
${RETRY_COUNT}    300
${RETRY_INTERVAL}    1

*** Test Cases ***
Force Drain Volume Node While Replica Rebuilding
    Given Create persistentvolumeclaim 0 using RWO volume
    And Create deployment 0 with persistentvolumeclaim 0
    And Wait for volume of deployment 0 healthy
    And Write 2048 MB data to file data.txt in deployment 0

    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
        When Delete replica of deployment 0 volume on volume node
        And Wait until volume of deployment 0 replica rebuilding started on volume node
        And Force drain volume of deployment 0 volume node

        Then Wait for volume of deployment 0 attached to another node and degraded
        And Uncordon the drained node
        And Wait for volume of deployment 0 attached and healthy
        And Wait for deployment 0 pods stable
        And Check deployment 0 data in file data.txt is intact
    END

Force Drain Replica Node While Replica Rebuilding
    Given Create persistentvolumeclaim 0 using RWO volume
    And Create deployment 0 with persistentvolumeclaim 0
    And Wait for volume of deployment 0 healthy
    And Write 2048 MB data to file data.txt in deployment 0

    FOR    ${i}    IN RANGE    ${LOOP_COUNT}
        When Delete replica of deployment 0 volume on replica node
        And Wait until volume of deployment 0 replica rebuilding started on replica node
        And Force drain volume of deployment 0 replica node

        Then Wait for volume of deployment 0 attached to the original node and degraded
        And Uncordon the drained node
        And Wait for volume of deployment 0 attached and healthy
        And Wait for deployment 0 pods stable
        And Check deployment 0 data in file data.txt is intact
    END
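
The two cases assert different attachment outcomes, which follows from what a force drain evicts (assuming default Longhorn behavior): draining the volume node evicts the workload pod itself, so the deployment reschedules and the volume reattaches on another node; draining a replica node leaves the workload pod in place, so the volume stays attached to its original node and merely degrades until the lost replica is rebuilt.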
1 change: 0 additions & 1 deletion e2e/tests/replica_rebuilding.robot
@@ -3,7 +3,6 @@ Documentation Negative Test Cases
Resource    ../keywords/common.resource
Resource    ../keywords/host.resource
Resource    ../keywords/persistentvolumeclaim.resource
Resource    ../keywords/volume.resource

Test Setup    Set test environment
