Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

config-daemon: Restart all instances of device-plugin #783

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 27 additions & 26 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ func New(
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
mu: &sync.Mutex{},
}
}

Expand Down Expand Up @@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {

var timeout int64 = 5
var metadataKey = "metadata.name"
dn.mu = &sync.Mutex{}
informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
time.Second*15,
vars.Namespace,
Expand Down Expand Up @@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
defer dn.mu.Unlock()
log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")

var podToDelete string
pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=" + vars.NodeName,
Expand All @@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return nil
}
podToDelete = pods.Items[0].Name

log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
return nil
}
if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
for _, pod := range pods.Items {
podToDelete := pod.Name
log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
log.Log.Info("restartDevicePluginPod(): pod to delete not found")
return nil
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
return err
}

if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
if errors.IsNotFound(err) {
log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
return true, nil
}

if err != nil {
log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
} else {
log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}
return false, nil
}, dn.stopCh); err != nil {
log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion")
return err
}

return nil
Expand Down
61 changes: 47 additions & 14 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
)

var SriovDevicePluginPod corev1.Pod

func TestConfigDaemon(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Config Daemon Suite")
Expand Down Expand Up @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() {
},
}

SriovDevicePluginPod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}

err = sriovnetworkv1.AddToScheme(scheme.Scheme)
Expect(err).ToNot(HaveOccurred())
kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{
Expand All @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() {
Namespace: vars.Namespace,
}}).Build()

kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod)
kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs)
snclient := snclientset.NewSimpleClientset()
err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() {
err := sut.Run(stopCh, exitCh)
Expect(err).ToNot(HaveOccurred())
}()

SriovDevicePluginPod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "sriov-device-plugin-xxxx",
Namespace: vars.Namespace,
Labels: map[string]string{
"app": "sriov-device-plugin",
},
},
Spec: corev1.PodSpec{
NodeName: "test-node",
},
}
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

})

AfterEach(func() {
Expand Down Expand Up @@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() {

Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777))
})

It("restart all the sriov-device-plugin pods present on the node", func() {
otherPod1 := SriovDevicePluginPod.DeepCopy()
otherPod1.Name = "sriov-device-plugin-xxxa"
_, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

otherPod2 := SriovDevicePluginPod.DeepCopy()
otherPod2.Name = "sriov-device-plugin-xxxz"
_, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{})
Expect(err).ToNot(HaveOccurred())

err = sut.restartDevicePluginPod()
Expect(err).ToNot(HaveOccurred())

Eventually(func() (int, error) {
podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
LabelSelector: "app=sriov-device-plugin",
FieldSelector: "spec.nodeName=test-node",
})

if err != nil {
return 0, err
}

return len(podList.Items), nil
}, "1s").Should(BeZero())
})
})
})

Expand Down
Loading