From 4a50683ad28c0b5cfd787825ae9e8a41625a43d8 Mon Sep 17 00:00:00 2001
From: Tom Wieczorek
Date: Thu, 19 Oct 2023 12:31:01 +0200
Subject: [PATCH] Add a PodDisruptionBudget for CoreDNS

As CoreDNS is a rather critical component in Kubernetes clusters, k0s
should ensure that it remains available, even during node maintenance
or other disruptions. A PodDisruptionBudget helps to maintain a minimum
level of availability.

Some additional tweaks and their considerations:

* Set maxUnavailable=1 only for deployments with two or three replicas.
  Use the Kubernetes defaults (maxUnavailable=25%, rounded down to get
  absolute values) for all other cases. For single replica deployments,
  maxUnavailable=1 meant that the deployment's available condition
  would be true even with zero ready replicas. For deployments with 4
  to 7 replicas, this has been the same as the default, and for
  deployments with 8 or more replicas, this has been artificially
  constraining the rolling update speed.
* No pod anti-affinity for single replica deployments, so that CoreDNS
  can be rolled on a single node without downtime, as the single
  replica can remain operational until the new replica can take over.
* No PDB for single node clusters, so that CoreDNS may be drained from
  the single node. This might leave CoreDNS eligible for eviction due
  to node pressure, but such clusters aren't HA in the first place, and
  it seems more desirable not to block a drain.

Changes to the basic integration test:

* Use strings as identifiers when detecting multiple CoreDNS pods on
  nodes. This makes for more readable log/error messages.
* Ignore pods without pod anti-affinity, as they are remnants when
  scaling up single node clusters.

Signed-off-by: Tom Wieczorek
---
 inttest/basic/basic_test.go         | 19 ++++++++++---------
 pkg/component/controller/coredns.go | 24 ++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 11 deletions(-)
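Reviewer note (below the diffstat, not applied by git am): a quick
sketch of the rolling update math behind the first bullet point above.
The Kubernetes default for maxUnavailable is 25%, rounded down to an
absolute value; the replica counts below are illustrative only and not
part of this patch.

    package main

    import "fmt"

    func main() {
    	// Default maxUnavailable (25%, rounded down) vs. a hardcoded 1:
    	for _, replicas := range []int{1, 2, 3, 4, 7, 8, 12} {
    		defaultMax := replicas * 25 / 100 // integer division rounds down
    		fmt.Printf("replicas=%2d: default maxUnavailable=%d\n", replicas, defaultMax)
    	}
    	// replicas 2..3 -> default would be 0, so the explicit 1 is kept
    	// replicas 4..7 -> default is 1, same as the old hardcoded value
    	// replicas >= 8 -> default is 2+, so hardcoding 1 slowed rollouts
    }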
diff --git a/inttest/basic/basic_test.go b/inttest/basic/basic_test.go
index e040fe125385..03996e67f479 100644
--- a/inttest/basic/basic_test.go
+++ b/inttest/basic/basic_test.go
@@ -19,7 +19,6 @@ package basic
 import (
 	"bytes"
 	"context"
-	"errors"
 	"fmt"
 	"strings"
 	"testing"
@@ -33,7 +32,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
 
 	"github.com/BurntSushi/toml"
@@ -233,27 +231,30 @@ func (s *BasicSuite) verifyContainerdDefaultConfig(ctx context.Context) {
 
 func (s *BasicSuite) probeCoreDNSAntiAffinity(ctx context.Context, kc *kubernetes.Clientset) error {
 	// Wait until both CoreDNS Pods got assigned to a node
-	pods := map[string]types.UID{}
+	pods := map[string]string{}
 
 	return watch.Pods(kc.CoreV1().Pods("kube-system")).
 		WithLabels(labels.Set{"k8s-app": "kube-dns"}).
 		WithErrorCallback(common.RetryWatchErrors(s.T().Logf)).
 		Until(ctx, func(pod *corev1.Pod) (bool, error) {
-			// Keep waiting if there's no node assigned yet.
+			// Keep waiting until there's anti-affinity and node assignment
+			if a := pod.Spec.Affinity; a == nil || a.PodAntiAffinity == nil {
+				s.T().Logf("Pod %s doesn't have any pod anti-affinity", pod.Name)
+				return false, nil
+			}
 			nodeName := pod.Spec.NodeName
 			if nodeName == "" {
 				s.T().Logf("Pod %s not scheduled yet: %+v", pod.ObjectMeta.Name, pod.Status)
 				return false, nil
 			}
 
-			uid := pod.GetUID()
-			if prevUID, ok := pods[nodeName]; ok && uid != prevUID {
-				return false, errors.New("multiple CoreDNS pods scheduled on the same node")
+			if prevName, ok := pods[nodeName]; ok && pod.Name != prevName {
+				return false, fmt.Errorf("multiple CoreDNS pods scheduled on node %s: %s and %s", nodeName, prevName, pod.Name)
 			}
 
-			s.T().Logf("Pod %s scheduled on %s", pod.ObjectMeta.Name, pod.Spec.NodeName)
+			s.T().Logf("Pod %s scheduled on %s", pod.Name, pod.Spec.NodeName)
 
-			pods[nodeName] = pod.GetUID()
+			pods[nodeName] = pod.Name
 			return len(pods) > 1, nil
 		})
 }
diff --git a/pkg/component/controller/coredns.go b/pkg/component/controller/coredns.go
index 5ef71a8204c3..5d1e45526b22 100644
--- a/pkg/component/controller/coredns.go
+++ b/pkg/component/controller/coredns.go
@@ -120,11 +120,13 @@ metadata:
     k8s-app: kube-dns
     kubernetes.io/name: "CoreDNS"
 spec:
-  replicas: {{ .Replicas}}
+  replicas: {{ .Replicas }}
   strategy:
     type: RollingUpdate
+    {{- if or (eq .Replicas 2) (eq .Replicas 3) }}
     rollingUpdate:
       maxUnavailable: 1
+    {{- end }}
   selector:
     matchLabels:
       k8s-app: kube-dns
@@ -145,6 +147,7 @@ spec:
         effect: "NoSchedule"
       nodeSelector:
         kubernetes.io/os: linux
+      {{- if gt .Replicas 1 }}
       # Require running coredns replicas on different nodes
       affinity:
         podAntiAffinity:
@@ -155,6 +158,7 @@ spec:
               - key: k8s-app
                 operator: In
                 values: ['kube-dns']
+      {{- end }}
       containers:
       - name: coredns
         image: {{ .Image }}
@@ -203,7 +207,7 @@ spec:
             path: /ready
             port: 8181
             scheme: HTTP
-          initialDelaySeconds: 0
+          initialDelaySeconds: 30 # give loop plugin time to detect loops
           periodSeconds: 2
           timeoutSeconds: 1
           successThreshold: 1
@@ -216,6 +220,22 @@ spec:
             items:
               - key: Corefile
                 path: Corefile
+{{- if gt .Replicas 1 }}
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: coredns
+  namespace: kube-system
+  labels:
+    k8s-app: kube-dns
+    kubernetes.io/name: "CoreDNS"
+spec:
+  minAvailable: 50%
+  selector:
+    matchLabels:
+      k8s-app: kube-dns
+{{- end }}
 ---
 apiVersion: v1
 kind: Service
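
Reviewer note (not part of the patch): a rough sketch of how
minAvailable: 50% translates into allowed voluntary disruptions.
Kubernetes rounds a percentage-based minAvailable up to a whole number
of pods; only the 50% figure comes from this patch, the replica counts
below are illustrative.

    package main

    import (
    	"fmt"
    	"math"
    )

    func main() {
    	for _, replicas := range []int{2, 3, 4, 5} {
    		// minAvailable: 50%, rounded up to a whole number of pods
    		minAvailable := int(math.Ceil(float64(replicas) * 0.5))
    		// Evictions allowed at a time, assuming all replicas are healthy
    		fmt.Printf("replicas=%d: minAvailable=%d, evictions allowed=%d\n",
    			replicas, minAvailable, replicas-minAvailable)
    	}
    	// With the default two replicas, minAvailable=1: at most one pod
    	// can be evicted at a time, in line with maxUnavailable=1 above.
    }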