Add a PodDisruptionBudget for CoreDNS
As CoreDNS is a rather critical component in Kubernetes clusters, k0s
should ensure it remains available, even during node maintenance or
other disruptions. A PodDisruptionBudget helps to maintain a minimum
level of availability.

Some additional tweaks and their considerations:

* No pod anti-affinity for single replica deployments, so that CoreDNS
  can be rolled on a single node without downtime, as the single replica
  can remain operational until the new replica can take over.
* No PodDisruptionBudget for single replica deployments, so that CoreDNS
  may be drained from the node running the single replica. This might
  leave CoreDNS eligible for eviction due to node pressure, but such
  clusters aren't HA in the first place and it seems more desirable to
  not block a drain.
* Set maxUnavailable=1 only for deployments with two or three replicas.
  Use the Kubernetes defaults (maxUnavailable=25%, rounded down to get
  absolute values) for all other cases; the worked numbers after this
  list illustrate the cut-offs. For single replica deployments,
  maxUnavailable=1 meant that the deployment's available condition would
  be true even with zero ready replicas. For deployments with 4 to 7
  replicas, it was the same as the default, and for deployments with 8
  or more replicas, it artificially constrained the rolling update
  speed.
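
For reference, the Kubernetes default (maxUnavailable=25%, rounded down)
works out to the following absolute values:

    1 replica:  floor(1 * 0.25) = 0
    2 replicas: floor(2 * 0.25) = 0
    3 replicas: floor(3 * 0.25) = 0
    4 replicas: floor(4 * 0.25) = 1
    7 replicas: floor(7 * 0.25) = 1
    8 replicas: floor(8 * 0.25) = 2

So a fixed maxUnavailable=1 matches the default for 4 to 7 replicas and
is stricter than the default for 8 or more. For 2 or 3 replicas the
default computes to 0; combined with the required pod anti-affinity this
could leave a rolling update with no node to place the surge pod on,
which is presumably why maxUnavailable=1 is set explicitly there.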

Basic integration test:

* Use strings as identifiers when detecting multiple CoreDNS pods on
  nodes. This makes for more readable log/error messages.
* Ignore pods without pod anti-affinity, as they are remnants left over
  when scaling up single-node clusters.

Signed-off-by: Tom Wieczorek <[email protected]>
twz123 committed Oct 26, 2023
1 parent 9c1a0c7 commit c802ffd
Showing 2 changed files with 71 additions and 19 deletions.
19 changes: 10 additions & 9 deletions inttest/basic/basic_test.go
@@ -19,7 +19,6 @@ package basic
import (
"bytes"
"context"
"errors"
"fmt"
"strings"
"testing"
@@ -33,7 +32,6 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"

"github.com/BurntSushi/toml"
@@ -233,27 +231,30 @@ func (s *BasicSuite) verifyContainerdDefaultConfig(ctx context.Context) {

func (s *BasicSuite) probeCoreDNSAntiAffinity(ctx context.Context, kc *kubernetes.Clientset) error {
// Wait until both CoreDNS Pods got assigned to a node
pods := map[string]types.UID{}
pods := map[string]string{}

return watch.Pods(kc.CoreV1().Pods("kube-system")).
WithLabels(labels.Set{"k8s-app": "kube-dns"}).
WithErrorCallback(common.RetryWatchErrors(s.T().Logf)).
Until(ctx, func(pod *corev1.Pod) (bool, error) {
// Keep waiting if there's no node assigned yet.
// Keep waiting until there's anti-affinity and node assignment
if a := pod.Spec.Affinity; a == nil || a.PodAntiAffinity == nil {
s.T().Logf("Pod %s doesn't have any pod anti-affinity", pod.Name)
return false, nil
}
nodeName := pod.Spec.NodeName
if nodeName == "" {
s.T().Logf("Pod %s not scheduled yet: %+v", pod.ObjectMeta.Name, pod.Status)
return false, nil
}

uid := pod.GetUID()
if prevUID, ok := pods[nodeName]; ok && uid != prevUID {
return false, errors.New("multiple CoreDNS pods scheduled on the same node")
if prevName, ok := pods[nodeName]; ok && pod.Name != prevName {
return false, fmt.Errorf("multiple CoreDNS pods scheduled on node %s: %s and %s", nodeName, prevName, pod.Name)
}

s.T().Logf("Pod %s scheduled on %s", pod.ObjectMeta.Name, pod.Spec.NodeName)
s.T().Logf("Pod %s scheduled on %s", pod.Name, pod.Spec.NodeName)

pods[nodeName] = pod.GetUID()
pods[nodeName] = pod.Name
return len(pods) > 1, nil
})
}
71 changes: 61 additions & 10 deletions pkg/component/controller/coredns.go
@@ -30,6 +30,7 @@ import (
"github.com/sirupsen/logrus"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/utils/pointer"

"github.com/k0sproject/k0s/internal/pkg/dir"
"github.com/k0sproject/k0s/internal/pkg/templatewriter"
@@ -120,11 +121,13 @@ metadata:
k8s-app: kube-dns
kubernetes.io/name: "CoreDNS"
spec:
replicas: {{ .Replicas}}
replicas: {{ .Replicas }}
strategy:
type: RollingUpdate
{{- if .MaxUnavailableReplicas }}
rollingUpdate:
maxUnavailable: 1
maxUnavailable: {{ .MaxUnavailableReplicas }}
{{- end }}
selector:
matchLabels:
k8s-app: kube-dns
@@ -145,6 +148,7 @@ spec:
effect: "NoSchedule"
nodeSelector:
kubernetes.io/os: linux
{{- if not .DisablePodAntiAffinity }}
# Require running coredns replicas on different nodes
affinity:
podAntiAffinity:
@@ -155,6 +159,7 @@ spec:
- key: k8s-app
operator: In
values: ['kube-dns']
{{- end }}
containers:
- name: coredns
image: {{ .Image }}
@@ -203,7 +208,7 @@ spec:
path: /ready
port: 8181
scheme: HTTP
initialDelaySeconds: 0
initialDelaySeconds: 30 # give loop plugin time to detect loops
periodSeconds: 2
timeoutSeconds: 1
successThreshold: 1
@@ -216,6 +221,22 @@ spec:
items:
- key: Corefile
path: Corefile
{{- if not .DisablePodDisruptionBudget }}
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: "CoreDNS"
spec:
minAvailable: 50%
selector:
matchLabels:
k8s-app: kube-dns
{{- end }}
---
apiVersion: v1
kind: Service
@@ -264,11 +285,14 @@ type CoreDNS struct {
}

type coreDNSConfig struct {
Replicas int
ClusterDNSIP string
ClusterDomain string
Image string
PullPolicy string
Replicas int
ClusterDNSIP string
ClusterDomain string
Image string
PullPolicy string
MaxUnavailableReplicas *uint
DisablePodAntiAffinity bool
DisablePodDisruptionBudget bool
}

// NewCoreDNS creates new instance of CoreDNS component
@@ -339,16 +363,43 @@ func (c *CoreDNS) getConfig(ctx context.Context, clusterConfig *v1beta1.ClusterC
}

nodeCount := len(nodes.Items)
replicas := replicaCount(nodeCount)

config := coreDNSConfig{
Replicas: replicas,
Replicas: replicaCount(nodeCount),
ClusterDomain: nodeConfig.Spec.Network.ClusterDomain,
ClusterDNSIP: dns,
Image: clusterConfig.Spec.Images.CoreDNS.URI(),
PullPolicy: clusterConfig.Spec.Images.DefaultPullPolicy,
}

if config.Replicas <= 1 {

// No pod anti-affinity for single replica deployments, so that CoreDNS
// can be rolled on a single node without downtime, as the single
// replica can remain operational until the new replica can take over.
config.DisablePodAntiAffinity = true

// No PodDisruptionBudget for single replica deployments, so that
// CoreDNS may be drained from the node running the single replica. This
// might leave CoreDNS eligible for eviction due to node pressure, but
// such clusters aren't HA in the first place and it seems more
// desirable to not block a drain.
config.DisablePodDisruptionBudget = true

} else if config.Replicas == 2 || config.Replicas == 3 {

// Set maxUnavailable=1 only for deployments with two or three replicas.
// Use the Kubernetes defaults (maxUnavailable=25%, rounded down to get
// absolute values) for all other cases. For single replica deployments,
// maxUnavailable=1 would mean that the deployment's available condition
// would be true even with zero ready replicas. For deployments with 4
// to 7 replicas, it would be the same as the default, and for
// deployments with 8 or more replicas, this would artificially
// constrain the rolling update speed.
config.MaxUnavailableReplicas = pointer.Uint(1)

}

return config, nil
}

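A note on the new PodDisruptionBudget: Kubernetes rounds a percentage
minAvailable up when converting it to an absolute pod count, so with the
two CoreDNS replicas used in the basic integration test minAvailable: 50%
works out to ceil(2 * 0.5) = 1, and with three replicas to
ceil(3 * 0.5) = 2. In both cases a node drain may voluntarily evict at
most one CoreDNS pod at a time while the remaining replica(s) keep
serving DNS.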
