diff --git a/inttest/basic/basic_test.go b/inttest/basic/basic_test.go
index e040fe125385..03996e67f479 100644
--- a/inttest/basic/basic_test.go
+++ b/inttest/basic/basic_test.go
@@ -19,7 +19,6 @@ package basic
 import (
 	"bytes"
 	"context"
-	"errors"
 	"fmt"
 	"strings"
 	"testing"
@@ -33,7 +32,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/kubernetes"
 
 	"github.com/BurntSushi/toml"
@@ -233,27 +231,30 @@ func (s *BasicSuite) verifyContainerdDefaultConfig(ctx context.Context) {
 
 func (s *BasicSuite) probeCoreDNSAntiAffinity(ctx context.Context, kc *kubernetes.Clientset) error {
 	// Wait until both CoreDNS Pods got assigned to a node
-	pods := map[string]types.UID{}
+	pods := map[string]string{}
 
 	return watch.Pods(kc.CoreV1().Pods("kube-system")).
 		WithLabels(labels.Set{"k8s-app": "kube-dns"}).
 		WithErrorCallback(common.RetryWatchErrors(s.T().Logf)).
 		Until(ctx, func(pod *corev1.Pod) (bool, error) {
-			// Keep waiting if there's no node assigned yet.
+			// Keep waiting until there's anti-affinity and node assignment
+			if a := pod.Spec.Affinity; a == nil || a.PodAntiAffinity == nil {
+				s.T().Logf("Pod %s doesn't have any pod anti-affinity", pod.Name)
+				return false, nil
+			}
 			nodeName := pod.Spec.NodeName
 			if nodeName == "" {
 				s.T().Logf("Pod %s not scheduled yet: %+v", pod.ObjectMeta.Name, pod.Status)
 				return false, nil
 			}
 
-			uid := pod.GetUID()
-			if prevUID, ok := pods[nodeName]; ok && uid != prevUID {
-				return false, errors.New("multiple CoreDNS pods scheduled on the same node")
+			if prevName, ok := pods[nodeName]; ok && pod.Name != prevName {
+				return false, fmt.Errorf("multiple CoreDNS pods scheduled on node %s: %s and %s", nodeName, prevName, pod.Name)
 			}
 
-			s.T().Logf("Pod %s scheduled on %s", pod.ObjectMeta.Name, pod.Spec.NodeName)
+			s.T().Logf("Pod %s scheduled on %s", pod.Name, pod.Spec.NodeName)
 
-			pods[nodeName] = pod.GetUID()
+			pods[nodeName] = pod.Name
 			return len(pods) > 1, nil
 		})
 }
diff --git a/pkg/component/controller/coredns.go b/pkg/component/controller/coredns.go
index 5ef71a8204c3..c900123ad0a5 100644
--- a/pkg/component/controller/coredns.go
+++ b/pkg/component/controller/coredns.go
@@ -30,6 +30,7 @@ import (
 	"github.com/sirupsen/logrus"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
+	"k8s.io/utils/pointer"
 
 	"github.com/k0sproject/k0s/internal/pkg/dir"
 	"github.com/k0sproject/k0s/internal/pkg/templatewriter"
@@ -120,11 +121,13 @@ metadata:
     k8s-app: kube-dns
     kubernetes.io/name: "CoreDNS"
 spec:
-  replicas: {{ .Replicas}}
+  replicas: {{ .Replicas }}
   strategy:
     type: RollingUpdate
+    {{- if .MaxUnavailableReplicas }}
     rollingUpdate:
-      maxUnavailable: 1
+      maxUnavailable: {{ .MaxUnavailableReplicas }}
+    {{- end }}
   selector:
     matchLabels:
       k8s-app: kube-dns
@@ -145,6 +148,7 @@ spec:
         effect: "NoSchedule"
       nodeSelector:
         kubernetes.io/os: linux
+      {{- if not .DisablePodAntiAffinity }}
      # Require running coredns replicas on different nodes
       affinity:
         podAntiAffinity:
@@ -155,6 +159,7 @@ spec:
            - key: k8s-app
              operator: In
              values: ['kube-dns']
+      {{- end }}
       containers:
       - name: coredns
         image: {{ .Image }}
@@ -203,7 +208,7 @@ spec:
             path: /ready
             port: 8181
             scheme: HTTP
-          initialDelaySeconds: 0
+          initialDelaySeconds: 30 # give loop plugin time to detect loops
           periodSeconds: 2
           timeoutSeconds: 1
           successThreshold: 1
@@ -216,6 +221,22 @@ spec:
           items:
             - key: Corefile
              path: Corefile
+{{- if not .DisablePodDisruptionBudget }}
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: coredns
+  namespace: kube-system
+  labels:
+    k8s-app: kube-dns
+    kubernetes.io/name: "CoreDNS"
+spec:
+  minAvailable: 50%
+  selector:
+    matchLabels:
+      k8s-app: kube-dns
+{{- end }}
 ---
 apiVersion: v1
 kind: Service
@@ -264,11 +285,14 @@ type CoreDNS struct {
 }
 
 type coreDNSConfig struct {
-	Replicas      int
-	ClusterDNSIP  string
-	ClusterDomain string
-	Image         string
-	PullPolicy    string
+	Replicas                   int
+	ClusterDNSIP               string
+	ClusterDomain              string
+	Image                      string
+	PullPolicy                 string
+	MaxUnavailableReplicas     *uint
+	DisablePodAntiAffinity     bool
+	DisablePodDisruptionBudget bool
 }
 
 // NewCoreDNS creates new instance of CoreDNS component
@@ -339,16 +363,43 @@ func (c *CoreDNS) getConfig(ctx context.Context, clusterConfig *v1beta1.ClusterC
 	}
 
 	nodeCount := len(nodes.Items)
-	replicas := replicaCount(nodeCount)
 
 	config := coreDNSConfig{
-		Replicas:      replicas,
+		Replicas:      replicaCount(nodeCount),
 		ClusterDomain: nodeConfig.Spec.Network.ClusterDomain,
 		ClusterDNSIP:  dns,
 		Image:         clusterConfig.Spec.Images.CoreDNS.URI(),
 		PullPolicy:    clusterConfig.Spec.Images.DefaultPullPolicy,
 	}
 
+	if config.Replicas <= 1 {
+
+		// No pod anti-affinity for single replica deployments, so that CoreDNS
+		// can be rolled on a single node without downtime, as the single
+		// replica can remain operational until the new replica can take over.
+		config.DisablePodAntiAffinity = true
+
+		// No PodDisruptionBudget for single replica deployments, so that
+		// CoreDNS may be drained from the node running the single replica. This
+		// might leave CoreDNS eligible for eviction due to node pressure, but
+		// such clusters aren't HA in the first place and it seems more
+		// desirable to not block a drain.
+		config.DisablePodDisruptionBudget = true
+
+	} else if config.Replicas == 2 || config.Replicas == 3 {
+
+		// Set maxUnavailable=1 only for deployments with two or three replicas.
+		// Use the Kubernetes defaults (maxUnavailable=25%, rounded down to get
+		// absolute values) for all other cases. For single replica deployments,
+		// maxUnavailable=1 would mean that the deployment's available condition
+		// would be true even with zero ready replicas. For deployments with 4
+		// to 7 replicas, it would be the same as the default, and for
+		// deployments with 8 or more replicas, this would artificially
+		// constrain the rolling update speed.
+		config.MaxUnavailableReplicas = pointer.Uint(1)
+
+	}
+
 	return config, nil
 }
 
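
Note on the maxUnavailable handling in getConfig: Kubernetes' built-in default for a Deployment rolling update is maxUnavailable: 25%, rounded down to an absolute number. The standalone sketch below is not part of the patch (the helper name defaultMaxUnavailable is made up for illustration); it tabulates that default per replica count to show why the patch pins maxUnavailable=1 only for two or three replicas.

package main

import (
	"fmt"
	"math"
)

// defaultMaxUnavailable mimics the Kubernetes default for Deployment rolling
// updates: maxUnavailable is 25% of the replica count, rounded down.
func defaultMaxUnavailable(replicas int) int {
	return int(math.Floor(float64(replicas) * 0.25))
}

func main() {
	for replicas := 1; replicas <= 9; replicas++ {
		fmt.Printf("replicas=%d  default maxUnavailable=%d\n",
			replicas, defaultMaxUnavailable(replicas))
	}
	// Replica counts 2 and 3 yield 0 with the default, which is what the
	// patch overrides; 4-7 yield 1 (same as the override), 8+ yield 2 or
	// more, so the override would only slow those rollouts down.
}

For 2 or 3 replicas the default rounds down to 0, so pinning maxUnavailable to 1 lets the rollout replace a pod in place rather than relying solely on surge capacity; from 4 replicas upward the default already evaluates to 1 or more.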
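
Similarly, for the minAvailable: 50% PodDisruptionBudget added to the template: a percentage-based minAvailable is rounded up, so a single replica would always be required to stay healthy and a drain could never evict it, which is why the patch sets DisablePodDisruptionBudget for single-replica deployments. A rough standalone illustration follows (allowedDisruptions is a hypothetical helper, not part of the patch or of client-go):

package main

import (
	"fmt"
	"math"
)

// allowedDisruptions estimates how many voluntary disruptions a PDB with
// minAvailable: 50% permits: the percentage is rounded up to an absolute
// number of pods that must remain healthy.
func allowedDisruptions(replicas int) int {
	minAvailable := int(math.Ceil(float64(replicas) * 0.5))
	return replicas - minAvailable
}

func main() {
	for replicas := 1; replicas <= 5; replicas++ {
		fmt.Printf("replicas=%d  allowed disruptions=%d\n",
			replicas, allowedDisruptions(replicas))
	}
	// A single replica allows zero voluntary disruptions, i.e. the PDB would
	// block a node drain, matching the DisablePodDisruptionBudget rationale
	// in the patch comments above.
}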