From 398a4c8f8862fe2f4adca2c192d1134a3d76fd4e Mon Sep 17 00:00:00 2001
From: Francois Allard
Date: Wed, 9 Jun 2021 16:54:57 -0400
Subject: [PATCH] Add support for GKE preemptible nodes

Context:
When a GKE node gets preempted, pods that were on the node remain there
and will run again when the node comes back online. It is also possible
for the daemonset pods' status to be ready even when the node is not yet
ready (possibly a stale cache).

Solution:
- Added an option to configure the effect to set for the taint
- Taint the node when the pod is ready but the node is not
---
 README.md              | 43 ++++++++++++++++++++++++++++-------------
 pkg/nidhogg/handler.go | 39 +++++++++++++++++++++++++++++--------
 2 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index fa486a05..6bddfa2e 100644
--- a/README.md
+++ b/README.md
@@ -11,18 +11,20 @@ Nidhogg was built using [Kubebuilder](https://github.com/kubernetes-sigs/kubebui
 ## Usage
 
 Nidhogg requires a yaml/json config file to tell it what Daemonsets to watch and what nodes to act on.
-`nodeSelector` is a map of keys/values corresponding to node labels. `daemonsets` is an array of Daemonsets to watch, each containing two fields `name` and `namespace`. Nodes are tainted with taint that follows the format of `nidhogg.uswitch.com/namespace.name:NoSchedule`.
+`nodeSelector` is a map of keys/values corresponding to node labels. `daemonsets` is an array of Daemonsets to watch, each containing two mandatory fields, `name` and `namespace`, and one optional field, `effect` (set to `NoSchedule` by default). Nodes are tainted with a taint that follows the format `nidhogg.uswitch.com/{namespace}.{name}:{effect}`.
 
 Example:
 
 YAML:
+
 ```yaml
-nodeSelector:
-  node-role.kubernetes.io/node
+nodeSelector: node-role.kubernetes.io/node
 daemonsets:
 - name: kiam
-  namespace: kube-system
+  namespace: kube-system
+  effect: NoSchedule # default value, can be omitted
 ```
+
 JSON:
 
 ```json
 {
   "nodeSelector": {
     "node-role.kubernetes.io/node": ""
   },
   "daemonsets": [
     {
       "name": "kiam",
       "namespace": "kube-system"
     }
   ]
 }
 ```
@@ -40,9 +42,10 @@
-This example will select any nodes in AWS ASGs named "standard" or "special" that have the label
-`node-role.kubernetes.io/node` present, and no nodes with label `node-role.kubernetes.io/master`. If the matching nodes
-do not have a running and ready pod from the `kiam` daemonset in the `kube-system` namespace. It will add a taint of
+
+This example will select any nodes in AWS ASGs named "standard" or "special" that have the label
+`node-role.kubernetes.io/node` present, and no nodes with the label `node-role.kubernetes.io/master`. If the matching nodes
+do not have a running and ready pod from the `kiam` daemonset in the `kube-system` namespace, it will add a taint of
 `nidhogg.uswitch.com/kube-system.kiam:NoSchedule` until there is a ready kiam pod on the node.
 
 If you want pods to be able to run on the nidhogg tainted nodes you can add a toleration:
 
@@ -50,17 +53,19 @@ If you want pods to be able to run on the nidhogg tainted nodes you can add a to
 ```yaml
 spec:
   tolerations:
-  - key: nidhogg.uswitch.com/kube-system.kiam
-    operator: "Exists"
-    effect: NoSchedule
+    - key: nidhogg.uswitch.com/kube-system.kiam
+      operator: "Exists"
+      effect: NoSchedule
 ```
 
 ## Deploying
+
 Docker images can be found at https://quay.io/uswitch/nidhogg
 
-Example [Kustomize](https://github.com/kubernetes-sigs/kustomize) manifests can be found [here](/config) to quickly deploy this to a cluster.
+Example [Kustomize](https://github.com/kubernetes-sigs/kustomize) manifests can be found [here](/config) to quickly deploy this to a cluster.
 
 ## Flags
+
 ```
 -config-file string
     Path to config file (default "config.json")
@@ -77,3 +82,19 @@ Example [Kustomize](https://github.com/kubernetes-sigs/kustomize) manifests can
 -metrics-addr string
     The address the metric endpoint binds to. (default ":8080")
 ```
+
+## GKE Preemptible Nodes Support
+
+If you are running Nidhogg on GKE with preemptible nodes, make sure to set `effect: NoExecute` for the daemonsets. This is required because when a node is preempted and comes back online, the pods that were on it are already scheduled there, so an `effect: NoSchedule` taint would not prevent pods without a toleration from running.
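+
+For example, to protect preemptible nodes you could configure the `kiam` daemonset from the usage example above like this (a sketch; adjust the names to your setup):
+
+```yaml
+nodeSelector: node-role.kubernetes.io/node
+daemonsets:
+- name: kiam
+  namespace: kube-system
+  effect: NoExecute
+```
+
+Note that unlike `NoSchedule`, a `NoExecute` taint also evicts pods that are already running on the node unless they tolerate it, so add an `effect: NoExecute` toleration to any pod that should keep running on tainted nodes.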
diff --git a/pkg/nidhogg/handler.go b/pkg/nidhogg/handler.go
index 746774bc..1fe5aee9 100644
--- a/pkg/nidhogg/handler.go
+++ b/pkg/nidhogg/handler.go
@@ -79,10 +79,11 @@ func (hc *HandlerConfig) BuildSelectors() error {
 	return nil
 }
 
-// Daemonset contains the name and namespace of a Daemonset
+// Daemonset contains the name and namespace of a Daemonset, as well as the taint effect to apply for it.
 type Daemonset struct {
-	Name      string `json:"name" yaml:"name"`
-	Namespace string `json:"namespace" yaml:"namespace"`
+	Name      string             `json:"name" yaml:"name"`
+	Namespace string             `json:"namespace" yaml:"namespace"`
+	Effect    corev1.TaintEffect `json:"effect,omitempty" yaml:"effect,omitempty"`
 }
 
 type taintChanges struct {
@@ -159,7 +160,7 @@ func (h *Handler) HandleNode(instance *corev1.Node) (reconcile.Result, error) {
 }
 
 func (h *Handler) calculateTaints(instance *corev1.Node) (*corev1.Node, taintChanges, error) {
-
+	log := logf.Log.WithName("nidhogg")
 	nodeCopy := instance.DeepCopy()
 
 	var changes taintChanges
@@ -181,20 +182,22 @@ func (h *Handler) calculateTaints(instance *corev1.Node) (*corev1.Node, taintCha
 			return nil, taintChanges{}, fmt.Errorf("error fetching pods: %v", err)
 		}
 
-		if pod != nil && podReady(pod) {
+		if pod != nil && podReady(pod) && nodeReady(nodeCopy) {
 			// if the taint is in the taintsToRemove map, it'll be removed
 			continue
 		}
+		log.Info("Pod does not exist or is not ready, or node is not ready")
 		// pod doesn't exist or is not ready
 		_, ok := taintsToRemove[taint]
 		if ok {
 			// we want to keep this already existing taint on it
+			log.Info(fmt.Sprintf("Keeping taint %s", taint))
 			delete(taintsToRemove, taint)
 			continue
 		}
 		// taint is not already present, adding it
 		changes.taintsAdded = append(changes.taintsAdded, taint)
-		nodeCopy.Spec.Taints = addTaint(nodeCopy.Spec.Taints, taint)
+		nodeCopy.Spec.Taints = addTaint(nodeCopy.Spec.Taints, taint, daemonset.Effect)
 	}
 	for taint := range taintsToRemove {
 		nodeCopy.Spec.Taints = removeTaint(nodeCopy.Spec.Taints, taint)
@@ -225,16 +228,36 @@ func (h *Handler) getDaemonsetPod(nodeName string, ds Daemonset) (*corev1.Pod, e
 }
 
 func podReady(pod *corev1.Pod) bool {
+	log := logf.Log.WithName("nidhogg")
 	for _, status := range pod.Status.ContainerStatuses {
 		if status.Ready == false {
+			log.Info(fmt.Sprintf("Pod %s is not ready", pod.Name))
 			return false
 		}
 	}
+	log.Info(fmt.Sprintf("Pod %s is ready", pod.Name))
 	return true
 }
 
-func addTaint(taints []corev1.Taint, taintName string) []corev1.Taint {
-	return append(taints, corev1.Taint{Key: taintName, Effect: corev1.TaintEffectNoSchedule})
+func nodeReady(node *corev1.Node) bool {
+	log := logf.Log.WithName("nidhogg")
+	for _, condition := range node.Status.Conditions {
+		// Treat both False and Unknown as not ready; a preempted node stops reporting its status.
+		if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
+			log.Info(fmt.Sprintf("Node %s is not ready", node.Name))
+			return false
+		}
+	}
+	log.Info(fmt.Sprintf("Node %s is ready", node.Name))
+	return true
+}
+
+func addTaint(taints []corev1.Taint, taintName string, effect corev1.TaintEffect) []corev1.Taint {
+	// Default to NoSchedule to preserve the previous behaviour when no effect is configured.
+	if effect == "" {
+		effect = corev1.TaintEffectNoSchedule
+	}
+	return append(taints, corev1.Taint{Key: taintName, Effect: effect})
 }
 
 func removeTaint(taints []corev1.Taint, taintName string) []corev1.Taint {