From 6e17cb0fd453ff687fc7f8c999f8042072e93a40 Mon Sep 17 00:00:00 2001
From: Ilya Mashchenko
Date: Mon, 28 Oct 2024 18:59:41 +0200
Subject: [PATCH] improvement(go.d/k8sstate): collect pod status reason (#18887)

---
 .../plugin/go.d/modules/k8s_state/charts.go   | 65 ++++++++++--------
 .../plugin/go.d/modules/k8s_state/collect.go  | 62 +++++++++++++----
 .../go.d/modules/k8s_state/kube_state_test.go | 66 +++++++++++++------
 .../go.d/modules/k8s_state/metadata.yaml      | 25 +++++--
 src/go/plugin/go.d/modules/k8s_state/state.go | 16 ++---
 .../modules/k8s_state/update_node_state.go    |  9 +--
 .../modules/k8s_state/update_pod_state.go     |  1 +
 7 files changed, 159 insertions(+), 85 deletions(-)

diff --git a/src/go/plugin/go.d/modules/k8s_state/charts.go b/src/go/plugin/go.d/modules/k8s_state/charts.go
index e89e0698412aea..cfc7e027fa11d2 100644
--- a/src/go/plugin/go.d/modules/k8s_state/charts.go
+++ b/src/go/plugin/go.d/modules/k8s_state/charts.go
@@ -43,6 +43,7 @@ const (
     prioPodMemLimitsUsed
     prioPodCondition
     prioPodPhase
+    prioPodStatusReason
     prioPodAge
     prioPodContainersCount
     prioPodContainersState
@@ -106,6 +107,7 @@ var podChartsTmpl = module.Charts{
     podMemLimitsUsedChartTmpl.Copy(),
     podConditionChartTmpl.Copy(),
     podPhaseChartTmpl.Copy(),
+    podStatusReasonChartTmpl.Copy(),
     podAgeChartTmpl.Copy(),
     podContainersCountChartTmpl.Copy(),
     podContainersStateChartTmpl.Copy(),
@@ -247,15 +249,24 @@ var (
         },
     }
     // condition
-    nodeConditionsChartTmpl = module.Chart{
-        IDSep:    true,
-        ID:       "node_%s.condition_status",
-        Title:    "Condition status",
-        Units:    "status",
-        Fam:      "node condition",
-        Ctx:      "k8s_state.node_condition",
-        Priority: prioNodeConditions,
-    }
+    nodeConditionsChartTmpl = func() module.Chart {
+        chart := module.Chart{
+            IDSep:    true,
+            ID:       "node_%s.condition_status",
+            Title:    "Condition status",
+            Units:    "status",
+            Fam:      "node condition",
+            Ctx:      "k8s_state.node_condition",
+            Priority: prioNodeConditions,
+        }
+        for _, v := range nodeConditionStatuses {
+            chart.Dims = append(chart.Dims, &module.Dim{
+                ID:   "node_%s_cond_" + v,
+                Name: v,
+            })
+        }
+        return chart
+    }()
     nodeSchedulabilityChartTmpl = module.Chart{
         IDSep: true,
         ID:    "node_%s.schedulability",
@@ -426,24 +437,6 @@ func (ks *KubeState) removeNodeCharts(ns *nodeState) {
     }
 }
 
-func (ks *KubeState) addNodeConditionToCharts(ns *nodeState, cond string) {
-    id := fmt.Sprintf(nodeConditionsChartTmpl.ID, replaceDots(ns.id()))
-    c := ks.Charts().Get(id)
-    if c == nil {
-        ks.Warningf("chart '%s' does not exist", id)
-        return
-    }
-    dim := &module.Dim{
-        ID:   fmt.Sprintf("node_%s_cond_%s", ns.id(), strings.ToLower(cond)),
-        Name: cond,
-    }
-    if err := c.AddDim(dim); err != nil {
-        ks.Warning(err)
-        return
-    }
-    c.MarkNotCreated()
-}
-
 var (
     podCPURequestsUsedChartTmpl = module.Chart{
         IDSep:    true,
@@ -523,6 +516,24 @@ var (
             {ID: "pod_%s_phase_pending", Name: "pending"},
         },
     }
+    podStatusReasonChartTmpl = func() module.Chart {
+        chart := module.Chart{
+            IDSep:    true,
+            ID:       "pod_%s.status_reason",
+            Title:    "Status reason",
+            Units:    "status",
+            Fam:      "pod status",
+            Ctx:      "k8s_state.pod_status_reason",
+            Priority: prioPodStatusReason,
+        }
+        for _, v := range podStatusReasons {
+            chart.Dims = append(chart.Dims, &module.Dim{
+                ID:   "pod_%s_status_reason_" + v,
+                Name: v,
+            })
+        }
+        return chart
+    }()
     podAgeChartTmpl = module.Chart{
         IDSep: true,
         ID:    "pod_%s.age",
diff --git a/src/go/plugin/go.d/modules/k8s_state/collect.go b/src/go/plugin/go.d/modules/k8s_state/collect.go
index 1c971e96ab9448..12eb07ab81a018 100644
--- a/src/go/plugin/go.d/modules/k8s_state/collect.go
+++ b/src/go/plugin/go.d/modules/k8s_state/collect.go
@@ -6,7 +6,6 @@ import (
     "errors"
     "fmt"
     "slices"
-    "strings"
     "time"
 
     "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
@@ -17,28 +16,47 @@ import (
 const precision = 1000
 
 var (
+    podStatusReasons = []string{
+        "Evicted",
+        "NodeAffinity",
+        "NodeLost",
+        "Shutdown",
+        "UnexpectedAdmissionError",
+        "Other",
+    }
+
     containerWaitingStateReasons = []string{
-        "PodInitializing",
         "ContainerCreating",
         "CrashLoopBackOff",
         "CreateContainerConfigError",
+        "CreateContainerError",
         "ErrImagePull",
         "ImagePullBackOff",
-        "CreateContainerError",
         "InvalidImageName",
+        "PodInitializing",
         "Other",
     }
     containerTerminatedStateReasons = []string{
-        "OOMKilled",
         "Completed",
-        "Error",
         "ContainerCannotRun",
         "DeadlineExceeded",
+        "Error",
         "Evicted",
+        "OOMKilled",
         "Other",
     }
 )
 
+var (
+    nodeConditionStatuses = []string{
+        "Ready",
+        "DiskPressure",
+        "MemoryPressure",
+        "NetworkUnavailable",
+        "PIDPressure",
+    }
+)
+
 func (ks *KubeState) collect() (map[string]int64, error) {
     if ks.discoverer == nil {
         return nil, errors.New("nil discoverer")
@@ -56,6 +74,7 @@ func (ks *KubeState) collect() (map[string]int64, error) {
 
     ks.kubeClusterID = ks.getKubeClusterID()
     ks.kubeClusterName = ks.getKubeClusterName()
+
     if chart := ks.Charts().Get(discoveryStatusChart.ID); chart != nil {
         chart.Labels = []module.Label{
             {Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
@@ -92,7 +111,7 @@ func (ks *KubeState) collectKubeState(mx map[string]int64) {
 func (ks *KubeState) collectPodsState(mx map[string]int64) {
     now := time.Now()
     for _, ps := range ks.state.pods {
-        // Skip cronjobs (each of them is a unique container because name contains hash)
+        // Skip cronjobs (each of them is a unique container because the name contains hash)
         // to avoid overwhelming Netdata with high cardinality metrics.
         // Related issue https://github.com/netdata/netdata/issues/16412
         if ps.controllerKind == "Job" {
@@ -104,6 +123,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
             ks.removePodCharts(ps)
             continue
         }
+
         if ps.new {
             ps.new = false
             ks.addPodCharts(ps)
@@ -130,12 +150,14 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
             ns.stats.podsPhaseRunning += boolToInt(ps.phase == corev1.PodRunning)
             ns.stats.podsPhaseSucceeded += boolToInt(ps.phase == corev1.PodSucceeded)
             ns.stats.podsPhaseFailed += boolToInt(ps.phase == corev1.PodFailed)
+
             for _, cs := range ps.initContainers {
                 ns.stats.initContainers++
                 ns.stats.initContStateRunning += boolToInt(cs.stateRunning)
                 ns.stats.initContStateWaiting += boolToInt(cs.stateWaiting)
                 ns.stats.initContStateTerminated += boolToInt(cs.stateTerminated)
             }
+
             for _, cs := range ps.containers {
                 ns.stats.containers++
                 ns.stats.contStateRunning += boolToInt(cs.stateRunning)
@@ -155,6 +177,17 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
         mx[px+"phase_succeeded"] = boolToInt(ps.phase == corev1.PodSucceeded)
         mx[px+"phase_pending"] = boolToInt(ps.phase == corev1.PodPending)
         mx[px+"age"] = int64(now.Sub(ps.creationTime).Seconds())
+
+        for _, v := range podStatusReasons {
+            mx[px+"status_reason_"+v] = 0
+        }
+        if v := ps.statusReason; v != "" {
+            if !slices.Contains(podStatusReasons, v) {
+                v = "Other"
+            }
+            mx[px+"status_reason_"+v] = 1
+        }
+
         mx[px+"cpu_requests_used"] = ps.reqCPU
         mx[px+"cpu_limits_used"] = ps.limitCPU
         mx[px+"mem_requests_used"] = ps.reqMem
@@ -166,6 +199,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
         mx[px+"init_containers_state_running"] = 0
         mx[px+"init_containers_state_waiting"] = 0
         mx[px+"init_containers_state_terminated"] = 0
+
         for _, cs := range ps.initContainers {
             mx[px+"init_containers_state_running"] += boolToInt(cs.stateRunning)
             mx[px+"init_containers_state_waiting"] += boolToInt(cs.stateWaiting)
@@ -174,6 +208,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
         mx[px+"containers_state_running"] = 0
         mx[px+"containers_state_waiting"] = 0
         mx[px+"containers_state_terminated"] = 0
+
         for _, cs := range ps.containers {
             if cs.new {
                 cs.new = false
@@ -194,7 +229,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
                 mx[ppx+"state_waiting_reason_"+v] = 0
             }
             if v := cs.waitingReason; v != "" {
-                if !slices.Contains(containerWaitingStateReasons, cs.waitingReason) {
+                if !slices.Contains(containerWaitingStateReasons, v) {
                     v = "Other"
                 }
                 mx[ppx+"state_waiting_reason_"+v] = 1
@@ -204,7 +239,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
                 mx[ppx+"state_terminated_reason_"+v] = 0
             }
             if v := cs.terminatedReason; v != "" {
-                if !slices.Contains(containerTerminatedStateReasons, cs.terminatedReason) {
+                if !slices.Contains(containerTerminatedStateReasons, v) {
                     v = "Other"
                 }
                 mx[ppx+"state_terminated_reason_"+v] = 1
@@ -228,12 +263,11 @@ func (ks *KubeState) collectNodesState(mx map[string]int64) {
 
         px := fmt.Sprintf("node_%s_", ns.id())
 
-        for typ, cond := range ns.conditions {
-            if cond.new {
-                cond.new = false
-                ks.addNodeConditionToCharts(ns, typ)
-            }
-            mx[px+"cond_"+strings.ToLower(typ)] = condStatusToInt(cond.status)
+        for _, v := range nodeConditionStatuses {
+            mx[px+"cond_"+v] = 0
+        }
+        for _, v := range ns.conditions {
+            mx[px+"cond_"+string(v.Type)] = condStatusToInt(v.Status)
         }
 
         mx[px+"age"] = int64(now.Sub(ns.creationTime).Seconds())
diff --git a/src/go/plugin/go.d/modules/k8s_state/kube_state_test.go b/src/go/plugin/go.d/modules/k8s_state/kube_state_test.go
index c8e5599f0acdbc..be7e42723f1f1b 100644
--- a/src/go/plugin/go.d/modules/k8s_state/kube_state_test.go
+++ b/src/go/plugin/go.d/modules/k8s_state/kube_state_test.go
@@ -213,11 +213,11 @@ func TestKubeState_Collect(t *testing.T) {
             "node_node01_alloc_pods_allocated": 0,
             "node_node01_alloc_pods_available": 110,
             "node_node01_alloc_pods_util": 0,
-            "node_node01_cond_diskpressure": 0,
-            "node_node01_cond_memorypressure": 0,
-            "node_node01_cond_networkunavailable": 0,
-            "node_node01_cond_pidpressure": 0,
-            "node_node01_cond_ready": 1,
+            "node_node01_cond_DiskPressure": 0,
+            "node_node01_cond_MemoryPressure": 0,
+            "node_node01_cond_NetworkUnavailable": 0,
+            "node_node01_cond_PIDPressure": 0,
+            "node_node01_cond_Ready": 1,
             "node_node01_schedulability_schedulable": 1,
             "node_node01_schedulability_unschedulable": 0,
             "node_node01_containers": 0,
@@ -240,6 +240,7 @@ func TestKubeState_Collect(t *testing.T) {
             "node_node01_pods_readiness_ready": 0,
             "node_node01_pods_readiness_unready": 0,
         }
+
         copyAge(expected, mx)
 
         assert.Equal(t, expected, mx)
@@ -331,6 +332,12 @@ func TestKubeState_Collect(t *testing.T) {
             "pod_default_pod01_phase_pending": 0,
             "pod_default_pod01_phase_running": 1,
             "pod_default_pod01_phase_succeeded": 0,
+            "pod_default_pod01_status_reason_Evicted": 0,
+            "pod_default_pod01_status_reason_NodeAffinity": 0,
+            "pod_default_pod01_status_reason_NodeLost": 0,
+            "pod_default_pod01_status_reason_Other": 0,
+            "pod_default_pod01_status_reason_Shutdown": 0,
+            "pod_default_pod01_status_reason_UnexpectedAdmissionError": 0,
         }
 
         copyAge(expected, mx)
@@ -375,11 +382,11 @@ func TestKubeState_Collect(t *testing.T) {
             "node_node01_alloc_pods_allocated": 1,
             "node_node01_alloc_pods_available": 109,
             "node_node01_alloc_pods_util": 909,
-            "node_node01_cond_diskpressure": 0,
-            "node_node01_cond_memorypressure": 0,
-            "node_node01_cond_networkunavailable": 0,
-            "node_node01_cond_pidpressure": 0,
-            "node_node01_cond_ready": 1,
+            "node_node01_cond_DiskPressure": 0,
+            "node_node01_cond_MemoryPressure": 0,
+            "node_node01_cond_NetworkUnavailable": 0,
+            "node_node01_cond_PIDPressure": 0,
+            "node_node01_cond_Ready": 1,
             "node_node01_containers": 2,
             "node_node01_containers_state_running": 2,
             "node_node01_containers_state_terminated": 0,
@@ -464,6 +471,12 @@ func TestKubeState_Collect(t *testing.T) {
             "pod_default_pod01_phase_pending": 0,
             "pod_default_pod01_phase_running": 1,
             "pod_default_pod01_phase_succeeded": 0,
+            "pod_default_pod01_status_reason_Evicted": 0,
+            "pod_default_pod01_status_reason_NodeAffinity": 0,
+            "pod_default_pod01_status_reason_NodeLost": 0,
+            "pod_default_pod01_status_reason_Other": 0,
+            "pod_default_pod01_status_reason_Shutdown": 0,
+            "pod_default_pod01_status_reason_UnexpectedAdmissionError": 0,
         }
 
         copyAge(expected, mx)
@@ -513,11 +526,11 @@ func TestKubeState_Collect(t *testing.T) {
             "node_node01_alloc_pods_allocated": 0,
             "node_node01_alloc_pods_available": 110,
             "node_node01_alloc_pods_util": 0,
-            "node_node01_cond_diskpressure": 0,
-            "node_node01_cond_memorypressure": 0,
-            "node_node01_cond_networkunavailable": 0,
-            "node_node01_cond_pidpressure": 0,
-            "node_node01_cond_ready": 1,
+            "node_node01_cond_DiskPressure": 0,
+            "node_node01_cond_MemoryPressure": 0,
+            "node_node01_cond_NetworkUnavailable": 0,
+            "node_node01_cond_PIDPressure": 0,
+            "node_node01_cond_Ready": 1,
             "node_node01_schedulability_schedulable": 1,
             "node_node01_schedulability_unschedulable": 0,
             "node_node01_containers": 0,
@@ -632,11 +645,11 @@ func TestKubeState_Collect(t *testing.T) {
             "node_node01_alloc_pods_allocated": 2,
"node_node01_alloc_pods_available": 108, "node_node01_alloc_pods_util": 1818, - "node_node01_cond_diskpressure": 0, - "node_node01_cond_memorypressure": 0, - "node_node01_cond_networkunavailable": 0, - "node_node01_cond_pidpressure": 0, - "node_node01_cond_ready": 1, + "node_node01_cond_DiskPressure": 0, + "node_node01_cond_MemoryPressure": 0, + "node_node01_cond_NetworkUnavailable": 0, + "node_node01_cond_PIDPressure": 0, + "node_node01_cond_Ready": 1, "node_node01_containers": 4, "node_node01_containers_state_running": 4, "node_node01_containers_state_terminated": 0, @@ -721,6 +734,12 @@ func TestKubeState_Collect(t *testing.T) { "pod_default_pod01_phase_pending": 0, "pod_default_pod01_phase_running": 1, "pod_default_pod01_phase_succeeded": 0, + "pod_default_pod01_status_reason_Evicted": 0, + "pod_default_pod01_status_reason_NodeAffinity": 0, + "pod_default_pod01_status_reason_NodeLost": 0, + "pod_default_pod01_status_reason_Other": 0, + "pod_default_pod01_status_reason_Shutdown": 0, + "pod_default_pod01_status_reason_UnexpectedAdmissionError": 0, "pod_default_pod02_age": 4, "pod_default_pod02_cond_containersready": 1, "pod_default_pod02_cond_podinitialized": 1, @@ -784,7 +803,14 @@ func TestKubeState_Collect(t *testing.T) { "pod_default_pod02_phase_pending": 0, "pod_default_pod02_phase_running": 1, "pod_default_pod02_phase_succeeded": 0, + "pod_default_pod02_status_reason_Evicted": 0, + "pod_default_pod02_status_reason_NodeAffinity": 0, + "pod_default_pod02_status_reason_NodeLost": 0, + "pod_default_pod02_status_reason_Other": 0, + "pod_default_pod02_status_reason_Shutdown": 0, + "pod_default_pod02_status_reason_UnexpectedAdmissionError": 0, } + copyAge(expected, mx) assert.Equal(t, expected, mx) diff --git a/src/go/plugin/go.d/modules/k8s_state/metadata.yaml b/src/go/plugin/go.d/modules/k8s_state/metadata.yaml index 4b860eb57cc409..aa247a8f99313d 100644 --- a/src/go/plugin/go.d/modules/k8s_state/metadata.yaml +++ b/src/go/plugin/go.d/modules/k8s_state/metadata.yaml @@ -141,7 +141,11 @@ modules: unit: status chart_type: line dimensions: - - name: a dimension per condition + - name: Ready + - name: DiskPressure + - name: MemoryPressure + - name: NetworkUnavailable + - name: PIDPressure - name: k8s_state.node_schedulability description: Schedulability unit: state @@ -271,6 +275,17 @@ modules: - name: failed - name: succeeded - name: pending + - name: k8s_state.pod_status_reason + description: Status reason + unit: status + chart_type: line + dimensions: + - name: Evicted + - name: NodeAffinity + - name: NodeLost + - name: Shutdown + - name: UnexpectedAdmissionError + - name: Other - name: k8s_state.pod_age description: Age unit: seconds @@ -347,24 +362,24 @@ modules: unit: state chart_type: line dimensions: - - name: PodInitializing - name: ContainerCreating - name: CrashLoopBackOff - name: CreateContainerConfigError + - name: CreateContainerError - name: ErrImagePull - name: ImagePullBackOff - - name: CreateContainerError - name: InvalidImageName + - name: PodInitializing - name: Other - name: k8s_state.pod_container_terminated_state_reason description: Container terminated state reason unit: state chart_type: line dimensions: - - name: OOMKilled - name: Completed - - name: Error - name: ContainerCannotRun - name: DeadlineExceeded + - name: Error - name: Evicted + - name: OOMKilled - name: Other diff --git a/src/go/plugin/go.d/modules/k8s_state/state.go b/src/go/plugin/go.d/modules/k8s_state/state.go index 5ebc0950898e42..61566a5f1750b9 100644 --- 
a/src/go/plugin/go.d/modules/k8s_state/state.go +++ b/src/go/plugin/go.d/modules/k8s_state/state.go @@ -19,9 +19,8 @@ func newKubeState() *kubeState { func newNodeState() *nodeState { return &nodeState{ - new: true, - labels: make(map[string]string), - conditions: make(map[string]*nodeStateCondition), + new: true, + labels: make(map[string]string), } } @@ -58,16 +57,10 @@ type ( allocatableCPU int64 allocatableMem int64 allocatablePods int64 - conditions map[string]*nodeStateCondition + conditions []corev1.NodeCondition stats nodeStateStats } - nodeStateCondition struct { - new bool - // https://kubernetes.io/docs/concepts/architecture/nodes/#condition - //typ corev1.NodeConditionType - status corev1.ConditionStatus - } nodeStateStats struct { reqCPU int64 limitCPU int64 @@ -127,7 +120,8 @@ type ( condPodInitialized corev1.ConditionStatus condPodReady corev1.ConditionStatus // https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase - phase corev1.PodPhase + phase corev1.PodPhase + statusReason string initContainers map[string]*containerState containers map[string]*containerState diff --git a/src/go/plugin/go.d/modules/k8s_state/update_node_state.go b/src/go/plugin/go.d/modules/k8s_state/update_node_state.go index 80f5c26c841445..57a43ab666a7a3 100644 --- a/src/go/plugin/go.d/modules/k8s_state/update_node_state.go +++ b/src/go/plugin/go.d/modules/k8s_state/update_node_state.go @@ -36,12 +36,5 @@ func (ks *KubeState) updateNodeState(r resource) { } ns.unSchedulable = node.Spec.Unschedulable - - for _, c := range node.Status.Conditions { - if v, ok := ns.conditions[string(c.Type)]; !ok { - ns.conditions[string(c.Type)] = &nodeStateCondition{new: true, status: c.Status} - } else { - v.status = c.Status - } - } + ns.conditions = node.Status.Conditions } diff --git a/src/go/plugin/go.d/modules/k8s_state/update_pod_state.go b/src/go/plugin/go.d/modules/k8s_state/update_pod_state.go index 3977a4f792e030..eafe0aaecf46cd 100644 --- a/src/go/plugin/go.d/modules/k8s_state/update_pod_state.go +++ b/src/go/plugin/go.d/modules/k8s_state/update_pod_state.go @@ -78,6 +78,7 @@ func (ks *KubeState) updatePodState(r resource) { } ps.phase = pod.Status.Phase + ps.statusReason = pod.Status.Reason for _, cntr := range pod.Status.ContainerStatuses { cs, ok := ps.containers[cntr.Name]