improvement(go.d/k8sstate): collect pod status reason (netdata#18887)
ilyam8 authored Oct 28, 2024
1 parent e465363 commit 6e17cb0
Showing 7 changed files with 159 additions and 85 deletions.
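The pattern underpinning this change is fixed-cardinality dimensions: rather than adding chart dimensions dynamically as new reason strings appear, the collector pre-declares a closed set of reasons and folds anything unrecognized into "Other". A minimal, self-contained sketch of that bucketing idea — the reason list is taken from the diff below; the function and program around it are illustrative only:

    package main

    import (
        "fmt"
        "slices"
    )

    // bucketReason folds an arbitrary status reason into a fixed, known set so
    // chart dimensions never grow unbounded; unknown values map to "Other".
    func bucketReason(reason string, known []string) string {
        if reason != "" && !slices.Contains(known, reason) {
            return "Other"
        }
        return reason
    }

    func main() {
        known := []string{"Evicted", "NodeAffinity", "NodeLost", "Shutdown", "UnexpectedAdmissionError", "Other"}
        fmt.Println(bucketReason("Evicted", known))          // Evicted
        fmt.Println(bucketReason("SomeFutureReason", known)) // Other
    }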
65 changes: 38 additions & 27 deletions src/go/plugin/go.d/modules/k8s_state/charts.go
@@ -43,6 +43,7 @@ const (
prioPodMemLimitsUsed
prioPodCondition
prioPodPhase
+	prioPodStatusReason
prioPodAge
prioPodContainersCount
prioPodContainersState
@@ -106,6 +107,7 @@ var podChartsTmpl = module.Charts{
podMemLimitsUsedChartTmpl.Copy(),
podConditionChartTmpl.Copy(),
podPhaseChartTmpl.Copy(),
+	podStatusReasonChartTmpl.Copy(),
podAgeChartTmpl.Copy(),
podContainersCountChartTmpl.Copy(),
podContainersStateChartTmpl.Copy(),
@@ -247,15 +249,24 @@ var (
},
}
// condition
-	nodeConditionsChartTmpl = module.Chart{
-		IDSep:    true,
-		ID:       "node_%s.condition_status",
-		Title:    "Condition status",
-		Units:    "status",
-		Fam:      "node condition",
-		Ctx:      "k8s_state.node_condition",
-		Priority: prioNodeConditions,
-	}
+	nodeConditionsChartTmpl = func() module.Chart {
+		chart := module.Chart{
+			IDSep:    true,
+			ID:       "node_%s.condition_status",
+			Title:    "Condition status",
+			Units:    "status",
+			Fam:      "node condition",
+			Ctx:      "k8s_state.node_condition",
+			Priority: prioNodeConditions,
+		}
+		for _, v := range nodeConditionStatuses {
+			chart.Dims = append(chart.Dims, &module.Dim{
+				ID:   "node_%s_cond_" + v,
+				Name: v,
+			})
+		}
+		return chart
+	}()
nodeSchedulabilityChartTmpl = module.Chart{
IDSep: true,
ID: "node_%s.schedulability",
@@ -426,24 +437,6 @@ func (ks *KubeState) removeNodeCharts(ns *nodeState) {
}
}

-func (ks *KubeState) addNodeConditionToCharts(ns *nodeState, cond string) {
-	id := fmt.Sprintf(nodeConditionsChartTmpl.ID, replaceDots(ns.id()))
-	c := ks.Charts().Get(id)
-	if c == nil {
-		ks.Warningf("chart '%s' does not exist", id)
-		return
-	}
-	dim := &module.Dim{
-		ID:   fmt.Sprintf("node_%s_cond_%s", ns.id(), strings.ToLower(cond)),
-		Name: cond,
-	}
-	if err := c.AddDim(dim); err != nil {
-		ks.Warning(err)
-		return
-	}
-	c.MarkNotCreated()
-}

var (
podCPURequestsUsedChartTmpl = module.Chart{
IDSep: true,
@@ -523,6 +516,24 @@ var (
{ID: "pod_%s_phase_pending", Name: "pending"},
},
}
+	podStatusReasonChartTmpl = func() module.Chart {
+		chart := module.Chart{
+			IDSep:    true,
+			ID:       "pod_%s.status_reason",
+			Title:    "Status reason",
+			Units:    "status",
+			Fam:      "pod status",
+			Ctx:      "k8s_state.pod_status_reason",
+			Priority: prioPodStatusReason,
+		}
+		for _, v := range podStatusReasons {
+			chart.Dims = append(chart.Dims, &module.Dim{
+				ID:   "pod_%s_status_reason_" + v,
+				Name: v,
+			})
+		}
+		return chart
+	}()
podAgeChartTmpl = module.Chart{
IDSep: true,
ID: "pod_%s.age",
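Both new chart templates are built by an immediately invoked function, so the per-reason dimension list is constructed once at package init. The %s placeholders in the chart ID and dimension IDs are filled in per pod when the chart is added; a hedged sketch of what that instantiation could look like (the module's actual add-chart helper is not shown in this diff and may be structured differently):

    // Hypothetical instantiation for a pod "default_pod01", reusing names
    // from the diff above; illustrative fragment, not the module's code.
    chart := podStatusReasonChartTmpl.Copy()
    chart.ID = fmt.Sprintf(chart.ID, "default_pod01")
    for _, dim := range chart.Dims {
        dim.ID = fmt.Sprintf(dim.ID, "default_pod01")
    }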
62 changes: 48 additions & 14 deletions src/go/plugin/go.d/modules/k8s_state/collect.go
@@ -6,7 +6,6 @@ import (
"errors"
"fmt"
"slices"
"strings"
"time"

"github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module"
@@ -17,28 +16,47 @@ import (
const precision = 1000

var (
+	podStatusReasons = []string{
+		"Evicted",
+		"NodeAffinity",
+		"NodeLost",
+		"Shutdown",
+		"UnexpectedAdmissionError",
+		"Other",
+	}

containerWaitingStateReasons = []string{
"PodInitializing",
"ContainerCreating",
"CrashLoopBackOff",
"CreateContainerConfigError",
"CreateContainerError",
"ErrImagePull",
"ImagePullBackOff",
"CreateContainerError",
"InvalidImageName",
"PodInitializing",
"Other",
}
containerTerminatedStateReasons = []string{
"OOMKilled",
"Completed",
"Error",
"ContainerCannotRun",
"DeadlineExceeded",
"Error",
"Evicted",
"OOMKilled",
"Other",
}
)

+var (
+	nodeConditionStatuses = []string{
+		"Ready",
+		"DiskPressure",
+		"MemoryPressure",
+		"NetworkUnavailable",
+		"PIDPressure",
+	}
+)

func (ks *KubeState) collect() (map[string]int64, error) {
if ks.discoverer == nil {
return nil, errors.New("nil discoverer")
@@ -56,6 +74,7 @@ func (ks *KubeState) collect() (map[string]int64, error) {

ks.kubeClusterID = ks.getKubeClusterID()
ks.kubeClusterName = ks.getKubeClusterName()

if chart := ks.Charts().Get(discoveryStatusChart.ID); chart != nil {
chart.Labels = []module.Label{
{Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
@@ -92,7 +111,7 @@ func (ks *KubeState) collectKubeState(mx map[string]int64) {
func (ks *KubeState) collectPodsState(mx map[string]int64) {
now := time.Now()
for _, ps := range ks.state.pods {
-		// Skip cronjobs (each of them is a unique container because name contains hash)
+		// Skip cronjobs (each of them is a unique container because the name contains hash)
// to avoid overwhelming Netdata with high cardinality metrics.
// Related issue https://github.com/netdata/netdata/issues/16412
if ps.controllerKind == "Job" {
@@ -104,6 +123,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
ks.removePodCharts(ps)
continue
}

if ps.new {
ps.new = false
ks.addPodCharts(ps)
@@ -130,12 +150,14 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
ns.stats.podsPhaseRunning += boolToInt(ps.phase == corev1.PodRunning)
ns.stats.podsPhaseSucceeded += boolToInt(ps.phase == corev1.PodSucceeded)
ns.stats.podsPhaseFailed += boolToInt(ps.phase == corev1.PodFailed)

for _, cs := range ps.initContainers {
ns.stats.initContainers++
ns.stats.initContStateRunning += boolToInt(cs.stateRunning)
ns.stats.initContStateWaiting += boolToInt(cs.stateWaiting)
ns.stats.initContStateTerminated += boolToInt(cs.stateTerminated)
}

for _, cs := range ps.containers {
ns.stats.containers++
ns.stats.contStateRunning += boolToInt(cs.stateRunning)
@@ -155,6 +177,17 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
mx[px+"phase_succeeded"] = boolToInt(ps.phase == corev1.PodSucceeded)
mx[px+"phase_pending"] = boolToInt(ps.phase == corev1.PodPending)
mx[px+"age"] = int64(now.Sub(ps.creationTime).Seconds())

+		for _, v := range podStatusReasons {
+			mx[px+"status_reason_"+v] = 0
+		}
+		if v := ps.statusReason; v != "" {
+			if !slices.Contains(podStatusReasons, v) {
+				v = "Other"
+			}
+			mx[px+"status_reason_"+v] = 1
+		}

mx[px+"cpu_requests_used"] = ps.reqCPU
mx[px+"cpu_limits_used"] = ps.limitCPU
mx[px+"mem_requests_used"] = ps.reqMem
@@ -166,6 +199,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
mx[px+"init_containers_state_running"] = 0
mx[px+"init_containers_state_waiting"] = 0
mx[px+"init_containers_state_terminated"] = 0

for _, cs := range ps.initContainers {
mx[px+"init_containers_state_running"] += boolToInt(cs.stateRunning)
mx[px+"init_containers_state_waiting"] += boolToInt(cs.stateWaiting)
@@ -174,6 +208,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
mx[px+"containers_state_running"] = 0
mx[px+"containers_state_waiting"] = 0
mx[px+"containers_state_terminated"] = 0

for _, cs := range ps.containers {
if cs.new {
cs.new = false
@@ -194,7 +229,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
mx[ppx+"state_waiting_reason_"+v] = 0
}
if v := cs.waitingReason; v != "" {
-			if !slices.Contains(containerWaitingStateReasons, cs.waitingReason) {
+			if !slices.Contains(containerWaitingStateReasons, v) {
v = "Other"
}
mx[ppx+"state_waiting_reason_"+v] = 1
@@ -204,7 +239,7 @@ func (ks *KubeState) collectPodsState(mx map[string]int64) {
mx[ppx+"state_terminated_reason_"+v] = 0
}
if v := cs.terminatedReason; v != "" {
-			if !slices.Contains(containerTerminatedStateReasons, cs.terminatedReason) {
+			if !slices.Contains(containerTerminatedStateReasons, v) {
v = "Other"
}
mx[ppx+"state_terminated_reason_"+v] = 1
@@ -228,12 +263,11 @@ func (ks *KubeState) collectNodesState(mx map[string]int64) {

px := fmt.Sprintf("node_%s_", ns.id())

-	for typ, cond := range ns.conditions {
-		if cond.new {
-			cond.new = false
-			ks.addNodeConditionToCharts(ns, typ)
-		}
-		mx[px+"cond_"+strings.ToLower(typ)] = condStatusToInt(cond.status)
+	for _, v := range nodeConditionStatuses {
+		mx[px+"cond_"+v] = 0
+	}
+	for _, v := range ns.conditions {
+		mx[px+"cond_"+string(v.Type)] = condStatusToInt(v.Status)
}

mx[px+"age"] = int64(now.Sub(ns.creationTime).Seconds())
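The node-condition path now follows the same zero-then-set pattern as pod status reasons, keyed by the condition's typed name (e.g. Ready) rather than a lowercased string — which is why the expected test keys below change case. A small self-contained illustration with hypothetical data; condStatusToInt here mirrors what the module's helper presumably does (True maps to 1, everything else to 0 — an assumption):

    package main

    import (
        "fmt"

        corev1 "k8s.io/api/core/v1"
    )

    var nodeConditionStatuses = []string{"Ready", "DiskPressure", "MemoryPressure", "NetworkUnavailable", "PIDPressure"}

    // condStatusToInt: assumed shape of the module's helper.
    func condStatusToInt(s corev1.ConditionStatus) int64 {
        if s == corev1.ConditionTrue {
            return 1
        }
        return 0
    }

    func main() {
        mx := map[string]int64{}
        px := "node_node01_"
        // Zero the full known set first so every dimension is always reported.
        for _, v := range nodeConditionStatuses {
            mx[px+"cond_"+v] = 0
        }
        // Hypothetical node reporting a single condition.
        conds := []corev1.NodeCondition{{Type: corev1.NodeReady, Status: corev1.ConditionTrue}}
        for _, c := range conds {
            mx[px+"cond_"+string(c.Type)] = condStatusToInt(c.Status)
        }
        fmt.Println(mx[px+"cond_Ready"]) // 1
    }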
66 changes: 46 additions & 20 deletions src/go/plugin/go.d/modules/k8s_state/kube_state_test.go
@@ -213,11 +213,11 @@ func TestKubeState_Collect(t *testing.T) {
"node_node01_alloc_pods_allocated": 0,
"node_node01_alloc_pods_available": 110,
"node_node01_alloc_pods_util": 0,
"node_node01_cond_diskpressure": 0,
"node_node01_cond_memorypressure": 0,
"node_node01_cond_networkunavailable": 0,
"node_node01_cond_pidpressure": 0,
"node_node01_cond_ready": 1,
"node_node01_cond_DiskPressure": 0,
"node_node01_cond_MemoryPressure": 0,
"node_node01_cond_NetworkUnavailable": 0,
"node_node01_cond_PIDPressure": 0,
"node_node01_cond_Ready": 1,
"node_node01_schedulability_schedulable": 1,
"node_node01_schedulability_unschedulable": 0,
"node_node01_containers": 0,
Expand All @@ -240,6 +240,7 @@ func TestKubeState_Collect(t *testing.T) {
"node_node01_pods_readiness_ready": 0,
"node_node01_pods_readiness_unready": 0,
}

copyAge(expected, mx)

assert.Equal(t, expected, mx)
@@ -331,6 +332,12 @@ func TestKubeState_Collect(t *testing.T) {
"pod_default_pod01_phase_pending": 0,
"pod_default_pod01_phase_running": 1,
"pod_default_pod01_phase_succeeded": 0,
"pod_default_pod01_status_reason_Evicted": 0,
"pod_default_pod01_status_reason_NodeAffinity": 0,
"pod_default_pod01_status_reason_NodeLost": 0,
"pod_default_pod01_status_reason_Other": 0,
"pod_default_pod01_status_reason_Shutdown": 0,
"pod_default_pod01_status_reason_UnexpectedAdmissionError": 0,
}

copyAge(expected, mx)
@@ -375,11 +382,11 @@ func TestKubeState_Collect(t *testing.T) {
"node_node01_alloc_pods_allocated": 1,
"node_node01_alloc_pods_available": 109,
"node_node01_alloc_pods_util": 909,
"node_node01_cond_diskpressure": 0,
"node_node01_cond_memorypressure": 0,
"node_node01_cond_networkunavailable": 0,
"node_node01_cond_pidpressure": 0,
"node_node01_cond_ready": 1,
"node_node01_cond_DiskPressure": 0,
"node_node01_cond_MemoryPressure": 0,
"node_node01_cond_NetworkUnavailable": 0,
"node_node01_cond_PIDPressure": 0,
"node_node01_cond_Ready": 1,
"node_node01_containers": 2,
"node_node01_containers_state_running": 2,
"node_node01_containers_state_terminated": 0,
@@ -464,6 +471,12 @@ func TestKubeState_Collect(t *testing.T) {
"pod_default_pod01_phase_pending": 0,
"pod_default_pod01_phase_running": 1,
"pod_default_pod01_phase_succeeded": 0,
"pod_default_pod01_status_reason_Evicted": 0,
"pod_default_pod01_status_reason_NodeAffinity": 0,
"pod_default_pod01_status_reason_NodeLost": 0,
"pod_default_pod01_status_reason_Other": 0,
"pod_default_pod01_status_reason_Shutdown": 0,
"pod_default_pod01_status_reason_UnexpectedAdmissionError": 0,
}

copyAge(expected, mx)
@@ -513,11 +526,11 @@ func TestKubeState_Collect(t *testing.T) {
"node_node01_alloc_pods_allocated": 0,
"node_node01_alloc_pods_available": 110,
"node_node01_alloc_pods_util": 0,
"node_node01_cond_diskpressure": 0,
"node_node01_cond_memorypressure": 0,
"node_node01_cond_networkunavailable": 0,
"node_node01_cond_pidpressure": 0,
"node_node01_cond_ready": 1,
"node_node01_cond_DiskPressure": 0,
"node_node01_cond_MemoryPressure": 0,
"node_node01_cond_NetworkUnavailable": 0,
"node_node01_cond_PIDPressure": 0,
"node_node01_cond_Ready": 1,
"node_node01_schedulability_schedulable": 1,
"node_node01_schedulability_unschedulable": 0,
"node_node01_containers": 0,
@@ -632,11 +645,11 @@ func TestKubeState_Collect(t *testing.T) {
"node_node01_alloc_pods_allocated": 2,
"node_node01_alloc_pods_available": 108,
"node_node01_alloc_pods_util": 1818,
"node_node01_cond_diskpressure": 0,
"node_node01_cond_memorypressure": 0,
"node_node01_cond_networkunavailable": 0,
"node_node01_cond_pidpressure": 0,
"node_node01_cond_ready": 1,
"node_node01_cond_DiskPressure": 0,
"node_node01_cond_MemoryPressure": 0,
"node_node01_cond_NetworkUnavailable": 0,
"node_node01_cond_PIDPressure": 0,
"node_node01_cond_Ready": 1,
"node_node01_containers": 4,
"node_node01_containers_state_running": 4,
"node_node01_containers_state_terminated": 0,
@@ -721,6 +734,12 @@ func TestKubeState_Collect(t *testing.T) {
"pod_default_pod01_phase_pending": 0,
"pod_default_pod01_phase_running": 1,
"pod_default_pod01_phase_succeeded": 0,
"pod_default_pod01_status_reason_Evicted": 0,
"pod_default_pod01_status_reason_NodeAffinity": 0,
"pod_default_pod01_status_reason_NodeLost": 0,
"pod_default_pod01_status_reason_Other": 0,
"pod_default_pod01_status_reason_Shutdown": 0,
"pod_default_pod01_status_reason_UnexpectedAdmissionError": 0,
"pod_default_pod02_age": 4,
"pod_default_pod02_cond_containersready": 1,
"pod_default_pod02_cond_podinitialized": 1,
@@ -784,7 +803,14 @@ func TestKubeState_Collect(t *testing.T) {
"pod_default_pod02_phase_pending": 0,
"pod_default_pod02_phase_running": 1,
"pod_default_pod02_phase_succeeded": 0,
"pod_default_pod02_status_reason_Evicted": 0,
"pod_default_pod02_status_reason_NodeAffinity": 0,
"pod_default_pod02_status_reason_NodeLost": 0,
"pod_default_pod02_status_reason_Other": 0,
"pod_default_pod02_status_reason_Shutdown": 0,
"pod_default_pod02_status_reason_UnexpectedAdmissionError": 0,
}

copyAge(expected, mx)

assert.Equal(t, expected, mx)
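The expected maps in these tests enumerate every reason and condition dimension explicitly (all zeros except the active one), pinning the full metric surface; only the wall-clock-dependent age values are copied over from the actual output before comparison. copyAge itself is not shown in this diff; a plausible shape, stated purely as an assumption (it would need the strings import):

    // Assumed shape of the test helper: copy time-dependent "age" values
    // from the collected metrics into the expected map before assert.Equal.
    func copyAge(expected, actual map[string]int64) {
        for k := range expected {
            if strings.HasSuffix(k, "_age") {
                expected[k] = actual[k]
            }
        }
    }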