From a162b3ff785b369df7c7054e82fc27be282ec80e Mon Sep 17 00:00:00 2001 From: Panagiotis Atmatzidis Date: Thu, 12 Sep 2024 20:31:47 +0300 Subject: [PATCH] feat: add `exit_code` label to `execution_done_count` (#1576) The `exit_code` label adds useful information to the `execution_done_count` metric. The exit codes are usually `0`, `1` or `127`, so this is a low-cardinality label. This label will allow us to use PromQL to perform standard operations like calculating the number of failed vs. total jobs. --- builtin/bins/dkron-executor-shell/prometheus.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtin/bins/dkron-executor-shell/prometheus.go b/builtin/bins/dkron-executor-shell/prometheus.go index b3ebe3db4..14315cf44 100644 --- a/builtin/bins/dkron-executor-shell/prometheus.go +++ b/builtin/bins/dkron-executor-shell/prometheus.go @@ -2,6 +2,7 @@ package main import ( "log" + "strconv" "time" "github.com/prometheus/client_golang/prometheus" @@ -40,7 +41,7 @@ var ( Name: "execution_done_count", Help: "Job Execution Counter", }, - []string{"job_name"}) + []string{"job_name", "exit_code"}) jobExitCode = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, @@ -60,10 +61,11 @@ func CollectProcessMetrics(jobname string, pid int, quit chan int) { // log.Println("Exit code received and quit channel closed.") return } + exitCodeStr := strconv.Itoa(exitCode) cpuUsage.WithLabelValues(jobname).Set(0) memUsage.WithLabelValues(jobname).Set(0) jobExecutionTime.WithLabelValues(jobname).Set(0) - jobDoneCount.WithLabelValues(jobname).Inc() + jobDoneCount.WithLabelValues(jobname, exitCodeStr).Inc() jobExitCode.WithLabelValues(jobname).Set(float64(exitCode)) default: cpu, mem, err := GetTotalCPUMemUsage(pid)