Skip to content

Commit

Permalink
metrics: Add metrics label filter configuration
Browse files Browse the repository at this point in the history
Currently, metrics are all-or-nothing.
Certain labels may cause cardinality issues.

This patch introduces a new configuration option - MetricsLabelFilter.
It is an allow-list for configuring namespace, workload, pod, and binary.
Metrics that support these labels will only include the ones enabled in the filter.

Fixes: #1037

Signed-off-by: Nick Peluso <[email protected]>
  • Loading branch information
nap32 authored and michi-covalent committed Sep 29, 2023
1 parent e718668 commit 913b64a
Show file tree
Hide file tree
Showing 13 changed files with 139 additions and 26 deletions.
17 changes: 10 additions & 7 deletions cmd/tetragon/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package main

import (
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/config"
"github.com/cilium/tetragon/pkg/option"

"github.com/spf13/viper"
Expand All @@ -31,13 +32,14 @@ const (
keyEnableCiliumAPI = "enable-cilium-api"
keyEnableProcessAncestors = "enable-process-ancestors"

keyMetricsServer = "metrics-server"
keyServerAddress = "server-address"
keyGopsAddr = "gops-address"
keyEnableProcessCred = "enable-process-cred"
keyEnableProcessNs = "enable-process-ns"
keyTracingPolicy = "tracing-policy"
keyTracingPolicyDir = "tracing-policy-dir"
keyMetricsServer = "metrics-server"
keyMetricsLabelFilter = "metrics-label-filter"
keyServerAddress = "server-address"
keyGopsAddr = "gops-address"
keyEnableProcessCred = "enable-process-cred"
keyEnableProcessNs = "enable-process-ns"
keyTracingPolicy = "tracing-policy"
keyTracingPolicyDir = "tracing-policy-dir"

keyCpuProfile = "cpuprofile"
keyMemProfile = "memprofile"
Expand Down Expand Up @@ -113,6 +115,7 @@ func readAndSetFlags() {
option.Config.DataCacheSize = viper.GetInt(keyDataCacheSize)

option.Config.MetricsServer = viper.GetString(keyMetricsServer)
option.Config.MetricsLabelFilter = config.ParseMetricsLabelFilter(viper.GetString(keyMetricsLabelFilter))
option.Config.ServerAddress = viper.GetString(keyServerAddress)

option.Config.ExportFilename = viper.GetString(keyExportFilename)
Expand Down
1 change: 1 addition & 0 deletions docs/content/en/docs/reference/helm-chart.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ To use [the values available](#values), with `helm install` or `helm upgrade`, u
| tetragon.processCacheSize | int | `65536` | |
| tetragon.prometheus.address | string | `""` | The address at which to expose metrics. Set it to "" to expose on all available interfaces. |
| tetragon.prometheus.enabled | bool | `true` | Whether to enable exposing Tetragon metrics. |
| tetragon.prometheus.metricsLabelFilter | string | `"namespace,workload,pod,binary"` | Comma-separated list of labels to include on metrics that support them. The possible values are "namespace", "workload", "pod" and "binary". |
| tetragon.prometheus.port | int | `2112` | The port at which to expose metrics. |
| tetragon.prometheus.serviceMonitor.enabled | bool | `false` | Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. |
| tetragon.prometheus.serviceMonitor.labelsOverride | object | `{}` | The set of labels to place on the 'ServiceMonitor' resource. |
Expand Down
1 change: 1 addition & 0 deletions install/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ Helm chart for Tetragon
| tetragon.processCacheSize | int | `65536` | |
| tetragon.prometheus.address | string | `""` | The address at which to expose metrics. Set it to "" to expose on all available interfaces. |
| tetragon.prometheus.enabled | bool | `true` | Whether to enable exposing Tetragon metrics. |
| tetragon.prometheus.metricsLabelFilter | string | `"namespace,workload,pod,binary"` | Comma-separated list of labels to include on metrics that support them. The possible values are "namespace", "workload", "pod" and "binary". |
| tetragon.prometheus.port | int | `2112` | The port at which to expose metrics. |
| tetragon.prometheus.serviceMonitor.enabled | bool | `false` | Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. |
| tetragon.prometheus.serviceMonitor.labelsOverride | object | `{}` | The set of labels to place on the 'ServiceMonitor' resource. |
Expand Down
3 changes: 3 additions & 0 deletions install/kubernetes/templates/tetragon_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ data:
{{- else }}
metrics-server: ""
{{- end }}
{{- if .Values.tetragon.prometheus.enabled }}
metrics-label-filter: {{ .Values.tetragon.prometheus.metricsLabelFilter }}
{{- end }}
{{- if .Values.tetragon.grpc.enabled }}
server-address: {{ .Values.tetragon.grpc.address }}
{{- else }}
Expand Down
3 changes: 3 additions & 0 deletions install/kubernetes/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ tetragon:
address: ""
# -- The port at which to expose metrics.
port: 2112
# -- Comma-separated list of labels to include on metrics that support them.
# The possible values are "namespace", "workload", "pod" and "binary".
metricsLabelFilter: "namespace,workload,pod,binary"
serviceMonitor:
# -- Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods.
enabled: false
Expand Down
10 changes: 10 additions & 0 deletions pkg/metrics/config/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"

"strings"
)

func InitAllMetrics(registry *prometheus.Registry) {
Expand All @@ -47,3 +49,11 @@ func InitAllMetrics(registry *prometheus.Registry) {
registry.MustRegister(grpcmetrics.NewServerMetrics())
version.InitMetrics(registry)
}

// ParseMetricsLabelFilter turns a comma-separated list of label names into a
// set keyed by label name. Only the presence of a key is meaningful; every
// value is nil.
//
// Whitespace around each entry is trimmed and empty entries are skipped, so
// "namespace, pod" enables both labels and an empty string yields an empty
// set. Label names are not validated here; unknown names are stored as given.
func ParseMetricsLabelFilter(labels string) map[string]interface{} {
	result := make(map[string]interface{})
	for _, label := range strings.Split(labels, ",") {
		label = strings.TrimSpace(label)
		if label == "" {
			// Produced by "", a trailing comma, or ",,"; not a real label.
			continue
		}
		result[label] = nil
	}
	return result
}
1 change: 1 addition & 0 deletions pkg/metrics/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
package consts

var (
	// MetricsNamespace is the Prometheus namespace used for Tetragon metrics.
	MetricsNamespace = "tetragon"

	// KnownMetricLabelFilters lists the metric labels that can be enabled or
	// disabled through the metrics label filter. The order is significant:
	// code that filters label values relies on these labels appearing last
	// and in exactly this order.
	KnownMetricLabelFilters = []string{"namespace", "workload", "pod", "binary"}
)
18 changes: 9 additions & 9 deletions pkg/metrics/eventmetrics/eventmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ import (
)

var (
EventsProcessed = metrics.NewCounterVecWithPod(prometheus.CounterOpts{
EventsProcessed = metrics.MustNewGranularCounter(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "events_total",
Help: "The total number of Tetragon events",
ConstLabels: nil,
}, []string{"type", "namespace", "workload", "pod", "binary"})
}, []string{"type"})
FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "flags_total",
Expand All @@ -39,19 +39,19 @@ var (
ConstLabels: nil,
})

policyStats = metrics.NewCounterVecWithPod(prometheus.CounterOpts{
policyStats = metrics.MustNewGranularCounter(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "policy_events_total",
Help: "Policy events calls observed.",
ConstLabels: nil,
}, []string{"policy", "hook", "namespace", "workload", "pod", "binary"})
}, []string{"policy", "hook"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(EventsProcessed)
registry.MustRegister(EventsProcessed.ToProm())
registry.MustRegister(FlagCount)
registry.MustRegister(NotifyOverflowedEvents)
registry.MustRegister(policyStats)
registry.MustRegister(policyStats.ToProm())
}

func GetProcessInfo(process *tetragon.Process) (binary, pod, workload, namespace string) {
Expand Down Expand Up @@ -93,10 +93,10 @@ func handleProcessedEvent(pInfo *tracingpolicy.PolicyInfo, processedEvent interf
default:
eventType = "unknown"
}
EventsProcessed.WithLabelValues(eventType, namespace, workload, pod, binary).Inc()
EventsProcessed.ToProm().WithLabelValues(metrics.FilterMetricLabels(eventType, namespace, workload, pod, binary)...).Inc()
if pInfo != nil && pInfo.Name != "" {
policyStats.
WithLabelValues(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary).
policyStats.ToProm().
WithLabelValues(metrics.FilterMetricLabels(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary)...).
Inc()
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/metrics/eventmetrics/eventmetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
)

func TestHandleProcessedEvent(t *testing.T) {
assert.NoError(t, testutil.CollectAndCompare(EventsProcessed, strings.NewReader("")))
assert.NoError(t, testutil.CollectAndCompare(EventsProcessed.ToProm(), strings.NewReader("")))
handleProcessedEvent(nil, nil)
// empty process
handleProcessedEvent(nil, &tetragon.GetEventsResponse{Event: &tetragon.GetEventsResponse_ProcessKprobe{ProcessKprobe: &tetragon.ProcessKprobe{}}})
Expand Down Expand Up @@ -79,7 +79,7 @@ tetragon_events_total{binary="binary_c",namespace="namespace_c",pod="pod_c",type
tetragon_events_total{binary="binary_e",namespace="",pod="",type="PROCESS_EXIT",workload=""} 1
tetragon_events_total{binary="binary_e",namespace="namespace_e",pod="pod_e",type="PROCESS_EXIT",workload="workload_e"} 1
`)
assert.NoError(t, testutil.CollectAndCompare(EventsProcessed, expected))
assert.NoError(t, testutil.CollectAndCompare(EventsProcessed.ToProm(), expected))
}

func TestHandleOriginalEvent(t *testing.T) {
Expand Down
49 changes: 49 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
package metrics

import (
"fmt"
"net/http"
"sync"
"time"

"golang.org/x/exp/slices"

"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/cilium/tetragon/pkg/option"
"github.com/cilium/tetragon/pkg/podhooks"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
Expand All @@ -27,6 +32,33 @@ var (
deleteDelay = 1 * time.Minute
)

// GranularCounter is a counter whose underlying prometheus.CounterVec is
// created lazily, on the first call to ToProm, so that its label set can be
// narrowed by the configured metrics label filter.
type GranularCounter struct {
	// CounterOpts are the options handed to the CounterVec when it is built.
	CounterOpts prometheus.CounterOpts

	// labels holds the caller's labels followed by the known filterable
	// labels; ToProm replaces it with the filtered set.
	labels []string
	// register guards the one-time construction of counter.
	register sync.Once
	// counter is nil until ToProm has run once.
	counter *prometheus.CounterVec
}

// MustNewGranularCounter returns a GranularCounter described by opts whose
// label set is labels followed by consts.KnownMetricLabelFilters.
//
// labels must not contain any of consts.KnownMetricLabelFilters, since those
// are added here; the function panics if it does. The underlying CounterVec
// is not created until ToProm is called.
func MustNewGranularCounter(opts prometheus.CounterOpts, labels []string) *GranularCounter {
	for _, label := range labels {
		if slices.Contains(consts.KnownMetricLabelFilters, label) {
			panic(fmt.Sprintf("labels passed to GranularCounter can't contain any of the following: %v. These labels are added by Tetragon.", consts.KnownMetricLabelFilters))
		}
	}
	// Build the combined label list in a fresh slice. Appending directly to
	// labels could write into the caller's backing array when it has spare
	// capacity.
	all := make([]string, 0, len(labels)+len(consts.KnownMetricLabelFilters))
	all = append(all, labels...)
	all = append(all, consts.KnownMetricLabelFilters...)
	return &GranularCounter{
		CounterOpts: opts,
		labels:      all,
	}
}

// ToProm returns the prometheus.CounterVec backing this counter. The vector is
// built on the first call only: the label list is passed through
// FilterMetricLabels and the result is used to create the CounterVec via
// NewCounterVecWithPod. Later calls return the same vector.
func (m *GranularCounter) ToProm() *prometheus.CounterVec {
	build := func() {
		filtered := FilterMetricLabels(m.labels...)
		m.labels = filtered
		m.counter = NewCounterVecWithPod(m.CounterOpts, filtered)
	}
	m.register.Do(build)
	return m.counter
}

// NewCounterVecWithPod is a wrapper around prometheus.NewCounterVec that also registers the metric
// to be cleaned up when a pod is deleted. It should be used only to register metrics that have
// "pod" and "namespace" labels.
Expand Down Expand Up @@ -142,3 +174,20 @@ func EnableMetrics(address string) {
http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
http.ListenAndServe(address, nil)
}

// FilterMetricLabels returns labels with the trailing filterable entries
// dropped unless their label is present in option.Config.MetricsLabelFilter.
// Entries before the filterable ones are always kept.
//
// IMPORTANT! The filterable entries must be passed last and in the exact order
// of consts.KnownMetricLabelFilters.
//
// If fewer than len(consts.KnownMetricLabelFilters) values are passed, a debug
// message is logged and the input is returned unchanged. Otherwise the result
// is built in a newly allocated slice, so a slice passed with `labels...` is
// never modified.
func FilterMetricLabels(labels ...string) []string {
	known := consts.KnownMetricLabelFilters
	offset := len(labels) - len(known)
	if offset < 0 {
		logger.GetLogger().WithField("labels", labels).Debug("Not enough labels provided to metrics.FilterMetricLabels.")
		return labels
	}
	// Do not append to labels[:offset]: it shares its backing array with
	// labels, so appending would overwrite the caller's trailing entries.
	result := make([]string, 0, len(labels))
	result = append(result, labels[:offset]...)
	for i, label := range known {
		if _, ok := option.Config.MetricsLabelFilter[label]; ok {
			result = append(result, labels[offset+i])
		}
	}
	return result
}
29 changes: 29 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,41 @@ import (
"github.com/cilium/tetragon/pkg/metrics"
"github.com/cilium/tetragon/pkg/metrics/config"
"github.com/cilium/tetragon/pkg/metrics/eventmetrics"
"github.com/cilium/tetragon/pkg/option"
)

var sampleMsgGenericTracepointUnix = tracing.MsgGenericTracepointUnix{
PolicyName: "fake-policy",
}

// TestFilterMetricLabels checks that FilterMetricLabels keeps the leading
// values untouched and keeps only those trailing values whose label is enabled
// in option.Config.MetricsLabelFilter. It also checks that inputs too short to
// contain the filterable values are returned unchanged.
func TestFilterMetricLabels(t *testing.T) {
	everything := map[string]interface{}{
		"namespace": nil,
		"workload":  nil,
		"pod":       nil,
		"binary":    nil,
	}
	namespaceAndWorkload := map[string]interface{}{
		"namespace": nil,
		"workload":  nil,
	}

	cases := []struct {
		filter map[string]interface{}
		input  []string
		want   []string
	}{
		// All labels enabled: nothing is dropped.
		{everything, []string{"type", "namespace", "workspace", "pod", "binary"}, []string{"type", "namespace", "workspace", "pod", "binary"}},
		{everything, []string{"syscall", "namespace", "workspace", "pod", "binary"}, []string{"syscall", "namespace", "workspace", "pod", "binary"}},
		{everything, []string{"namespace", "workspace", "pod", "binary"}, []string{"namespace", "workspace", "pod", "binary"}},
		// Only namespace and workload enabled: pod and binary are dropped.
		{namespaceAndWorkload, []string{"type", "namespace", "workspace", "pod", "binary"}, []string{"type", "namespace", "workspace"}},
		{namespaceAndWorkload, []string{"syscall", "namespace", "workspace", "pod", "binary"}, []string{"syscall", "namespace", "workspace"}},
		{namespaceAndWorkload, []string{"namespace", "workspace", "pod", "binary"}, []string{"namespace", "workspace"}},
		// Too few values to contain the filterable ones: returned as is.
		{everything, []string{"type", "syscall"}, []string{"type", "syscall"}},
	}

	for _, tc := range cases {
		option.Config.MetricsLabelFilter = tc.filter
		assert.Equal(t, tc.want, metrics.FilterMetricLabels(tc.input...))
	}
}

func TestPodDelete(t *testing.T) {
reg := metrics.GetRegistry()
config.InitAllMetrics(reg)
Expand Down
10 changes: 6 additions & 4 deletions pkg/metrics/syscallmetrics/syscallmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,16 @@ import (
)

var (
syscallStats = metrics.NewCounterVecWithPod(prometheus.CounterOpts{
syscallStats = metrics.MustNewGranularCounter(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "syscalls_total",
Help: "System calls observed.",
ConstLabels: nil,
}, []string{"syscall", "namespace", "workload", "pod", "binary"})
}, []string{"syscall"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(syscallStats)
registry.MustRegister(syscallStats.ToProm())
}

func Handle(event interface{}) {
Expand All @@ -46,7 +46,9 @@ func Handle(event interface{}) {
}

if syscall != "" {
syscallStats.WithLabelValues(syscall, namespace, workload, pod, binary).Inc()
syscallStats.ToProm().
WithLabelValues(metrics.FilterMetricLabels(syscall, namespace, workload, pod, binary)...).
Inc()
}
}

Expand Down
19 changes: 15 additions & 4 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"time"

"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/spf13/viper"
)

Expand Down Expand Up @@ -46,10 +47,11 @@ type config struct {
ProcessCacheSize int
DataCacheSize int

MetricsServer string
ServerAddress string
TracingPolicy string
TracingPolicyDir string
MetricsServer string
MetricsLabelFilter map[string]interface{}
ServerAddress string
TracingPolicy string
TracingPolicyDir string

ExportFilename string
ExportFileMaxSizeMB int
Expand Down Expand Up @@ -93,6 +95,15 @@ var (

// LogOpts contains logger parameters
LogOpts: make(map[string]string),

// Default to enabling every known metric label (the greatest granularity).
MetricsLabelFilter: func() map[string]interface{} {
result := make(map[string]interface{})
for _, label := range consts.KnownMetricLabelFilters {
result[label] = nil
}
return result
}(),
}
)

Expand Down

0 comments on commit 913b64a

Please sign in to comment.