diff --git a/otel-integration/CHANGELOG.md b/otel-integration/CHANGELOG.md index af028f2a..68c88287 100644 --- a/otel-integration/CHANGELOG.md +++ b/otel-integration/CHANGELOG.md @@ -2,6 +2,11 @@ ## OpenTelemtry-Integration +### v0.0.72 / 2024-05-16 +- [FEAT] Bump Collector to 0.100.0 +- [FEAT] Add container CPU throttling metrics +- [FEAT] Add k8s_container_status_last_terminated_reason metric to track OOMKilled events. + ### v0.0.71 / 2024-05-06 - [Fix] reduceResourceAttributes preset will now work when metadata preset is manually set in processors. diff --git a/otel-integration/k8s-helm/Chart.yaml b/otel-integration/k8s-helm/Chart.yaml index 70e641c2..2e70cb3c 100644 --- a/otel-integration/k8s-helm/Chart.yaml +++ b/otel-integration/k8s-helm/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: otel-integration description: OpenTelemetry Integration -version: 0.0.71 +version: 0.0.72 keywords: - OpenTelemetry Collector - OpenTelemetry Agent @@ -11,22 +11,22 @@ keywords: dependencies: - name: opentelemetry-collector alias: opentelemetry-agent - version: "0.83.1" + version: "0.84.0" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-agent.enabled - name: opentelemetry-collector alias: opentelemetry-agent-windows - version: "0.83.1" + version: "0.84.0" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-agent-windows.enabled - name: opentelemetry-collector alias: opentelemetry-cluster-collector - version: "0.83.1" + version: "0.84.0" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-cluster-collector.enabled - name: opentelemetry-collector alias: opentelemetry-gateway - version: "0.83.1" + version: "0.84.0" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-gateway.enabled sources: diff --git a/otel-integration/k8s-helm/README.md b/otel-integration/k8s-helm/README.md index 6cca05b2..bb496363 100644 
--- a/otel-integration/k8s-helm/README.md +++ b/otel-integration/k8s-helm/README.md @@ -54,6 +54,77 @@ This chart will also collect, out of the box, all the metrics necessary for [Cor If you do not require to collect these metrics, you can disable them by setting `global.extensions.kubernetesDashboard.enabled` to `false` in the `values.yaml` file. +## Metrics + +OpenTelemetry integration collects metrics from various sources. You can see the list of metrics and their labels in OpenTelemetry Collector contrib receiver documentation: + +- Kubernetes Cluster Receiver - https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/k8sclusterreceiver/documentation.md +- Kubelet Stats Receiver - https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/metadata.yaml +- Host Metrics Receiver - https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver + +Additionally, we use [k8sattributes processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor) and [resource detection processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor) to add more metadata labels. + +For Kubernetes Dashboard we also use [Prometheus receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/prometheusreceiver/README.md) to scrape Kubernetes API Server and [Kubelet cAdvisor](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/) endpoints. 
+ +Note: OpenTelemetry metrics are converted to Prometheus format following the [OpenTelemetry specification](https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#otlp-metric-points-to-prometheus). + +## Custom Metrics + +OpenTelemetry Integration additionally adds these custom metrics: + +### kube_pod_status_qos_class + +Provides information about Pod QOS class. + +| Metric Type | Value | Labels | +|-------------|-------|--------| +| Gauge | 1 | reason | + +### kube_pod_status_reason + +Provides information about Kubernetes Pod Status. + +| Metric Type | Value | Labels | +|-------------|-------|--------| +| Gauge | 1 | reason | + +Example reason label values: Evicted, NodeAffinity, NodeLost, Shutdown, UnexpectedAdmissionError + +### kube_node_info + +Provides information about Kubernetes Node. + +| Metric Type | Value | Labels | +|-------------|-------|---------------------| +| Gauge | 1 | kubelet_version | + +### k8s.container.status.last_terminated_reason + +Provides information about the reason a Pod's container was last terminated. + +| Metric Type | Value | Labels | +|-------------|-------|--------| +| Gauge | 1 | reason | + +Example reason label values: OOMKilled + +### kubernetes_build_info + +Provides information about Kubernetes version. 
+ +### Container Filesystem usage metrics + +- container_fs_writes_total +- container_fs_reads_total +- container_fs_writes_bytes_total +- container_fs_reads_bytes_total +- container_fs_usage_bytes + +### CPU throttling metrics + +- container_cpu_cfs_periods_total +- container_cpu_cfs_throttled_periods_total + # Prerequisites Make sure you have at least these version of the following installed: diff --git a/otel-integration/k8s-helm/values.yaml b/otel-integration/k8s-helm/values.yaml index fecb826d..804cf5ce 100644 --- a/otel-integration/k8s-helm/values.yaml +++ b/otel-integration/k8s-helm/values.yaml @@ -5,7 +5,7 @@ global: defaultSubsystemName: "integration" logLevel: "warn" collectionInterval: "30s" - version: "0.0.71" + version: "0.0.72" extensions: kubernetesDashboard: @@ -401,6 +401,8 @@ opentelemetry-cluster-collector: enabled: true k8s.pod.qos_class: enabled: true + k8s.container.status.last_terminated_reason: + enabled: true metrics: k8s.pod.status_reason: enabled: true @@ -459,6 +461,11 @@ opentelemetry-cluster-collector: match_type: strict action: insert new_name: kube_node_info + - include: k8s.container.ready + match_type: strict + action: insert + new_name: k8s.container.status.last_terminated_reason + transform/k8s-dashboard: error_mode: ignore metric_statements: @@ -468,6 +475,7 @@ opentelemetry-cluster-collector: # K8s Dashboard uses k8s_pod_phase_1 in their queries. - set(unit, "1") where name == "k8s.pod.phase" - set(unit, "") where name == "kube_node_info" + - set(unit, "") where name == "k8s.container.status.last_terminated_reason" - context: datapoint statements: # Transforming k8s.pod.phase to kube_pod_status_qos_class format. 
@@ -485,8 +493,13 @@ opentelemetry-cluster-collector: # Transforming k8s.node.status_reason to kube-state-metrics format - set(value_int, 1) where metric.name == "kube_node_info" - set(attributes["kubelet_version"], resource.attributes["k8s.kubelet.version"]) where metric.name == "kube_node_info" + # Transform k8s.container.status.last_terminated_reason from resource attribute to metric + - set(value_int, 1) where metric.name == "k8s.container.status.last_terminated_reason" + - set(attributes["reason"], "") where metric.name == "k8s.container.status.last_terminated_reason" + - set(attributes["reason"], resource.attributes["k8s.container.status.last_terminated_reason"]) where metric.name == "k8s.container.status.last_terminated_reason" - context: resource statements: + - delete_key(attributes, "k8s.container.status.last_terminated_reason") - delete_key(attributes, "k8s.pod.qos_class") - delete_key(attributes, "k8s.kubelet.version")