diff --git a/k8s/amour/kube_state_metrics/BUILD.bazel b/k8s/amour/kube_state_metrics/BUILD.bazel index 85565e912..76ac40600 100644 --- a/k8s/amour/kube_state_metrics/BUILD.bazel +++ b/k8s/amour/kube_state_metrics/BUILD.bazel @@ -12,11 +12,13 @@ cue_library( "service_account_list.cue", "service_list.cue", "stateful_set_list.cue", + "vm_rule_list.cue", "vm_service_scrape_list.cue", ], importpath = "github.com/uhthomas/automata/k8s/amour/kube_state_metrics", visibility = ["//visibility:public"], deps = [ + "//cue.mod/gen/github.com/VictoriaMetrics/operator/api/victoriametrics/v1beta1:cue_v1beta1_library", "//cue.mod/gen/k8s.io/api/admissionregistration/v1:cue_v1_library", "//cue.mod/gen/k8s.io/api/apps/v1:cue_v1_library", "//cue.mod/gen/k8s.io/api/authentication/v1:cue_v1_library", diff --git a/k8s/amour/kube_state_metrics/list.cue b/k8s/amour/kube_state_metrics/list.cue index afec68a99..e15531c47 100644 --- a/k8s/amour/kube_state_metrics/list.cue +++ b/k8s/amour/kube_state_metrics/list.cue @@ -39,5 +39,6 @@ _items: [ #ServiceAccountList.items, #ServiceList.items, #StatefulSetList.items, + #VMRuleList.items, #VMServiceScrapeList.items, ] diff --git a/k8s/amour/kube_state_metrics/vm_rule_list.cue b/k8s/amour/kube_state_metrics/vm_rule_list.cue new file mode 100644 index 000000000..3e75e7038 --- /dev/null +++ b/k8s/amour/kube_state_metrics/vm_rule_list.cue @@ -0,0 +1,78 @@ +package kube_state_metrics + +import victoriametricsv1beta1 "github.com/VictoriaMetrics/operator/api/victoriametrics/v1beta1" + +#VMRuleList: victoriametricsv1beta1.#VMRuleList & { + apiVersion: "operator.victoriametrics.com/v1beta1" + kind: "VMRuleList" + items: [...{ + apiVersion: "operator.victoriametrics.com/v1beta1" + kind: "VMRule" + }] +} + +#VMRuleList: items: [{ + metadata: name: "kube-state-metrics" + spec: groups: [{ + name: "kube-state-metrics" + rules: [{ + alert: "KubeStateMetricsListErrors" + annotations: { + description: "kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors" + summary: "kube-state-metrics is experiencing errors in list operations." + } + expr: """ + (sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) + > 0.01 + """ + + for: "15m" + labels: severity: "critical" + }, { + alert: "KubeStateMetricsWatchErrors" + annotations: { + description: "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors" + summary: "kube-state-metrics is experiencing errors in watch operations." + } + expr: """ + (sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) + > 0.01 + """ + + for: "15m" + labels: severity: "critical" + }, { + alert: "KubeStateMetricsShardingMismatch" + annotations: { + description: "kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all." 
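+				// The expr below relies on stdvar() being zero only when every
+				// pod reports the same --total-shards value, so any non-zero
+				// variance indicates mismatched sharding configuration.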
+ runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch" + summary: "kube-state-metrics sharding is misconfigured." + } + expr: "stdvar (kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0" + for: "15m" + labels: severity: "critical" + }, { + alert: "KubeStateMetricsShardsMissing" + annotations: { + description: "kube-state-metrics shards are missing, some Kubernetes objects are not being exposed." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing" + summary: "kube-state-metrics shards are missing." + } + expr: """ + 2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) ) + != 0 + """ + + for: "15m" + labels: severity: "critical" + }] + }] +}] diff --git a/k8s/amour/node_exporter/BUILD.bazel b/k8s/amour/node_exporter/BUILD.bazel index ef9c6303e..a9acce7e3 100644 --- a/k8s/amour/node_exporter/BUILD.bazel +++ b/k8s/amour/node_exporter/BUILD.bazel @@ -8,11 +8,13 @@ cue_library( "namespace_list.cue", "service_account_list.cue", "service_list.cue", + "vm_rule_list.cue", "vm_service_scrape_list.cue", ], importpath = "github.com/uhthomas/automata/k8s/amour/node_exporter", visibility = ["//visibility:public"], deps = [ + "//cue.mod/gen/github.com/VictoriaMetrics/operator/api/victoriametrics/v1beta1:cue_v1beta1_library", "//cue.mod/gen/k8s.io/api/apps/v1:cue_v1_library", "//cue.mod/gen/k8s.io/api/core/v1:cue_v1_library", ], diff --git a/k8s/amour/node_exporter/list.cue b/k8s/amour/node_exporter/list.cue index f74a4628c..22c9558f9 100644 --- a/k8s/amour/node_exporter/list.cue +++ b/k8s/amour/node_exporter/list.cue @@ -33,5 +33,6 @@ _items: [ #NamespaceList.items, #ServiceAccountList.items, #ServiceList.items, + #VMRuleList.items, #VMServiceScrapeList.items, ] diff --git a/k8s/amour/node_exporter/vm_rule_list.cue b/k8s/amour/node_exporter/vm_rule_list.cue new file mode 100644 index 000000000..c9a77b494 --- /dev/null +++ b/k8s/amour/node_exporter/vm_rule_list.cue @@ -0,0 +1,384 @@ +package node_exporter + +import victoriametricsv1beta1 "github.com/VictoriaMetrics/operator/api/victoriametrics/v1beta1" + +#VMRuleList: victoriametricsv1beta1.#VMRuleList & { + apiVersion: "operator.victoriametrics.com/v1beta1" + kind: "VMRuleList" + items: [...{ + apiVersion: "operator.victoriametrics.com/v1beta1" + kind: "VMRule" + }] +} + +#VMRuleList: items: [{ + metadata: name: "node-exporter.rules" + spec: groups: [{ + name: "node-exporter.rules" + rules: [{ + expr: """ + count without (cpu, mode) ( + node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"} + ) + """ + + record: "instance:node_num_cpu:sum" + }, { + expr: """ + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\"}[5m])) + ) + """ + + record: "instance:node_cpu_utilisation:rate5m" + }, { + expr: """ + ( + node_load1{job=\"node-exporter\"} + / + instance:node_num_cpu:sum{job=\"node-exporter\"} + ) + """ + + record: "instance:node_load1_per_cpu:ratio" + }, { + expr: """ + 1 - ( + ( + node_memory_MemAvailable_bytes{job=\"node-exporter\"} + or + ( + node_memory_Buffers_bytes{job=\"node-exporter\"} + + + node_memory_Cached_bytes{job=\"node-exporter\"} + + + node_memory_MemFree_bytes{job=\"node-exporter\"} + + + node_memory_Slab_bytes{job=\"node-exporter\"} + ) + ) + / + node_memory_MemTotal_bytes{job=\"node-exporter\"} + ) + """ 
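+			// The expr above prefers MemAvailable where the kernel exposes it;
+			// the `or` arm falls back to Buffers + Cached + MemFree + Slab.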
+ + record: "instance:node_memory_utilisation:ratio" + }, { + expr: "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])" + record: "instance:node_vmstat_pgmajfault:rate5m" + }, { + expr: "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])" + record: "instance_device:node_disk_io_time_seconds:rate5m" + }, { + expr: "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])" + record: "instance_device:node_disk_io_time_weighted_seconds:rate5m" + }, { + expr: """ + sum without (device) ( + rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m]) + ) + """ + + record: "instance:node_network_receive_bytes_excluding_lo:rate5m" + }, { + expr: """ + sum without (device) ( + rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m]) + ) + """ + + record: "instance:node_network_transmit_bytes_excluding_lo:rate5m" + }, { + expr: """ + sum without (device) ( + rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m]) + ) + """ + + record: "instance:node_network_receive_drop_excluding_lo:rate5m" + }, { + expr: """ + sum without (device) ( + rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m]) + ) + """ + + record: "instance:node_network_transmit_drop_excluding_lo:rate5m" + }] + }] +}, { + metadata: name: "node-exporter" + spec: groups: [{ + name: "node-exporter" + rules: [{ + alert: "NodeFilesystemSpaceFillingUp" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup" + summary: "Filesystem is predicted to run out of space within the next 24 hours." + } + expr: """ + ( + node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "warning" + }, { + alert: "NodeFilesystemSpaceFillingUp" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup" + summary: "Filesystem is predicted to run out of space within the next 4 hours." + } + expr: """ + ( + node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "critical" + }, { + alert: "NodeFilesystemAlmostOutOfSpace" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left." 
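+				// The expr below also filters on node_filesystem_readonly == 0:
+				// read-only filesystems are skipped, since their free space
+				// cannot shrink.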
+ runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace" + summary: "Filesystem has less than 5% space left." + } + expr: """ + ( + node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 5 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "30m" + labels: severity: "warning" + }, { + alert: "NodeFilesystemAlmostOutOfSpace" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace" + summary: "Filesystem has less than 3% space left." + } + expr: """ + ( + node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 3 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "30m" + labels: severity: "critical" + }, { + alert: "NodeFilesystemFilesFillingUp" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup" + summary: "Filesystem is predicted to run out of inodes within the next 24 hours." + } + expr: """ + ( + node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "warning" + }, { + alert: "NodeFilesystemFilesFillingUp" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup" + summary: "Filesystem is predicted to run out of inodes within the next 4 hours." + } + expr: """ + ( + node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "critical" + }, { + alert: "NodeFilesystemAlmostOutOfFiles" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles" + summary: "Filesystem has less than 5% inodes left." 
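+				// Inode checks mirror the space checks above, using the
+				// node_filesystem_files_free / node_filesystem_files ratio,
+				// with this warning rule at 5% and the critical rule below at 3%.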
+ } + expr: """ + ( + node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 5 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "warning" + }, { + alert: "NodeFilesystemAlmostOutOfFiles" + annotations: { + description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles" + summary: "Filesystem has less than 3% inodes left." + } + expr: """ + ( + node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 3 + and + node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 + ) + """ + + for: "1h" + labels: severity: "critical" + }, { + alert: "NodeNetworkReceiveErrs" + annotations: { + description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs" + summary: "Network interface is reporting many receive errors." + } + expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01" + for: "1h" + labels: severity: "warning" + }, { + alert: "NodeNetworkTransmitErrs" + annotations: { + description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs" + summary: "Network interface is reporting many transmit errors." + } + expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01" + for: "1h" + labels: severity: "warning" + }, { + alert: "NodeHighNumberConntrackEntriesUsed" + annotations: { + description: "{{ $value | humanizePercentage }} of conntrack entries are used." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused" + summary: "Number of conntrack are getting close to the limit." + } + expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75" + labels: severity: "warning" + }, { + alert: "NodeTextFileCollectorScrapeError" + annotations: { + description: "Node Exporter text file collector failed to scrape." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror" + summary: "Node Exporter text file collector failed to scrape." + } + expr: "node_textfile_scrape_error{job=\"node-exporter\"} == 1" + labels: severity: "warning" + }, { + alert: "NodeClockSkewDetected" + annotations: { + description: "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected" + summary: "Clock skew detected." 
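+				// The deriv() guards in the expr below keep the alert from
+				// firing while an offset is already converging back towards zero.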
+ } + expr: """ + ( + node_timex_offset_seconds{job=\"node-exporter\"} > 0.05 + and + deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job=\"node-exporter\"} < -0.05 + and + deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) <= 0 + ) + """ + + for: "10m" + labels: severity: "warning" + }, { + alert: "NodeClockNotSynchronising" + annotations: { + description: "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising" + summary: "Clock not synchronising." + } + expr: """ + min_over_time(node_timex_sync_status{job=\"node-exporter\"}[5m]) == 0 + and + node_timex_maxerror_seconds{job=\"node-exporter\"} >= 16 + """ + + for: "10m" + labels: severity: "warning" + }, { + alert: "NodeRAIDDegraded" + annotations: { + description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded" + summary: "RAID Array is degraded" + } + expr: "node_md_disks_required{job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"} - ignoring (state) (node_md_disks{state=\"active\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}) > 0" + for: "15m" + labels: severity: "critical" + }, { + alert: "NodeRAIDDiskFailure" + annotations: { + description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure" + summary: "Failed device in RAID array" + } + expr: "node_md_disks{state=\"failed\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"} > 0" + labels: severity: "warning" + }, { + alert: "NodeFileDescriptorLimit" + annotations: { + description: "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit" + summary: "Kernel is predicted to exhaust file descriptors limit soon." + } + expr: """ + ( + node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70 + ) + """ + + for: "15m" + labels: severity: "warning" + }, { + alert: "NodeFileDescriptorLimit" + annotations: { + description: "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%." + runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit" + summary: "Kernel is predicted to exhaust file descriptors limit soon." 
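+				// Same allocated/maximum ratio as the warning rule above,
+				// escalated to critical at the 90% threshold.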
+ } + expr: """ + ( + node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90 + ) + """ + + for: "15m" + labels: severity: "critical" + }] + }] +}] diff --git a/k8s/amour/vm/vm_rule_list.cue b/k8s/amour/vm/vm_rule_list.cue index 3475855a2..b255b937d 100644 --- a/k8s/amour/vm/vm_rule_list.cue +++ b/k8s/amour/vm/vm_rule_list.cue @@ -1182,70 +1182,6 @@ import victoriametricsv1beta1 "github.com/VictoriaMetrics/operator/api/victoriam record: "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" }] }] -}, { - metadata: name: "kube-state-metrics" - spec: groups: [{ - name: "kube-state-metrics" - rules: [{ - alert: "KubeStateMetricsListErrors" - annotations: { - description: "kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors" - summary: "kube-state-metrics is experiencing errors in list operations." - } - expr: """ - (sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job=\"kube-state-metrics\"}[5m]))) - > 0.01 - """ - - for: "15m" - labels: severity: "critical" - }, { - alert: "KubeStateMetricsWatchErrors" - annotations: { - description: "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors" - summary: "kube-state-metrics is experiencing errors in watch operations." - } - expr: """ - (sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\",result=\"error\"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job=\"kube-state-metrics\"}[5m]))) - > 0.01 - """ - - for: "15m" - labels: severity: "critical" - }, { - alert: "KubeStateMetricsShardingMismatch" - annotations: { - description: "kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch" - summary: "kube-state-metrics sharding is misconfigured." - } - expr: "stdvar (kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) != 0" - for: "15m" - labels: severity: "critical" - }, { - alert: "KubeStateMetricsShardsMissing" - annotations: { - description: "kube-state-metrics shards are missing, some Kubernetes objects are not being exposed." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing" - summary: "kube-state-metrics shards are missing." 
- } - expr: """ - 2^max(kube_state_metrics_total_shards{job=\"kube-state-metrics\"}) - 1 - - - sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job=\"kube-state-metrics\"}) ) - != 0 - """ - - for: "15m" - labels: severity: "critical" - }] - }] }, { metadata: name: "kubelet.rules" spec: groups: [{ @@ -2048,376 +1984,6 @@ import victoriametricsv1beta1 "github.com/VictoriaMetrics/operator/api/victoriam labels: severity: "warning" }] }] -}, { - metadata: name: "node-exporter.rules" - spec: groups: [{ - name: "node-exporter.rules" - rules: [{ - expr: """ - count without (cpu, mode) ( - node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"} - ) - """ - - record: "instance:node_num_cpu:sum" - }, { - expr: """ - 1 - avg without (cpu) ( - sum without (mode) (rate(node_cpu_seconds_total{job=\"node-exporter\", mode=~\"idle|iowait|steal\"}[5m])) - ) - """ - - record: "instance:node_cpu_utilisation:rate5m" - }, { - expr: """ - ( - node_load1{job=\"node-exporter\"} - / - instance:node_num_cpu:sum{job=\"node-exporter\"} - ) - """ - - record: "instance:node_load1_per_cpu:ratio" - }, { - expr: """ - 1 - ( - ( - node_memory_MemAvailable_bytes{job=\"node-exporter\"} - or - ( - node_memory_Buffers_bytes{job=\"node-exporter\"} - + - node_memory_Cached_bytes{job=\"node-exporter\"} - + - node_memory_MemFree_bytes{job=\"node-exporter\"} - + - node_memory_Slab_bytes{job=\"node-exporter\"} - ) - ) - / - node_memory_MemTotal_bytes{job=\"node-exporter\"} - ) - """ - - record: "instance:node_memory_utilisation:ratio" - }, { - expr: "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[5m])" - record: "instance:node_vmstat_pgmajfault:rate5m" - }, { - expr: "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])" - record: "instance_device:node_disk_io_time_seconds:rate5m" - }, { - expr: "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}[5m])" - record: "instance_device:node_disk_io_time_weighted_seconds:rate5m" - }, { - expr: """ - sum without (device) ( - rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m]) - ) - """ - - record: "instance:node_network_receive_bytes_excluding_lo:rate5m" - }, { - expr: """ - sum without (device) ( - rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[5m]) - ) - """ - - record: "instance:node_network_transmit_bytes_excluding_lo:rate5m" - }, { - expr: """ - sum without (device) ( - rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m]) - ) - """ - - record: "instance:node_network_receive_drop_excluding_lo:rate5m" - }, { - expr: """ - sum without (device) ( - rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[5m]) - ) - """ - - record: "instance:node_network_transmit_drop_excluding_lo:rate5m" - }] - }] -}, { - metadata: name: "node-exporter" - spec: groups: [{ - name: "node-exporter" - rules: [{ - alert: "NodeFilesystemSpaceFillingUp" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup" - summary: "Filesystem is predicted to run out of space within the next 24 hours." 
- } - expr: """ - ( - node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "warning" - }, { - alert: "NodeFilesystemSpaceFillingUp" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup" - summary: "Filesystem is predicted to run out of space within the next 4 hours." - } - expr: """ - ( - node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 10 - and - predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "critical" - }, { - alert: "NodeFilesystemAlmostOutOfSpace" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace" - summary: "Filesystem has less than 5% space left." - } - expr: """ - ( - node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 5 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "30m" - labels: severity: "warning" - }, { - alert: "NodeFilesystemAlmostOutOfSpace" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace" - summary: "Filesystem has less than 3% space left." - } - expr: """ - ( - node_filesystem_avail_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 3 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "30m" - labels: severity: "critical" - }, { - alert: "NodeFilesystemFilesFillingUp" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup" - summary: "Filesystem is predicted to run out of inodes within the next 24 hours." 
- } - expr: """ - ( - node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "warning" - }, { - alert: "NodeFilesystemFilesFillingUp" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup" - summary: "Filesystem is predicted to run out of inodes within the next 4 hours." - } - expr: """ - ( - node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "critical" - }, { - alert: "NodeFilesystemAlmostOutOfFiles" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles" - summary: "Filesystem has less than 5% inodes left." - } - expr: """ - ( - node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 5 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "warning" - }, { - alert: "NodeFilesystemAlmostOutOfFiles" - annotations: { - description: "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles" - summary: "Filesystem has less than 3% inodes left." - } - expr: """ - ( - node_filesystem_files_free{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} / node_filesystem_files{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} * 100 < 3 - and - node_filesystem_readonly{job=\"node-exporter\",fstype!=\"\",mountpoint!=\"\"} == 0 - ) - """ - - for: "1h" - labels: severity: "critical" - }, { - alert: "NodeNetworkReceiveErrs" - annotations: { - description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs" - summary: "Network interface is reporting many receive errors." - } - expr: "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01" - for: "1h" - labels: severity: "warning" - }, { - alert: "NodeNetworkTransmitErrs" - annotations: { - description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes." 
- runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs" - summary: "Network interface is reporting many transmit errors." - } - expr: "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01" - for: "1h" - labels: severity: "warning" - }, { - alert: "NodeHighNumberConntrackEntriesUsed" - annotations: { - description: "{{ $value | humanizePercentage }} of conntrack entries are used." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused" - summary: "Number of conntrack are getting close to the limit." - } - expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75" - labels: severity: "warning" - }, { - alert: "NodeTextFileCollectorScrapeError" - annotations: { - description: "Node Exporter text file collector failed to scrape." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror" - summary: "Node Exporter text file collector failed to scrape." - } - expr: "node_textfile_scrape_error{job=\"node-exporter\"} == 1" - labels: severity: "warning" - }, { - alert: "NodeClockSkewDetected" - annotations: { - description: "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected" - summary: "Clock skew detected." - } - expr: """ - ( - node_timex_offset_seconds{job=\"node-exporter\"} > 0.05 - and - deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{job=\"node-exporter\"} < -0.05 - and - deriv(node_timex_offset_seconds{job=\"node-exporter\"}[5m]) <= 0 - ) - """ - - for: "10m" - labels: severity: "warning" - }, { - alert: "NodeClockNotSynchronising" - annotations: { - description: "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising" - summary: "Clock not synchronising." - } - expr: """ - min_over_time(node_timex_sync_status{job=\"node-exporter\"}[5m]) == 0 - and - node_timex_maxerror_seconds{job=\"node-exporter\"} >= 16 - """ - - for: "10m" - labels: severity: "warning" - }, { - alert: "NodeRAIDDegraded" - annotations: { - description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded" - summary: "RAID Array is degraded" - } - expr: "node_md_disks_required{job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"} - ignoring (state) (node_md_disks{state=\"active\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"}) > 0" - for: "15m" - labels: severity: "critical" - }, { - alert: "NodeRAIDDiskFailure" - annotations: { - description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap." 
- runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure" - summary: "Failed device in RAID array" - } - expr: "node_md_disks{state=\"failed\",job=\"node-exporter\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)\"} > 0" - labels: severity: "warning" - }, { - alert: "NodeFileDescriptorLimit" - annotations: { - description: "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit" - summary: "Kernel is predicted to exhaust file descriptors limit soon." - } - expr: """ - ( - node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70 - ) - """ - - for: "15m" - labels: severity: "warning" - }, { - alert: "NodeFileDescriptorLimit" - annotations: { - description: "File descriptors limit at {{ $labels.instance }} is currently at {{ printf \"%.2f\" $value }}%." - runbook_url: "https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit" - summary: "Kernel is predicted to exhaust file descriptors limit soon." - } - expr: """ - ( - node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90 - ) - """ - - for: "15m" - labels: severity: "critical" - }] - }] }, { metadata: name: "node-network" spec: groups: [{