diff --git a/charts/platform-reports/dashboards/cluster/credits.json b/charts/platform-reports/files/grafana-dashboards/cluster/credits.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/credits.json rename to charts/platform-reports/files/grafana-dashboards/cluster/credits.json diff --git a/charts/platform-reports/dashboards/cluster/jobs.json b/charts/platform-reports/files/grafana-dashboards/cluster/jobs.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/jobs.json rename to charts/platform-reports/files/grafana-dashboards/cluster/jobs.json diff --git a/charts/platform-reports/dashboards/cluster/nodes.json b/charts/platform-reports/files/grafana-dashboards/cluster/nodes.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/nodes.json rename to charts/platform-reports/files/grafana-dashboards/cluster/nodes.json diff --git a/charts/platform-reports/dashboards/cluster/overview.json b/charts/platform-reports/files/grafana-dashboards/cluster/overview.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/overview.json rename to charts/platform-reports/files/grafana-dashboards/cluster/overview.json diff --git a/charts/platform-reports/dashboards/cluster/prices.json b/charts/platform-reports/files/grafana-dashboards/cluster/prices.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/prices.json rename to charts/platform-reports/files/grafana-dashboards/cluster/prices.json diff --git a/charts/platform-reports/dashboards/cluster/services.json b/charts/platform-reports/files/grafana-dashboards/cluster/services.json similarity index 100% rename from charts/platform-reports/dashboards/cluster/services.json rename to charts/platform-reports/files/grafana-dashboards/cluster/services.json diff --git a/charts/platform-reports/dashboards/home.json b/charts/platform-reports/files/grafana-dashboards/home.json similarity index 97% rename from charts/platform-reports/dashboards/home.json rename to charts/platform-reports/files/grafana-dashboards/home.json index a98ad3a1..b950c3fe 100644 --- a/charts/platform-reports/dashboards/home.json +++ b/charts/platform-reports/files/grafana-dashboards/home.json @@ -27,8 +27,8 @@ "overrides": [] }, "gridPos": { - "h": 18, - "w": 25, + "h": 19, + "w": 24, "x": 0, "y": 0 }, diff --git a/charts/platform-reports/dashboards/org/org_credits.json b/charts/platform-reports/files/grafana-dashboards/org/org_credits.json similarity index 100% rename from charts/platform-reports/dashboards/org/org_credits.json rename to charts/platform-reports/files/grafana-dashboards/org/org_credits.json diff --git a/charts/platform-reports/dashboards/org/org_jobs.json b/charts/platform-reports/files/grafana-dashboards/org/org_jobs.json similarity index 100% rename from charts/platform-reports/dashboards/org/org_jobs.json rename to charts/platform-reports/files/grafana-dashboards/org/org_jobs.json diff --git a/charts/platform-reports/dashboards/project/job.json b/charts/platform-reports/files/grafana-dashboards/project/job.json similarity index 100% rename from charts/platform-reports/dashboards/project/job.json rename to charts/platform-reports/files/grafana-dashboards/project/job.json diff --git a/charts/platform-reports/dashboards/project/project_credits.json b/charts/platform-reports/files/grafana-dashboards/project/project_credits.json similarity index 100% rename from charts/platform-reports/dashboards/project/project_credits.json rename to 
charts/platform-reports/files/grafana-dashboards/project/project_credits.json diff --git a/charts/platform-reports/dashboards/project/project_jobs.json b/charts/platform-reports/files/grafana-dashboards/project/project_jobs.json similarity index 100% rename from charts/platform-reports/dashboards/project/project_jobs.json rename to charts/platform-reports/files/grafana-dashboards/project/project_jobs.json diff --git a/charts/platform-reports/files/prometheus-rules/alertmanager.yaml b/charts/platform-reports/files/prometheus-rules/alertmanager.yaml new file mode 100644 index 00000000..b9537d41 --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/alertmanager.yaml @@ -0,0 +1,125 @@ +groups: +- name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="prometheus-alertmanager"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="prometheus-alertmanager"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="prometheus-alertmanager"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="prometheus-alertmanager"}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-alertmanager"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. 
+ expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="prometheus-alertmanager", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-alertmanager", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="prometheus-alertmanager", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="prometheus-alertmanager", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="prometheus-alertmanager"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="prometheus-alertmanager"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="prometheus-alertmanager"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="prometheus-alertmanager"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="prometheus-alertmanager"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/charts/platform-reports/files/prometheus-rules/kube-control-plane.yaml b/charts/platform-reports/files/prometheus-rules/kube-control-plane.yaml new file mode 100644 index 00000000..92726fd3 --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/kube-control-plane.yaml @@ -0,0 +1,494 @@ +groups: +- name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod, cluster) ( + max by(namespace, pod, cluster) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( + 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDeploymentRolloutStuck + annotations: + description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck + summary: Deployment rollout is not progressing. + expr: | + kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"} + != 0 + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
+ expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: | + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} + and + kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. + expr: | + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning +- name: kubernetes-storage + rules: + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical +- name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ $value }} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) + / + sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning +- name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + for: 5m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + for: 5m + labels: + severity: critical + - alert: KubeAggregatedAPIErrors + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors + summary: Kubernetes aggregated API has reported errors. + expr: | + sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4 + labels: + severity: warning + - alert: KubeAggregatedAPIDown + annotations: + description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown + summary: Kubernetes aggregated API is down. + expr: | + (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning +- name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(cluster, node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(cluster, node) ( + kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + ) > 0.95 + for: 15m + labels: + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. 
+ expr: | + sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. 
+ expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical +- name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/charts/platform-reports/files/prometheus-rules/node-exporter.yaml b/charts/platform-reports/files/prometheus-rules/node-exporter.yaml new file mode 100644 index 00000000..dfb6cf33 --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/node-exporter.yaml @@ -0,0 +1,303 @@ +groups: +- name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. 
+ expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. 
+ expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. 
+ expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical +- name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/charts/platform-reports/files/prometheus-rules/prometheus-operator.yaml b/charts/platform-reports/files/prometheus-rules/prometheus-operator.yaml new file mode 100644 index 00000000..a1ef1a2c --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/prometheus-operator.yaml @@ -0,0 +1,87 @@ +groups: +- name: prometheus-operator + rules: + - alert: PrometheusOperatorListErrors + annotations: + description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors + summary: Errors while performing list operations in controller. 
+ expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors + summary: Errors while performing watch operations in controller. + expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator"}[5m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed + summary: Last controller reconciliation failed + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors + summary: Errors while reconciling controller. + expr: | + (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors + summary: Errors while reconciling Prometheus. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready + summary: Prometheus operator not ready + expr: | + min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator"}[5m]) == 0) + for: 5m + labels: + severity: warning + - alert: PrometheusOperatorRejectedResources + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources + summary: Resources rejected by Prometheus operator + expr: | + min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0 + for: 5m + labels: + severity: warning +- name: config-reloaders + rules: + - alert: ConfigReloaderSidecarErrors + annotations: + description: |- + Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace. + As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors + summary: config-reloader sidecar has not had a successful reload for 10m + expr: | + max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 + for: 10m + labels: + severity: warning diff --git a/charts/platform-reports/files/prometheus-rules/prometheus.yaml b/charts/platform-reports/files/prometheus-rules/prometheus.yaml new file mode 100644 index 00000000..1169f58c --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/prometheus.yaml @@ -0,0 +1,276 @@ +groups: +- name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="prometheus-prometheus"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusSDRefreshFailure + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure + summary: Failed Prometheus SD refresh. + expr: | + increase(prometheus_sd_refresh_failures_total{job="prometheus-prometheus"}[10m]) > 0 + for: 20m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job="prometheus-prometheus"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-prometheus"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus-prometheus"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-prometheus"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-prometheus"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-prometheus"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-prometheus"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: | + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-prometheus",type="float"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-prometheus"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-prometheus"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-prometheus"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-prometheus"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-prometheus"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-prometheus"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-prometheus"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-prometheus"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-prometheus"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-prometheus"}` $labels.instance | query | first | value }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-prometheus"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-prometheus"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. + expr: | + increase(prometheus_rule_evaluation_failures_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: | + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeBodySizeLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit + summary: Prometheus has dropped some targets that exceeded body size limit. + expr: | + increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeSampleLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit + summary: Prometheus has failed scrapes that have exceeded the configured sample limit. + expr: | + increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-prometheus"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. + expr: | + increase(prometheus_target_sync_failed_total{job="prometheus-prometheus"}[30m]) > 0 + for: 5m + labels: + severity: critical + - alert: PrometheusHighQueryLoad + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload + summary: Prometheus is reaching its maximum capacity serving concurrent requests. + expr: | + avg_over_time(prometheus_engine_queries{job="prometheus-prometheus"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus"}[5m]) > 0.8 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-prometheus",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-prometheus",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/charts/platform-reports/files/prometheus-rules/traefik.yaml b/charts/platform-reports/files/prometheus-rules/traefik.yaml new file mode 100644 index 00000000..6cb9e75e --- /dev/null +++ b/charts/platform-reports/files/prometheus-rules/traefik.yaml @@ -0,0 +1,54 @@ +groups: +- name: traefik + rules: + - alert: TraefikDown + annotations: + summary: Traefik Down + description: '{{$labels.namespace}}/{{$labels.pod}} on {{$labels.nodename}} is down' + expr: up{job="traefik"} == 0 + for: 5m + labels: + severity: critical + - alert: TraefikConfigReloadError + annotations: + description: Traefik {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + summary: Failed Traefik configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(traefik_config_last_reload_success{job="traefik"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: TraefikTlsCertExpiration + annotations: + description: TLS certificate will expire within the next 30 days + summary: TLS certificate will expire soon + expr: (traefik_tls_certs_not_after - time()) < 2592000 + for: 1h + labels: + severity: critical + - alert: TraefikTooMuchErrors5XX + annotations: + description: '{{ printf "%.1f" $value }}% of responses from service {{$labels.exported_service}} had a 5XX status code during the last 5 minutes.' 
+ summary: Too many 5XX responses from service {{$labels.exported_service}} + expr: | + 100 * + sum by(exported_service) (rate(traefik_service_requests_total{job="traefik", code=~"5.+"}[5m])) + / sum by(exported_service) (rate(traefik_service_requests_total{job="traefik"}[5m])) + > 5 + for: 5m + labels: + severity: critical + - alert: TraefikTooMuchErrors4XX + annotations: + description: '{{ printf "%.1f" $value }}% of responses from service {{$labels.exported_service}} had a 4XX status code during the last 5 minutes.' + summary: Too many 4XX responses from service {{$labels.exported_service}} + expr: | + 100 * + sum by(exported_service) (rate(traefik_service_requests_total{job="traefik", code=~"4.+"}[5m])) + / sum by(exported_service) (rate(traefik_service_requests_total{job="traefik"}[5m])) + > 5 + for: 5m + labels: + severity: warning diff --git a/charts/platform-reports/templates/grafana-dashboards.yaml b/charts/platform-reports/templates/grafana-dashboards.yaml index 8861fc3a..5a06823c 100644 --- a/charts/platform-reports/templates/grafana-dashboards.yaml +++ b/charts/platform-reports/templates/grafana-dashboards.yaml @@ -6,7 +6,7 @@ metadata: grafana_dashboard: "true" service: grafana data: -{{ (.Files.Glob "dashboards/home.json").AsConfig | indent 2 }} +{{ (.Files.Glob "files/grafana-dashboards/home.json").AsConfig | indent 2 }} --- apiVersion: v1 kind: ConfigMap @@ -18,7 +18,7 @@ metadata: annotations: grafana_folder: {{ .Values.grafana.sidecar.dashboards.folder }}/platform/Cluster data: -{{ (.Files.Glob "dashboards/cluster/*.json").AsConfig | indent 2 }} +{{ (.Files.Glob "files/grafana-dashboards/cluster/*.json").AsConfig | indent 2 }} --- apiVersion: v1 kind: ConfigMap @@ -30,7 +30,7 @@ metadata: annotations: grafana_folder: {{ .Values.grafana.sidecar.dashboards.folder }}/platform/Project data: -{{ (.Files.Glob "dashboards/project/*.json").AsConfig | indent 2 }} +{{ (.Files.Glob "files/grafana-dashboards/project/*.json").AsConfig | indent 2 }} --- apiVersion: v1 kind: ConfigMap @@ -42,4 +42,4 @@ metadata: annotations: grafana_folder: {{ .Values.grafana.sidecar.dashboards.folder }}/platform/Organization data: -{{ (.Files.Glob "dashboards/org/*.json").AsConfig | indent 2 }} +{{ (.Files.Glob "files/grafana-dashboards/org/*.json").AsConfig | indent 2 }} diff --git a/charts/platform-reports/templates/prometheus-rules.yaml b/charts/platform-reports/templates/prometheus-rules.yaml new file mode 100644 index 00000000..274b3dc8 --- /dev/null +++ b/charts/platform-reports/templates/prometheus-rules.yaml @@ -0,0 +1,53 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-alertmanager + labels: + {{- include "platformReports.labels.standard" . | nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/alertmanager.yaml" | indent 2 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-kube-control-plane + labels: + {{- include "platformReports.labels.standard" . | nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/kube-control-plane.yaml" | indent 2 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-node-exporter + labels: + {{- include "platformReports.labels.standard" . 
| nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/node-exporter.yaml" | indent 2 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-prometheus-operator + labels: + {{- include "platformReports.labels.standard" . | nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/prometheus-operator.yaml" | indent 2 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-prometheus + labels: + {{- include "platformReports.labels.standard" . | nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/prometheus.yaml" | indent 2 }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "platformReports.fullname" . }}-traefik + labels: + {{- include "platformReports.labels.standard" . | nindent 4 }} +spec: +{{ .Files.Get "files/prometheus-rules/traefik.yaml" | indent 2 }} diff --git a/charts/platform-reports/values-dev.yaml b/charts/platform-reports/values-dev.yaml index e45213d9..f07f0b6a 100644 --- a/charts/platform-reports/values-dev.yaml +++ b/charts/platform-reports/values-dev.yaml @@ -41,6 +41,26 @@ grafanaProxy: traefik.ingress.kubernetes.io/router.middlewares: platform-platform-ingress-auth@kubernetescrd external-dns.platform.neuromation.io: "true" +secrets: + - name: alertmanager-config + data: + alertmanager.yaml: | + route: + receiver: platform-notifications + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + group_by: [alertname] + + receivers: + - name: platform-notifications + webhook_configs: + - url: http://platform-notifications:8080/api/v1/notifications/alert-manager-notification + http_config: + authorization: + type: Bearer + credentials_file: /etc/alertmanager/secrets/platformservices-secret/compute_token + kube-prometheus-stack: prometheus: prometheusSpec: @@ -56,10 +76,17 @@ kube-prometheus-stack: storageClassName: standard-rwo externalLabels: - cluster: "" + cluster: default priorityClassName: "" + alertmanager: + alertmanagerSpec: + configSecret: alertmanager-config + + secrets: + - platformservices-secret + prometheusOperator: kubeletService: namespace: platform diff --git a/charts/platform-reports/values.yaml b/charts/platform-reports/values.yaml index aa0c167c..8c6208ff 100644 --- a/charts/platform-reports/values.yaml +++ b/charts/platform-reports/values.yaml @@ -63,7 +63,7 @@ kube-prometheus-stack: create: false kubeApiServer: - enabled: false + enabled: true kubelet: enabled: true @@ -119,6 +119,10 @@ kube-prometheus-stack: collectors: - nodes - pods + - deployments + - statefulsets + - daemonsets + - horizontalpodautoscalers metricLabelsAllowlist: - nodes=[platform.neuromation.io/nodepool,platform.neuromation.io/job] @@ -179,15 +183,9 @@ kube-prometheus-stack: tag: v20221220-controller-v1.5.1-58-g787ea74b6 priorityClassName: "" - serviceMonitor: - selfMonitor: false - priorityClassName: "" prometheus: - serviceMonitor: - selfMonitor: false - prometheusSpec: image: registry: quay.io @@ -226,9 +224,6 @@ kube-prometheus-stack: priorityClassName: "" alertmanager: - serviceMonitor: - selfMonitor: false - alertmanagerSpec: image: registry: quay.io diff --git a/tests/conftest.py b/tests/conftest.py index 1baeebb3..f5df7202 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,7 @@ @pytest.fixture(scope="session") def dashboards_expressions() -> dict[str, Sequence[str]]: result: dict[str, Sequence[str]] = {} - dashboards_path = 
Path("charts/platform-reports/dashboards") + dashboards_path = Path("charts/platform-reports/files/grafana-dashboards") expr_regex = re.compile(r'"expr": "((?:[^"\\]|\\.)+)"') var_regex = re.compile(r'"query": "label_values\(((?:[^"\\]|\\.)+),[a-z_]+\)"') for path in dashboards_path.glob("**/*.json"):