From 221aa51e0aa01deb1307fc92768012bfc6278663 Mon Sep 17 00:00:00 2001 From: Ivan Zubenko Date: Thu, 7 Dec 2023 10:38:18 +0200 Subject: [PATCH] upgrade kube-prometheus-stack to 54.2.2 and grafana to 7.0.11 --- charts/platform-reports/Chart.lock | 8 +- charts/platform-reports/Chart.yaml | 5 +- charts/platform-reports/README.md | 21 ++ .../dashboards/cluster/jobs.json | 12 +- .../dashboards/cluster/nodes.json | 270 ------------------ .../dashboards/cluster/services.json | 4 +- .../dashboards/org/org_jobs.json | 12 +- .../dashboards/project/job.json | 8 +- .../dashboards/project/project_jobs.json | 12 +- .../templates/metrics-exporter.yaml | 4 +- charts/platform-reports/values-dev.yaml | 9 +- charts/platform-reports/values.yaml | 223 ++++++--------- platform_reports/prometheus_grammars.py | 3 +- setup.cfg | 2 +- tests/unit/test_prometheus.py | 2 +- 15 files changed, 147 insertions(+), 448 deletions(-) create mode 100644 charts/platform-reports/README.md diff --git a/charts/platform-reports/Chart.lock b/charts/platform-reports/Chart.lock index 37183a16..dd73cb6a 100644 --- a/charts/platform-reports/Chart.lock +++ b/charts/platform-reports/Chart.lock @@ -1,15 +1,15 @@ dependencies: - name: kube-prometheus-stack repository: https://prometheus-community.github.io/helm-charts - version: 10.3.5 + version: 54.2.2 - name: thanos repository: https://kubernetes-charts.banzaicloud.com version: 0.3.23 - name: grafana repository: https://grafana.github.io/helm-charts - version: 6.2.2 + version: 7.0.11 - name: k8s-resources repository: https://neuro-inc.github.io/helm-charts version: 1.0.1 -digest: sha256:b25d48d44fdfb6eceda35ffba5f7bd26606ff00e07204cfa694f478dede4171a -generated: "2023-06-29T18:20:05.811893947+03:00" +digest: sha256:2b600ad528ed1ed4c5fb06ed20ac718faf9d3213a7ff706b932c5e3a1bfd8314 +generated: "2023-11-30T11:51:27.16429011+02:00" diff --git a/charts/platform-reports/Chart.yaml b/charts/platform-reports/Chart.yaml index bedb60bf..da94796c 100644 --- 
a/charts/platform-reports/Chart.yaml +++ b/charts/platform-reports/Chart.yaml @@ -3,16 +3,17 @@ name: platform-reports description: A Helm chart for platform-reports service version: 1.0.0 appVersion: 1.0.0 +kubeVersion: ">=1.21.0-0" dependencies: - name: kube-prometheus-stack - version: "10.3.5" + version: "54.2.2" repository: "https://prometheus-community.github.io/helm-charts" - name: thanos version: "0.3.23" repository: "https://kubernetes-charts.banzaicloud.com" condition: objectStore.supported - name: grafana - version: "6.2.2" + version: "7.0.11" repository: "https://grafana.github.io/helm-charts" - name: k8s-resources version: "1.0.1" diff --git a/charts/platform-reports/README.md b/charts/platform-reports/README.md new file mode 100644 index 00000000..998b2b64 --- /dev/null +++ b/charts/platform-reports/README.md @@ -0,0 +1,21 @@ +# platform-reports + +## Upgrading chart + +### To 23.12+ + +1. Remove `prometheus-node-exporter` DaemonSet before the upgrade since selector labels were changed in child helm chart. + +1. 
Run these commands to update the CRDs before applying the upgrade: + ```shell + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml + ``` diff --git 
a/charts/platform-reports/dashboards/cluster/jobs.json b/charts/platform-reports/dashboards/cluster/jobs.json index 69ee52f7..aa4421ac 100644 --- a/charts/platform-reports/dashboards/cluster/jobs.json +++ b/charts/platform-reports/dashboards/cluster/jobs.json @@ -239,7 +239,7 @@ "pluginVersion": "7.3.5", "targets": [ { - "expr": "sum by (label_platform_neuromation_io_org,label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_org,label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (label_platform_neuromation_io_org,label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_org,label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset) 
kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -248,7 +248,7 @@ "refId": "A" }, { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -389,7 +389,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"} * on(pod) 
kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "format": "time_series", "hide": false, "interval": "", @@ -495,7 +495,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (sum by(pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum (sum by(pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "hide": false, "interval": "", "intervalFactor": 1, @@ -723,7 +723,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) 
(kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "time_series", "hide": false, "interval": "", @@ -829,7 +829,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=~\"^$org_name$\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "hide": false, "interval": "", "intervalFactor": 1, diff --git a/charts/platform-reports/dashboards/cluster/nodes.json b/charts/platform-reports/dashboards/cluster/nodes.json index c07fce3f..95be0db2 100644 --- a/charts/platform-reports/dashboards/cluster/nodes.json +++ b/charts/platform-reports/dashboards/cluster/nodes.json @@ -8452,276 +8452,6 @@ "title": "Hardware Misc", "type": "row" }, - { - "collapsed": true, - "datasource": 
null, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 25 - }, - "id": 296, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": null, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 2, - "fillGradient": 0, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 18 - }, - "hiddenSeries": false, - "id": 297, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum by(name) (irate(node_systemd_socket_accepted_connections_total{instance=\"$instance\",job=\"node-exporter\"}[$__rate_interval]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ name }} Connections", - "refId": "A", - "step": 240 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Systemd Sockets", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "counter", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": null, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 2, - "fillGradient": 0, - 
"gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 18 - }, - "hiddenSeries": false, - "id": 298, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Failed", - "color": "#F2495C" - }, - { - "alias": "Inactive", - "color": "#FF9830" - }, - { - "alias": "Active", - "color": "#73BF69" - }, - { - "alias": "Deactivating", - "color": "#FFCB7D" - }, - { - "alias": "Activating", - "color": "#C8F2C2" - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(node_systemd_units{instance=\"$instance\",job=\"node-exporter\",state=\"activating\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Activating", - "refId": "A", - "step": 240 - }, - { - "expr": "sum(node_systemd_units{instance=\"$instance\",job=\"node-exporter\",state=\"active\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Active", - "refId": "B", - "step": 240 - }, - { - "expr": "sum(node_systemd_units{instance=\"$instance\",job=\"node-exporter\",state=\"deactivating\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Deactivating", - "refId": "C", - "step": 240 - }, - { - "expr": "sum(node_systemd_units{instance=\"$instance\",job=\"node-exporter\",state=\"failed\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Failed", - "refId": "D", - "step": 240 - }, - { - "expr": "sum(node_systemd_units{instance=\"$instance\",job=\"node-exporter\",state=\"inactive\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Inactive", - 
"refId": "E", - "step": 240 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Systemd Units State", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "counter", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "title": "Systemd", - "type": "row" - }, { "collapsed": true, "datasource": null, diff --git a/charts/platform-reports/dashboards/cluster/services.json b/charts/platform-reports/dashboards/cluster/services.json index 6253a80e..b932ebb6 100644 --- a/charts/platform-reports/dashboards/cluster/services.json +++ b/charts/platform-reports/dashboards/cluster/services.json @@ -97,7 +97,7 @@ "refId": "A" }, { - "expr": "max(sum by(pod) (container_spec_cpu_quota{job=\"kubelet\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",container!=\"POD\",image!=\"\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_service=\"$service\"})", + "expr": "max(sum by(pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",container!=\"POD\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_service=\"$service\"})", "hide": false, "interval": "", "legendFormat": "Limit", @@ -212,7 +212,7 @@ "refId": "A" }, { - "expr": "max(sum by(pod) (container_spec_memory_limit_bytes{job=\"kubelet\",container!=\"POD\",image!=\"\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_service=\"$service\"})", + "expr": "max(sum by(pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",container!=\"POD\"}) * on(pod) 
kube_pod_labels{job=\"kube-state-metrics\",label_service=\"$service\"})", "hide": false, "interval": "", "legendFormat": "Limit", diff --git a/charts/platform-reports/dashboards/org/org_jobs.json b/charts/platform-reports/dashboards/org/org_jobs.json index 0350b8f5..ddeb0390 100644 --- a/charts/platform-reports/dashboards/org/org_jobs.json +++ b/charts/platform-reports/dashboards/org/org_jobs.json @@ -214,7 +214,7 @@ "pluginVersion": "7.3.5", "targets": [ { - "expr": "sum by (label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project,label_platform_neuromation_io_user,label_platform_neuromation_io_preset) 
kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -223,7 +223,7 @@ "refId": "A" }, { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -363,7 +363,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (label_platform_neuromation_io_user) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum by (label_platform_neuromation_io_user) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"} * on(pod) 
kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "format": "time_series", "hide": false, "interval": "", @@ -469,7 +469,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum(sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "hide": false, "interval": "", "intervalFactor": 1, @@ -697,7 +697,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) 
(kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "time_series", "hide": false, "interval": "", @@ -803,7 +803,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_user, label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_user, label_platform_neuromation_io_project) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=~\"^$project_name$\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "hide": false, "interval": "", "intervalFactor": 1, diff --git a/charts/platform-reports/dashboards/project/job.json b/charts/platform-reports/dashboards/project/job.json index fddb33ec..5faf9c57 100644 --- a/charts/platform-reports/dashboards/project/job.json +++ b/charts/platform-reports/dashboards/project/job.json @@ -161,7 +161,7 @@ "pluginVersion": "7.3.5", 
"targets": [ { - "expr": "sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum(container_spec_cpu_quota{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) * 100", + "expr": "sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=\"$job_id\",container!=\"POD\"}) * 100", "instant": true, "interval": "", "legendFormat": "", @@ -233,7 +233,7 @@ "pluginVersion": "7.3.5", "targets": [ { - "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) / sum(container_spec_memory_limit_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) * 100", + "expr": "sum(container_memory_working_set_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) / sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=\"$job_id\",container!=\"POD\"}) * 100", "instant": true, "interval": "", "legendFormat": "", @@ -579,7 +579,7 @@ "refId": "A" }, { - "expr": "sum(container_spec_cpu_quota{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) - sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}[$__rate_interval]))", + "expr": "sum(kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=\"$job_id\",container!=\"POD\"}) - sum(irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}[$__rate_interval]))", "interval": "", "legendFormat": "Free", "refId": "B" @@ -689,7 +689,7 @@ "refId": "A" }, { - "expr": 
"sum(container_spec_memory_limit_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"}) - sum(container_memory_working_set_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"})", + "expr": "sum(kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=\"$job_id\",container!=\"POD\"}) - sum(container_memory_working_set_bytes{job=\"kubelet\",pod=\"$job_id\",container!=\"POD\",image!=\"\"})", "interval": "", "legendFormat": "Free", "refId": "B" diff --git a/charts/platform-reports/dashboards/project/project_jobs.json b/charts/platform-reports/dashboards/project/project_jobs.json index 69bdacc6..23afe623 100644 --- a/charts/platform-reports/dashboards/project/project_jobs.json +++ b/charts/platform-reports/dashboards/project/project_jobs.json @@ -210,7 +210,7 @@ "pluginVersion": "7.3.5", "targets": [ { - "expr": "sum by (label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) group_left(label_platform_neuromation_io_user,label_platform_neuromation_io_preset) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (label_platform_neuromation_io_user,label_platform_neuromation_io_preset,pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) 
group_left(label_platform_neuromation_io_user,label_platform_neuromation_io_preset) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -219,7 +219,7 @@ "refId": "A" }, { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "table", "hide": false, "instant": true, @@ -358,7 +358,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"} * on(pod) 
kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "format": "time_series", "hide": false, "interval": "", @@ -464,7 +464,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (sum by(pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"})", + "expr": "sum (sum by(pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"})", "hide": false, "interval": "", "intervalFactor": 1, @@ -692,7 +692,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (container_spec_cpu_quota{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"} / container_spec_cpu_period{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (irate(container_cpu_usage_seconds_total{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}[$__rate_interval])) / sum by (pod) (kube_pod_container_resource_requests{job=\"kube-state-metrics\",resource=\"cpu\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * 
on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "format": "time_series", "hide": false, "interval": "", @@ -798,7 +798,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (container_spec_memory_limit_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", + "expr": "sum by (pod) (container_memory_working_set_bytes{job=\"kubelet\",pod=~\"^job-.+\",container!=\"POD\",image!=\"\"}) / sum by (pod) (kube_pod_container_resource_limits{job=\"kube-state-metrics\",resource=\"memory\",pod=~\"^job-.+\",container!=\"POD\"}) * 100 * on(pod) kube_pod_labels{job=\"kube-state-metrics\",label_platform_neuromation_io_org=\"$org_name\",label_platform_neuromation_io_project=\"$project_name\",label_platform_neuromation_io_user=~\"^$user_name$\"}", "hide": false, "interval": "", "intervalFactor": 1, diff --git a/charts/platform-reports/templates/metrics-exporter.yaml b/charts/platform-reports/templates/metrics-exporter.yaml index c1a4fc16..d44f3f96 100644 --- a/charts/platform-reports/templates/metrics-exporter.yaml +++ b/charts/platform-reports/templates/metrics-exporter.yaml @@ -175,8 +175,9 @@ spec: kind: Service apiVersion: v1 metadata: - name: neuro-metrics-exporter # name is fixed as it is used grafana dashboards + name: {{ include "platformReports.metricsExporter.fullname" . }} labels: {{ include "platformReports.labels.standard" . 
| nindent 4 }} + job: neuro-metrics-exporter service: platform-metrics-exporter spec: selector: @@ -200,6 +201,7 @@ spec: app: {{ include "platformReports.name" . }} release: {{ .Release.Name | quote }} service: platform-metrics-exporter + jobLabel: job endpoints: - port: metrics path: /metrics diff --git a/charts/platform-reports/values-dev.yaml b/charts/platform-reports/values-dev.yaml index 0b8dd757..e45213d9 100644 --- a/charts/platform-reports/values-dev.yaml +++ b/charts/platform-reports/values-dev.yaml @@ -46,19 +46,24 @@ kube-prometheus-stack: prometheusSpec: thanos: objectStorageConfig: - name: thanos - key: object-store.yaml + existingSecret: + name: thanos + key: object-store.yaml + storageSpec: volumeClaimTemplate: spec: storageClassName: standard-rwo + externalLabels: cluster: "" + priorityClassName: "" prometheusOperator: kubeletService: namespace: platform + priorityClassName: "" kubelet: diff --git a/charts/platform-reports/values.yaml b/charts/platform-reports/values.yaml index cb1cc4a8..aa0c167c 100644 --- a/charts/platform-reports/values.yaml +++ b/charts/platform-reports/values.yaml @@ -1,7 +1,3 @@ -global: - rbac: - pspEnabled: false - nodePoolLabels: job: platform.neuromation.io/job gpu: platform.neuromation.io/accelerator @@ -66,12 +62,12 @@ kube-prometheus-stack: defaultRules: create: false - alertmanager: - enabled: false - kubeApiServer: enabled: false + kubelet: + enabled: true + kubeControllerManager: enabled: false @@ -93,122 +89,62 @@ kube-prometheus-stack: kubeStateMetrics: enabled: true - serviceMonitor: - interval: 15s - - metricRelabelings: - - sourceLabels: - - label_beta_kubernetes_io_instance_type - targetLabel: label_node_kubernetes_io_instance_type - kube-state-metrics: nameOverride: kube-state-metrics fullnameOverride: kube-state-metrics image: - repository: quay.io/coreos/kube-state-metrics - tag: v1.9.7 + registry: registry.k8s.io + repository: kube-state-metrics/kube-state-metrics + tag: v2.10.1 customLabels: service: 
kube-state-metrics + selectorOverride: + app.kubernetes.io/name: kube-state-metrics + resources: requests: cpu: 100m memory: 200Mi - collectors: - # enabled - nodes: true - pods: true - - # disabled - certificatesigningrequests: false - configmaps: false - cronjobs: false - daemonsets: false - deployments: false - endpoints: false - horizontalpodautoscalers: false - ingresses: false - jobs: false - limitranges: false - mutatingwebhookconfigurations: false - namespaces: false - networkpolicies: false - persistentvolumeclaims: false - persistentvolumes: false - poddisruptionbudgets: false - replicasets: false - replicationcontrollers: false - resourcequotas: false - secrets: false - services: false - statefulsets: false - storageclasses: false - validatingwebhookconfigurations: false - verticalpodautoscalers: false - volumeattachments: false - - podSecurityPolicy: - enabled: false + prometheus: + monitor: + metricRelabelings: + - sourceLabels: + - label_beta_kubernetes_io_instance_type + targetLabel: label_node_kubernetes_io_instance_type - priorityClassName: "" + collectors: + - nodes + - pods - kubelet: - enabled: true + metricLabelsAllowlist: + - nodes=[platform.neuromation.io/nodepool,platform.neuromation.io/job] + - pods=[service,platform.neuromation.io/org,platform.neuromation.io/project,platform.neuromation.io/user,platform.neuromation.io/preset,platform.neuromation.io/gpu-model] - serviceMonitor: - interval: 15s - https: true - probes: false - resource: false - cAdvisor: true + priorityClassName: "" nodeExporter: enabled: true - resources: - requests: - cpu: 15m - memory: 50Mi - - serviceMonitor: - interval: 15s - prometheus-node-exporter: nameOverride: prometheus-node-exporter fullnameOverride: prometheus-node-exporter image: - repository: quay.io/prometheus/node-exporter - tag: v1.0.1 + registry: quay.io + repository: prometheus/node-exporter + tag: v1.7.0 podLabels: service: prometheus-node-exporter - securityContext: - runAsUser: 0 - runAsNonRoot: false - 
- rbac: - pspEnabled: false - - extraHostVolumeMounts: - - name: system-dbus-socket - hostPath: /var/run/dbus/system_bus_socket - mountPath: /var/run/dbus/system_bus_socket - readOnly: true - - name: systemd-private - hostPath: /run/systemd/private - mountPath: /run/systemd/private - readOnly: true - - extraArgs: - - --collector.systemd - - --collector.processes - - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + resources: + requests: + cpu: 15m + memory: 50Mi updateStrategy: type: RollingUpdate @@ -217,37 +153,31 @@ kube-prometheus-stack: prometheusOperator: image: - repository: quay.io/coreos/prometheus-operator + registry: quay.io + repository: prometheus-operator/prometheus-operator + tag: v0.69.1 podLabels: service: prometheus-operator - prometheusConfigReloaderImage: - repository: quay.io/coreos/prometheus-config-reloader - - configmapReloadImage: - repository: docker.io/jimmidyson/configmap-reload - - kubectlImage: - repository: docker.io/bitnami/kubectl - - tlsProxy: + prometheusConfigReloader: image: - repository: squareup/ghostunnel + registry: quay.io + repository: prometheus-operator/prometheus-config-reloader + tag: v0.69.1 + + thanosImage: + registry: quay.io + repository: thanos/thanos + tag: v0.24.0 admissionWebhooks: patch: image: - repository: k8s.gcr.io/ingress-nginx/kube-webhook-certgen - tag: v1.1.1 + registry: registry.k8s.io + repository: ingress-nginx/kube-webhook-certgen + tag: v20221220-controller-v1.5.1-58-g787ea74b6 priorityClassName: "" - logLevel: error - - createCustomResource: false - cleanupCustomResource: false - - kubeletService: - enabled: true serviceMonitor: selfMonitor: false @@ -255,26 +185,31 @@ kube-prometheus-stack: priorityClassName: "" prometheus: - podSecurityPolicy: - enabled: false - 
serviceMonitor: selfMonitor: false prometheusSpec: image: - repository: quay.io/prometheus/prometheus + registry: quay.io + repository: prometheus/prometheus + tag: v2.48.0 + podMetadata: labels: service: prometheus + replicas: 1 + retention: 12h + scrapeInterval: 15s evaluationInterval: 15s + resources: requests: cpu: "500m" memory: "1Gi" + storageSpec: volumeClaimTemplate: metadata: @@ -284,14 +219,26 @@ kube-prometheus-stack: resources: requests: storage: 10Gi - logLevel: error - # thanos sidecar - thanos: - image: quay.io/thanos/thanos:v0.24.0 - version: v0.24.0 - logLevel: error + externalLabels: cluster: "" + + priorityClassName: "" + + alertmanager: + serviceMonitor: + selfMonitor: false + + alertmanagerSpec: + image: + registry: quay.io + repository: prometheus/alertmanager + tag: v0.26.0 + + podMetadata: + labels: + service: alert-manager + priorityClassName: "" grafana: @@ -324,7 +271,6 @@ thanos: storage: 10Gi labels: service: thanos-store - logLevel: error # Compact service works with data in object storage compact: @@ -351,17 +297,14 @@ thanos: storage: 10Gi labels: service: thanos-compact - logLevel: error query: labels: service: thanos-query - logLevel: error bucket: labels: service: thanos-bucket - logLevel: error rule: enabled: false @@ -370,7 +313,8 @@ thanos: # NOTE: This is only the service references for the sidecar enabled: true selector: - app: prometheus + app: null + app.kubernetes.io/name: prometheus priorityClassName: "" @@ -379,7 +323,9 @@ grafana: fullnameOverride: grafana image: + registry: docker.io repository: grafana/grafana + tag: 10.2.2 podLabels: service: grafana @@ -389,9 +335,6 @@ grafana: ingress: enabled: false - rbac: - pspEnabled: false - initChownData: image: repository: busybox @@ -399,22 +342,15 @@ grafana: sidecar: image: + registry: quay.io repository: kiwigrid/k8s-sidecar - tag: 1.22.0 - - logLevel: WARN - - datasources: - enabled: false - - notifiers: - enabled: false + tag: 1.25.2 dashboards: enabled: true 
SCProvider: false - watchMethod: SLEEP + watchMethod: WATCH label: grafana_dashboard folderAnnotation: grafana_folder folder: /etc/grafana/dashboards @@ -443,9 +379,12 @@ grafana: access: proxy isDefault: true jsonData: + httpMethod: GET + manageAlerts: false keepCookies: - dat - sat + prometheusType: Thanos persistence: enabled: false diff --git a/platform_reports/prometheus_grammars.py b/platform_reports/prometheus_grammars.py index caa1b669..023d7872 100644 --- a/platform_reports/prometheus_grammars.py +++ b/platform_reports/prometheus_grammars.py @@ -258,7 +258,8 @@ : "'" /([^'\\\\]|\\\\.)*/ "'" | "\\"" /([^\\"\\\\]|\\\\.)*/ "\\"" -DURATION: DIGIT+ ("s" | "m" | "h" | "d" | "w" | "y") +DURATION: DIGIT+ ("s" | "m" | "h" | "d" | "w" | "y") \ + (DIGIT+ ("s" | "m" | "h" | "d" | "w" | "y"))* METRIC_NAME: (LETTER | "_" | ":") (DIGIT | LETTER | "_" | ":")* diff --git a/setup.cfg b/setup.cfg index 7c28d29d..7bc9f84d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ platforms = any install_requires = neuro-auth-client==22.6.1 neuro-config-client==23.3.0 - neuro-sdk==23.7.0 + neuro-sdk==23.2.0 neuro-logging==21.12.2 aiohttp==3.8.4 python-jose==3.3.0 diff --git a/tests/unit/test_prometheus.py b/tests/unit/test_prometheus.py index e25e8356..1495fa62 100644 --- a/tests/unit/test_prometheus.py +++ b/tests/unit/test_prometheus.py @@ -96,7 +96,7 @@ def test_instant_vector_with_empty_label_matcher_values(self) -> None: ) def test_range_vector_with_interval(self) -> None: - result = parse_query("irate(container_cpu_usage_seconds_total[5m])") + result = parse_query("irate(container_cpu_usage_seconds_total[5m0s])") assert result == InstantVector(name="container_cpu_usage_seconds_total") def test_instant_vector_with_subquery(self) -> None: