From 3c921ef6df649bf27db4b2a5763371f51b3258fb Mon Sep 17 00:00:00 2001 From: Stephen Lang Date: Fri, 19 Apr 2024 14:56:53 +0100 Subject: [PATCH] fix(dashboards): Port networking/cluster dashboard to new grafonnet library --- config.libsonnet | 28 +- .../network-usage/cluster-total.libsonnet | 757 +++++++----------- dashboards/resources/multi-cluster.libsonnet | 2 + 3 files changed, 290 insertions(+), 497 deletions(-) diff --git a/config.libsonnet b/config.libsonnet index 43c4a9822..b6af96e1f 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -34,29 +34,29 @@ // Grafana dashboard IDs are necessary for stable links for dashboards grafanaDashboardIDs: { - 'k8s-resources-multicluster.json': std.md5('k8s-resources-multicluster.json'), + 'apiserver.json': std.md5('apiserver.json'), + 'cluster-total.json': std.md5('cluster-total.json'), + 'controller-manager.json': std.md5('controller-manager.json'), 'k8s-resources-cluster.json': std.md5('k8s-resources-cluster.json'), + 'k8s-resources-multicluster.json': std.md5('k8s-resources-multicluster.json'), 'k8s-resources-namespace.json': std.md5('k8s-resources-namespace.json'), + 'k8s-resources-node.json': std.md5('k8s-resources-node.json'), 'k8s-resources-pod.json': std.md5('k8s-resources-pod.json'), - 'k8s-multicluster-rsrc-use.json': std.md5('k8s-multicluster-rsrc-use.json'), - 'k8s-cluster-rsrc-use.json': std.md5('k8s-cluster-rsrc-use.json'), - 'k8s-node-rsrc-use.json': std.md5('k8s-node-rsrc-use.json'), - 'nodes.json': std.md5('nodes.json'), - 'persistentvolumesusage.json': std.md5('persistentvolumesusage.json'), - 'pods.json': std.md5('pods.json'), - 'statefulset.json': std.md5('statefulset.json'), 'k8s-resources-windows-cluster.json': std.md5('k8s-resources-windows-cluster.json'), 'k8s-resources-windows-namespace.json': std.md5('k8s-resources-windows-namespace.json'), 'k8s-resources-windows-pod.json': std.md5('k8s-resources-windows-pod.json'), + 'k8s-resources-workload.json': std.md5('k8s-resources-workload.json'), + 'k8s-resources-workloads-namespace.json': std.md5('k8s-resources-workloads-namespace.json'), 'k8s-windows-cluster-rsrc-use.json': std.md5('k8s-windows-cluster-rsrc-use.json'), 'k8s-windows-node-rsrc-use.json': std.md5('k8s-windows-node-rsrc-use.json'), - 'k8s-resources-workloads-namespace.json': std.md5('k8s-resources-workloads-namespace.json'), - 'k8s-resources-workload.json': std.md5('k8s-resources-workload.json'), - 'apiserver.json': std.md5('apiserver.json'), - 'controller-manager.json': std.md5('controller-manager.json'), - 'scheduler.json': std.md5('scheduler.json'), - 'proxy.json': std.md5('proxy.json'), 'kubelet.json': std.md5('kubelet.json'), + 'namespace-by-pod.json': std.md5('namespace-by-pod.json'), + 'namespace-by-workload.json': std.md5('namespace-by-workload.json'), + 'persistentvolumesusage.json': std.md5('persistentvolumesusage.json'), + 'pod-total.json': std.md5('pod-total.json'), + 'proxy.json': std.md5('proxy.json'), + 'scheduler.json': std.md5('scheduler.json'), + 'workload-total.json': std.md5('workload-total.json'), }, // Support for Grafana 7.2+ `$__rate_interval` instead of `$__interval` diff --git a/dashboards/network-usage/cluster-total.libsonnet b/dashboards/network-usage/cluster-total.libsonnet index c0de0943b..ef95bd93d 100644 --- a/dashboards/network-usage/cluster-total.libsonnet +++ b/dashboards/network-usage/cluster-total.libsonnet @@ -1,513 +1,304 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local annotation = grafana.annotation; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local prometheus = g.query.prometheus; +local table = g.panel.table; +local timeSeries = g.panel.timeSeries; +local var = g.dashboard.variable; { - grafanaDashboards+:: { + local tsPanel = + timeSeries { + new(title): + timeSeries.new(title) + + timeSeries.options.legend.withShowLegend() + + timeSeries.options.legend.withAsTable() + + timeSeries.options.legend.withDisplayMode('table') + + timeSeries.options.legend.withPlacement('right') + + timeSeries.options.tooltip.withMode('single') + + timeSeries.fieldConfig.defaults.custom.withShowPoints('never') + + timeSeries.queryOptions.withInterval($._config.grafanaK8s.minimumTimeInterval), + }, + grafanaDashboards+:: { 'cluster-total.json': + local variables = { + datasource: + var.datasource.new('datasource', 'prometheus') + + var.datasource.withRegex($._config.datasourceFilterRegex) + + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() + + var.datasource.generalOptions.withLabel('Data source') + + { + current: { + selected: true, + text: $._config.datasourceName, + value: $._config.datasourceName, + }, + }, - local newStyle( - alias, - colorMode=null, - colors=[], - dateFormat='YYYY-MM-DD HH:mm:ss', - decimals=2, - link=false, - linkTooltip='Drill down', - linkUrl='', - thresholds=[], - type='number', - unit='short' - ) = { - alias: alias, - colorMode: colorMode, - colors: colors, - dateFormat: dateFormat, - decimals: decimals, - link: link, - linkTooltip: linkTooltip, - linkUrl: linkUrl, - thresholds: thresholds, - type: type, - unit: unit, + cluster: + var.query.new('cluster') + + var.query.withDatasourceFromVariable(self.datasource) + + var.query.queryTypes.withLabelValues( + $._config.clusterLabel, + 'up{%(cadvisorSelector)s}' % $._config, + ) + + var.query.generalOptions.withLabel('cluster') + + var.query.refresh.onTime() + + ( + if $._config.showMultiCluster + then var.query.generalOptions.showOnDashboard.withLabelAndValue() + else var.query.generalOptions.showOnDashboard.withNothing() + ) + + var.query.withSort(type='alphabetical'), }; - local newBarplotPanel(graphTitle, graphQuery, graphFormat='Bps', legendFormat='{{namespace}}') = - local target = - prometheus.target( - graphQuery - ) + { - intervalFactor: 1, - legendFormat: legendFormat, - step: 10, - }; - - graphPanel.new( - title=graphTitle, - span=24, - datasource='$datasource', - fill=2, - min_span=24, - format=graphFormat, - min=0, - max=null, - show_xaxis=false, - x_axis_mode='series', - x_axis_values='current', - lines=false, - bars=true, - stack=false, - legend_show=true, - legend_values=true, - legend_min=false, - legend_max=false, - legend_current=true, - legend_avg=false, - legend_alignAsTable=true, - legend_rightSide=true, - legend_sort='current', - legend_sortDesc=true, - nullPointMode='null' - ).addTarget(target) + { - legend+: { - hideEmpty: true, - hideZero: true, + local links = { + namespace: { + title: 'Drill down', + url: '%(prefix)s/d/%(uid)s/kubernetes-networking-namespace-pods?${datasource:queryparam}&var-cluster=${cluster}&var-namespace=${__data.fields.Namespace}' % { + uid: $._config.grafanaDashboardIDs['namespace-by-pod.json'], + prefix: $._config.grafanaK8s.linkPrefix, }, - paceLength: 10, - tooltip+: { - sort: 2, - }, - }; + }, + }; - local newGraphPanel(graphTitle, graphQuery, graphFormat='Bps', legendFormat='{{namespace}}') = - local target = - prometheus.target( - graphQuery - ) + { - intervalFactor: 1, - legendFormat: legendFormat, - step: 10, - }; + local panels = [ + tsPanel.new('Current Rate of Bytes Received') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'sum by (namespace) (rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - graphPanel.new( - title=graphTitle, - span=24, - datasource='$datasource', - fill=2, - linewidth=2, - min_span=24, - format=graphFormat, - min=0, - max=null, - x_axis_mode='time', - x_axis_values='total', - lines=true, - stack=true, - legend_show=true, - legend_values=true, - legend_min=true, - legend_max=true, - legend_current=true, - legend_avg=true, - legend_alignAsTable=true, - legend_rightSide=true, - nullPointMode='connected' - ).addTarget(target) + { - legend+: { - hideEmpty: true, - hideZero: true, - }, - paceLength: 10, - tooltip+: { - sort: 2, - }, - }; + tsPanel.new('Current Rate of Bytes Transmitted') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'sum by (namespace) (rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - local newTablePanel(tableTitle, colQueries) = - local buildTarget(index, colQuery) = - prometheus.target( - colQuery, - format='table', - instant=true, - ) + { - legendFormat: '', - step: 10, - refId: std.char(65 + index), - }; + table.new('Current Status') + + table.gridPos.withW(24) + + table.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), - local targets = std.mapWithIndex(buildTarget, colQueries); + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), - tablePanel.new( - title=tableTitle, - span=24, - min_span=24, - datasource='$datasource', - ) - .addColumn( - field='Time', - style=newStyle( - alias='Time', - type='hidden' - ) - ) - .addColumn( - field='Value #A', - style=newStyle( - alias='Current Bandwidth Received', - unit='Bps', - ), - ) - .addColumn( - field='Value #B', - style=newStyle( - alias='Current Bandwidth Transmitted', - unit='Bps', - ), - ) - .addColumn( - field='Value #C', - style=newStyle( - alias='Average Bandwidth Received', - unit='Bps', - ), - ) - .addColumn( - field='Value #D', - style=newStyle( - alias='Average Bandwidth Transmitted', - unit='Bps', - ), - ) - .addColumn( - field='Value #E', - style=newStyle( - alias='Rate of Received Packets', - unit='pps', - ), - ) - .addColumn( - field='Value #F', - style=newStyle( - alias='Rate of Transmitted Packets', - unit='pps', - ), - ) - .addColumn( - field='Value #G', - style=newStyle( - alias='Rate of Received Packets Dropped', - unit='pps', - ), - ) - .addColumn( - field='Value #H', - style=newStyle( - alias='Rate of Transmitted Packets Dropped', - unit='pps', - ), - ) - .addColumn( - field='namespace', - style=newStyle( - alias='Namespace', - link=true, - linkUrl='d/8b7a8b326d7a6f1f04244066368c67af/kubernetes-networking-namespace-pods?orgId=1&refresh=30s&var-namespace=$__cell', - ), - ) + { + prometheus.new('${datasource}', 'avg by (namespace) (rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), - fill: 1, - fontSize: '90%', - lines: true, - linewidth: 1, - nullPointMode: 'null as zero', - renderer: 'flot', - scroll: true, - showHeader: true, - spaceLength: 10, - sort: { - col: 0, - desc: false, - }, - targets: targets, - }; + prometheus.new('${datasource}', 'avg by (namespace) (rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), + + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), + + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), + + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), - local resolutionTemplate = - template.new( - name='resolution', - datasource='$datasource', - query='30s,5m,1h', - current='5m', - hide='', - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: false, - text: '30s', - value: '30s', + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withInstant(true) + + prometheus.withFormat('table'), + ]) + + table.queryOptions.withTransformations([ + table.queryOptions.transformation.withId('joinByField') + + table.queryOptions.transformation.withOptions({ + byField: 'namespace', + mode: 'outer', + }), + + g.panel.table.queryOptions.transformation.withId('organize') + + g.panel.table.queryOptions.transformation.withOptions({ + excludeByName: { + 'Time 1': true, + 'Time 2': true, + 'Time 3': true, + 'Time 4': true, + 'Time 5': true, + 'Time 6': true, + 'Time 7': true, + 'Time 8': true, }, - { - selected: true, - text: '5m', - value: '5m', + indexByName: { + 'Time 1': 0, + 'Time 2': 1, + 'Time 3': 2, + 'Time 4': 3, + 'Time 5': 4, + 'Time 6': 5, + 'Time 7': 6, + 'Time 8': 7, + namespace: 8, + 'Value #A': 9, + 'Value #B': 10, + 'Value #C': 11, + 'Value #D': 12, + 'Value #E': 13, + 'Value #F': 14, + 'Value #G': 15, + 'Value #H': 16, }, - { - selected: false, - text: '1h', - value: '1h', + renameByName: { + namespace: 'Namespace', + 'Value #A': 'Rx Bytes', + 'Value #B': 'Tx Bytes', + 'Value #C': 'Rx Bytes (Avg)', + 'Value #D': 'Tx Bytes (Avg)', + 'Value #E': 'Rx Packets', + 'Value #F': 'Tx Packets', + 'Value #G': 'Rx Packets Dropped', + 'Value #H': 'Tx Packets Dropped', }, - ], - }; + }), + ]) - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', + + table.standardOptions.withOverrides([ + { + matcher: { + id: 'byRegexp', + options: '/Bytes/', + }, + properties: [ + { + id: 'unit', + value: 'binBps', + }, + ], + }, + { + matcher: { + id: 'byRegexp', + options: '/Packets/', }, - ], - }; + properties: [ + { + id: 'unit', + value: 'pps', + }, + ], + }, + { + matcher: { + id: 'byName', + options: 'Namespace', + }, + properties: [ + { + id: 'links', + value: [links.namespace], + }, + ], + }, + ]), - //##### Current Bandwidth Row ###### + tsPanel.new('Average Rate of Bytes Received') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'avg by (namespace) (rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - local currentBandwidthRow = - row.new( - title='Current Bandwidth' - ); + tsPanel.new('Average Rate of Bytes Transmitted') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'avg by (namespace) (rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - //##### Average Bandwidth Row ###### + tsPanel.new('Receive Bandwidth') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'sum by (namespace) (rate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - local averageBandwidthRow = - row.new( - title='Average Bandwidth', - collapse=true, - ); + tsPanel.new('Transmit Bandwidth') + + tsPanel.standardOptions.withUnit('binBps') + + tsPanel.queryOptions.withTargets([ + prometheus.new( + '${datasource}', + 'sum by (namespace) (rate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + ) + + prometheus.withLegendFormat('__auto'), + ]), - //##### Bandwidth History Row ###### - local bandwidthHistoryRow = - row.new( - title='Bandwidth History' - ); + tsPanel.new('Rate of Received Packets') + + tsPanel.standardOptions.withUnit('pps') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), - //##### Packet Row ###### - // collapsed, so row must include panels - local packetRow = - row.new( - title='Packets', - collapse=true, - ); + tsPanel.new('Rate of Transmitted Packets') + + tsPanel.standardOptions.withUnit('pps') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), - //##### Error Row ###### - // collapsed, so row must include panels - local errorRow = - row.new( - title='Errors', - collapse=true, - ); - local clusterTemplate = - template.new( - name='cluster', - datasource='$datasource', - query='label_values(up{%(cadvisorSelector)s}, %(clusterLabel)s)' % $._config, - hide=if $._config.showMultiCluster then '' else '2', - refresh=2 - ); + tsPanel.new('Rate of Received Packets Dropped') + + tsPanel.standardOptions.withUnit('pps') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), - dashboard.new( - title='%(dashboardNamePrefix)sNetworking / Cluster' % $._config.grafanaK8s, - tags=($._config.grafanaK8s.dashboardTags), - editable=true, - schemaVersion=18, - refresh=($._config.grafanaK8s.refresh), - time_from='now-1h', - time_to='now', - ) - .addTemplate(resolutionTemplate) - .addTemplate(intervalTemplate) - .addAnnotation(annotation.default) - .addPanel( - currentBandwidthRow, gridPos={ h: 1, w: 24, x: 0, y: 0 } - ) - .addTemplate( - { - current: { - selected: true, - text: $._config.datasourceName, - value: $._config.datasourceName, - }, - hide: 0, - label: 'Data source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: $._config.datasourceFilterRegex, - type: 'datasource', - }, - ) - .addTemplate(clusterTemplate) - .addPanel( - newBarplotPanel( - graphTitle='Current Rate of Bytes Received', - graphQuery='sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 12, x: 0, y: 1 } - ) - .addPanel( - newBarplotPanel( - graphTitle='Current Rate of Bytes Transmitted', - graphQuery='sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 12, x: 12, y: 1 } - ) - .addPanel( - newTablePanel( - tableTitle='Current Status', - colQueries=[ - 'sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - 'sort_desc(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ] - ), - gridPos={ h: 9, w: 24, x: 0, y: 10 } - ) - .addPanel( - averageBandwidthRow - .addPanel( - newBarplotPanel( - graphTitle='Average Rate of Bytes Received', - graphQuery='sort_desc(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 12, x: 0, y: 11 } - ) - .addPanel( - newBarplotPanel( - graphTitle='Average Rate of Bytes Transmitted', - graphQuery='sort_desc(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 12, x: 12, y: 11 } - ), - gridPos={ h: 1, w: 24, x: 0, y: 10 }, - ) - .addPanel( - bandwidthHistoryRow, gridPos={ h: 1, w: 24, x: 0, y: 11 } - ) - .addPanel( - newGraphPanel( - graphTitle='Receive Bandwidth', - graphQuery='sort_desc(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 24, x: 0, y: 12 } - ) - .addPanel( - newGraphPanel( - graphTitle='Transmit Bandwidth', - graphQuery='sort_desc(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - ), - gridPos={ h: 9, w: 24, x: 0, y: 21 } - ) - .addPanel( - packetRow - .addPanel( - newGraphPanel( - graphTitle='Rate of Received Packets', - graphQuery='sort_desc(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - graphFormat='pps' - ), - gridPos={ h: 9, w: 24, x: 0, y: 31 } - ) - .addPanel( - newGraphPanel( - graphTitle='Rate of Transmitted Packets', - graphQuery='sort_desc(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - graphFormat='pps' - ), - gridPos={ h: 9, w: 24, x: 0, y: 40 } - ), - gridPos={ h: 1, w: 24, x: 0, y: 30 } - ) - .addPanel( - errorRow - .addPanel( - newGraphPanel( - graphTitle='Rate of Received Packets Dropped', - graphQuery='sort_desc(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - graphFormat='pps' - ), - gridPos={ h: 9, w: 24, x: 0, y: 50 } - ) - .addPanel( - newGraphPanel( - graphTitle='Rate of Transmitted Packets Dropped', - graphQuery='sort_desc(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace=~".+"}[$interval:$resolution])) by (namespace))' % $._config, - graphFormat='pps' - ), - gridPos={ h: 9, w: 24, x: 0, y: 59 } - ) - .addPanel( - newGraphPanel( - graphTitle='Rate of TCP Retransmits out of all sent segments', - graphQuery='sort_desc(sum(rate(node_netstat_Tcp_RetransSegs{%(clusterLabel)s="$cluster"}[$interval:$resolution]) / rate(node_netstat_Tcp_OutSegs{%(clusterLabel)s="$cluster"}[$interval:$resolution])) by (instance))' % $._config, - graphFormat='percentunit', - legendFormat='{{instance}}' - ) + { links: [ - { - url: 'https://accedian.com/enterprises/blog/network-packet-loss-retransmissions-and-duplicate-acknowledgements/', - title: 'What is TCP Retransmit?', - targetBlank: true, - }, - ] }, - gridPos={ h: 9, w: 24, x: 0, y: 59 } - ).addPanel( - newGraphPanel( - graphTitle='Rate of TCP SYN Retransmits out of all retransmits', - graphQuery='sort_desc(sum(rate(node_netstat_TcpExt_TCPSynRetrans{%(clusterLabel)s="$cluster"}[$interval:$resolution]) / rate(node_netstat_Tcp_RetransSegs{%(clusterLabel)s="$cluster"}[$interval:$resolution])) by (instance))' % $._config, - graphFormat='percentunit', - legendFormat='{{instance}}' - ) + { links: [ - { - url: 'https://github.com/prometheus/node_exporter/issues/1023#issuecomment-408128365', - title: 'Why monitor SYN retransmits?', - targetBlank: true, - }, - ] }, - gridPos={ h: 9, w: 24, x: 0, y: 59 } - ), - gridPos={ h: 1, w: 24, x: 0, y: 31 } - ), + tsPanel.new('Rate of Transmitted Packets Dropped') + + tsPanel.standardOptions.withUnit('pps') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (namespace) (rate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster",namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), + + tsPanel.new('Rate of TCP Retransmits out of all sent segments') + + tsPanel.standardOptions.withUnit('percentunit') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (instance) (rate(node_netstat_Tcp_RetransSegs{%(clusterLabel)s="$cluster"}[%(grafanaIntervalVar)s]) / rate(node_netstat_Tcp_OutSegs{%(clusterLabel)s="$cluster"}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), + + tsPanel.new('Rate of TCP SYN Retransmits out of all retransmits') + + tsPanel.standardOptions.withUnit('percentunit') + + tsPanel.queryOptions.withTargets([ + prometheus.new('${datasource}', 'sum by (instance) (rate(node_netstat_TcpExt_TCPSynRetrans{%(clusterLabel)s="$cluster"}[%(grafanaIntervalVar)s]) / rate(node_netstat_Tcp_RetransSegs{%(clusterLabel)s="$cluster"}[%(grafanaIntervalVar)s]))' % $._config) + + prometheus.withLegendFormat('__auto'), + ]), + ]; + + g.dashboard.new('%(dashboardNamePrefix)sNetworking / Cluster' % $._config.grafanaK8s) + + g.dashboard.withUid($._config.grafanaDashboardIDs['cluster-total.json']) + + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) + + g.dashboard.withEditable(false) + + g.dashboard.time.withFrom('now-1h') + + g.dashboard.time.withTo('now') + + g.dashboard.withRefresh($._config.grafanaK8s.refresh) + + g.dashboard.withVariables([variables.datasource, variables.cluster]) + + g.dashboard.withPanels(g.util.grid.wrapPanels(panels, panelWidth=12, panelHeight=9)), }, } diff --git a/dashboards/resources/multi-cluster.libsonnet b/dashboards/resources/multi-cluster.libsonnet index 4ff4c5afe..7137a3181 100644 --- a/dashboards/resources/multi-cluster.libsonnet +++ b/dashboards/resources/multi-cluster.libsonnet @@ -293,9 +293,11 @@ local var = g.dashboard.variable; g.dashboard.new('%(dashboardNamePrefix)sCompute Resources / Multi-Cluster' % $._config.grafanaK8s) + g.dashboard.withUid($._config.grafanaDashboardIDs['k8s-resources-multicluster.json']) + + g.dashboard.withTags($._config.grafanaK8s.dashboardTags) + g.dashboard.withEditable(false) + g.dashboard.time.withFrom('now-1h') + g.dashboard.time.withTo('now') + + g.dashboard.withRefresh($._config.grafanaK8s.refresh) + g.dashboard.withVariables([variables.datasource]) + g.dashboard.withPanels( g.util.grid.wrapPanels(panels.highlights, panelWidth=4, panelHeight=3, startY=0)