diff --git a/charts/spiderpool/config b/charts/spiderpool/config index e23314188..29b872da4 100644 --- a/charts/spiderpool/config +++ b/charts/spiderpool/config @@ -4,7 +4,7 @@ export USE_OPENSOURCE_CHART=false export REPO_URL=https://spidernet-io.github.io/spiderpool export REPO_NAME=spiderpool export CHART_NAME=spiderpool -export VERSION=1.0.0-rc2 +export VERSION=1.0.0-rc3 # pr, issue, none export UPGRADE_METHOD=pr diff --git a/charts/spiderpool/custom.sh b/charts/spiderpool/custom.sh index f065505e8..c263ecdde 100755 --- a/charts/spiderpool/custom.sh +++ b/charts/spiderpool/custom.sh @@ -58,6 +58,7 @@ yq -i ' .spiderpool.clusterDefaultPool.ipv6Gateway="fd00::1" | .spiderpool.clusterDefaultPool.ipv4IPRanges = ["192.168.0.10-192.168.0.100"] + .spiderpool.clusterDefaultPool.ipv4IPRanges | .spiderpool.clusterDefaultPool.ipv6IPRanges = ["fd00::10-fd00::100"] + .spiderpool.clusterDefaultPool.ipv6IPRanges | + .spiderpool.spiderpoolAgent.prometheus.serviceMonitor.labels."operator.insight.io/managed-by"="insight" | .spiderpool.spiderpoolAgent.image.registry="ghcr.m.daocloud.io" | .spiderpool.spiderpoolAgent.resources.requests.cpu=strenv(CUSTOM_SPIDERPOOL_AGENT_CPU) | .spiderpool.spiderpoolAgent.resources.requests.memory=strenv(CUSTOM_SPIDERPOOL_AGENT_MEMORY) | @@ -66,13 +67,16 @@ yq -i ' .spiderpool.spiderpoolController.resources.requests.memory=strenv(CUSTOM_SPIDERPOOL_CONTROLLER_MEMORY) | .spiderpool.spiderpoolController.tolerations[0].effect = "NoSchedule" | .spiderpool.spiderpoolController.podResourceInject.enabled=true | - .spiderpool.spiderpoolController.podResourceInject.namespacesExclude= 
["insight-system","mcamel-system","amamba-system","argocd","baize-system","ghippo-system","gpu-operator","dowl-system","hwameistor","insight-system","kairship-system","kangaroo-system","kant-system","kcollie-system","kcoral-system","kolm-system","kpanda-system","kubean-system","local-path-storage","metax","mspider-system","nvidia-gpu-operator","skoala-system","spidernet-system","virtnest-system","ipavo-system"] + .spiderpool.spiderpoolController.podResourceInject.namespacesExclude | + .spiderpool.spiderpoolController.podResourceInject.namespacesExclude= ["insight-system","mcamel-system","amamba-system","argocd","baize-system","ghippo-system","gpu-operator","dowl-system","hwameistor","insight-system","kairship-system","kangaroo-system","kant-system","kcollie-system","kcoral-system","kolm-system","kpanda-system","kubean-system","local-path-storage","mspider-system","nvidia-gpu-operator","skoala-system","spidernet-system","virtnest-system","ipavo-system"] + .spiderpool.spiderpoolController.podResourceInject.namespacesExclude | .spiderpool.spiderpoolInit.image.registry="ghcr.m.daocloud.io" | .spiderpool.plugins.image.registry="ghcr.m.daocloud.io" | .spiderpool.rdma.rdmaSharedDevicePlugin.image.registry="ghcr.m.daocloud.io" | - .spiderpool.sriov.image.resourcesInjector.tag="v1.5" + .spiderpool.sriov.image.resourcesInjector.tag="v1.5" | + .spiderpool.grafanaDashboard.labels."operator.insight.io/managed-by"="insight" ' ${CHART_DIRECTORY}/values.yaml +# `.spiderpool.sriov.image.resourcesInjector.tag="v1.5"` is used as a fallback because resourcesInjector v1.6.0 does not include an ARM64 image. + if ! 
grep "keywords:" ${CHART_DIRECTORY}/Chart.yaml &>/dev/null ; then echo "keywords:" >> ${CHART_DIRECTORY}/Chart.yaml echo " - networking" >> ${CHART_DIRECTORY}/Chart.yaml diff --git a/charts/spiderpool/spiderpool/Chart.yaml b/charts/spiderpool/spiderpool/Chart.yaml index 1a70fa8c3..76cd95ec6 100644 --- a/charts/spiderpool/spiderpool/Chart.yaml +++ b/charts/spiderpool/spiderpool/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 1.0.0-rc2 +appVersion: 1.0.0-rc3 description: underlay CNI solution for kubernetes home: https://spidernet-io.github.io/spiderpool icon: https://raw.githubusercontent.com/spidernet-io/spiderpool/main/docs/images/spider.svg @@ -16,8 +16,8 @@ name: spiderpool sources: - https://github.com/spidernet-io/spiderpool type: application -version: 1.0.0-rc2 +version: 1.0.0-rc3 dependencies: - name: spiderpool - version: "1.0.0-rc2" + version: "1.0.0-rc3" repository: "https://spidernet-io.github.io/spiderpool" diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml index 977a81038..d8943598d 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 1.0.0-rc2 +appVersion: 1.0.0-rc3 description: underlay CNI solution for kubernetes home: https://spidernet-io.github.io/spiderpool icon: https://raw.githubusercontent.com/spidernet-io/spiderpool/main/docs/images/spider.svg @@ -16,4 +16,4 @@ name: spiderpool sources: - https://github.com/spidernet-io/spiderpool type: application -version: 1.0.0-rc2 +version: 1.0.0-rc3 diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json index f786ea619..3f47706e7 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json +++ 
b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json @@ -24,14 +24,27 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 4, + "id": 10, "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 16, + "panels": [], + "title": "Summary", + "type": "row" + }, { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "fieldConfig": { "defaults": { @@ -56,10 +69,75 @@ "overrides": [] }, "gridPos": { - "h": 3, - "w": 24, + "h": 4, + "w": 12, "x": 0, - "y": 0 + "y": 1 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}) by (node_name))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Number of RDMA-Capable Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 1 }, "id": 2, "options": { @@ -81,7 +159,7 @@ { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "editorMode": "code", "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}) by (pod_name))", @@ -110,7 +188,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - 
"fillOpacity": 0, + "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, @@ -119,7 +197,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -140,22 +218,49 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, "unit": "bytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 3 + "y": 5 }, "id": 7, "options": { @@ -177,13 +282,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (rate(rdma_rx_vport_rdma_unicast_bytes_total{}[5m]))", + "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total[5m])\n)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Bandwidth | Read", + "title": "Node Bandwidth | Read", "type": "timeseries" }, { @@ -203,7 +308,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -212,7 +317,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -242,13 +347,44 @@ }, "unit": "bytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 3 + "y": 5 }, "id": 8, "options": { @@ -270,13 +406,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (rate(rdma_tx_vport_rdma_unicast_bytes_total{}[5m]))", + "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total[5m])\n)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Bandwidth | Write", + "title": "Node Bandwidth | Write", "type": "timeseries" }, { @@ -296,7 +432,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -331,13 +467,44 @@ }, "unit": "percent" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 13 }, "id": 4, "options": { @@ -359,13 +526,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", + "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_rx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Bandwidth rate | Write", + "title": "Rate of Bandwidth | Node | Read", "type": "timeseries" }, { @@ -385,7 +552,7 @@ 
"axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -420,13 +587,44 @@ }, "unit": "percent" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 13 }, "id": 5, "options": { @@ -448,16 +646,751 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", + "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Bandwidth rate | Write", + "title": "Rate of Bandwidth | Node | Write", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 18, + "panels": [], + "title": "Top traffic", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, 
+ "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\n sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total[5m])\n )\n)", + "legendFormat": "{{node_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Node Top 10 | Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { 
+ "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\n sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total[5m])\n )\n)", + "legendFormat": "{{node_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Node Top 10 | Bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_rx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Top 10 | Rate of Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Top 10 | Rate of Bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-jgt9t" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#ffc0cb", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[5m])\n )\n )\n)", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Top 10 | Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#ffc0cb", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[5m])\n )\n )\n)", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Top 10 | Bandwidth | Write", "type": "timeseries" } ], + "refresh": false, "schemaVersion": 37, "style": "dark", "tags": [], @@ -489,8 +1422,8 @@ }, "timepicker": {}, "timezone": "", - "title": "Spiderpool RDMA Dashboard | Cluster", + "title": "Spiderpool RDMA | Cluster", "uid": "2ZOHs2ZHk", - "version": 16, + "version": 24, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json index aeda6315a..b75fa5b60 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json +++ 
b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 5, + "id": 7, "links": [], "liveNow": false, "panels": [ @@ -38,7 +38,7 @@ }, "id": 11, "panels": [], - "title": "Summary", + "title": "Physical Network Card", "type": "row" }, { @@ -58,24 +58,27 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 17, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -93,7 +96,23 @@ }, "unit": "bytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ibp8s0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -121,13 +140,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]))", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{node_name=~\"$node\"}[1m])\n)", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" } ], - "title": "Node bandwidth | Read", + "title": "Bandwidth | Read", "type": "timeseries" }, { @@ -147,24 +166,27 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 17, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": 
"linear", - "lineWidth": 1, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -182,7 +204,23 @@ }, "unit": "bytes" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ibp8s0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -190,7 +228,7 @@ "x": 12, "y": 1 }, - "id": 21, + "id": 30, "options": { "legend": { "calcs": [], @@ -210,13 +248,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]))", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{node_name=~\"$node\"}[1m])\n)", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" } ], - "title": "Node bandwidth | Write", + "title": "Bandwidth | Write", "type": "timeseries" }, { @@ -244,7 +282,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -260,6 +298,8 @@ } }, "mappings": [], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -305,7 +345,7 @@ "refId": "A" } ], - "title": "Bandwidth rate | Read", + "title": "Rate of Bandwidth | Read", "type": "timeseries" }, { @@ -313,6 +353,7 @@ "type": "prometheus", "uid": "${datasource}" }, + "description": "", "fieldConfig": { "defaults": { "color": { @@ -333,7 +374,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -349,6 +390,8 @@ } }, 
"mappings": [], + "max": 100, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -394,7 +437,7 @@ "refId": "A" } ], - "title": "Bandwidth rate | Write", + "title": "Rate of Bandwidth | Write", "type": "timeseries" }, { @@ -435,7 +478,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -464,25 +507,31 @@ }, "overrides": [ { - "__systemRef": "hideSeriesFrom", "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "ibp13s0" - ], - "prefix": "All except:", - "readOnly": true + "id": "byName", + "options": "ibp8s0v0" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ibp8s0" }, "properties": [ { - "id": "custom.hideFrom", + "id": "color", "value": { - "legend": false, - "tooltip": false, - "viz": true + "fixedColor": "dark-blue", + "mode": "fixed" } } ] @@ -515,13 +564,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" } ], - "title": "Host RDMA Devices | Write", + "title": "Host Bandwidth | Read", "type": "timeseries" }, { @@ -549,7 +598,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -576,32 +625,7 @@ }, "unit": "binBps" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "ibp13s0" - ], - "prefix": "All except:", - "readOnly": true - } - }, 
- "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -629,13 +653,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval]) +\nrate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" } ], - "title": "Host RDMA Devices | Write", + "title": "Host Bandwidth | Write", "type": "timeseries" }, { @@ -676,7 +700,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -696,39 +720,13 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, "unit": "binBps" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "ibp13s0" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, @@ -762,7 +760,7 @@ "refId": "A" } ], - "title": "Pod RDMA Devices | Read", + "title": "Pod Bandwidth | Read", "type": "timeseries" }, { @@ -790,7 +788,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -810,52 +808,210 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, "unit": "binBps" }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - 
"names": [ - "ibp13s0" - ], - "prefix": "All except:", - "readOnly": true + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", + "legendFormat": "{{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Bandwidth Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "topk(\n 10,\n sum by (pod_name, pod_namespace) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])\n )\n)", + "legendFormat": "{{pod_namespace}}/{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Top 10 | Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - "properties": [ + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } + "color": "green" + }, + { + "color": "red", + "value": 80 } ] - } - ] + }, + "unit": "bytes" + }, + "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 27 + "y": 35 }, - "id": 29, + "id": 33, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { @@ -870,17 +1026,17 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", - "legendFormat": "{{ifname}}", + 
"expr": "topk(\n 10, \n sum by (pod_name, pod_namespace) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])\n )\n)", + "legendFormat": "{{pod_namespace}}/{{pod_name}}", "range": true, "refId": "A" } ], - "title": "Host RDMA Devices | Write", + "title": "Pod Top 10 | Bandwidth | Write", "type": "timeseries" } ], - "refresh": "5s", + "refresh": false, "schemaVersion": 37, "style": "dark", "tags": [], @@ -938,8 +1094,8 @@ }, "timepicker": {}, "timezone": "", - "title": "Spiderpool RDMA Dashboard | Node", + "title": "Spiderpool RDMA | Node", "uid": "A0T4f2ZNz", - "version": 2, + "version": 22, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json index d7ba00e22..faef3fde4 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 1, + "id": 6, "links": [], "liveNow": false, "panels": [ @@ -60,7 +60,7 @@ "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -95,7 +95,38 @@ }, "unit": "binBps" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "net3 - mlx5_19" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "net2 - mlx5_16" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -125,7 +156,7 @@ }, "editorMode": 
"code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) + rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) ", "format": "time_series", "instant": false, "interval": "", @@ -134,7 +165,7 @@ "refId": "A" } ], - "title": "Throughput | Read", + "title": "Bandwidth | Read", "type": "timeseries" }, { @@ -156,7 +187,7 @@ "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -191,7 +222,38 @@ }, "unit": "binBps" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "net3 - mlx5_19" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "net2 - mlx5_16" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -221,7 +283,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) + rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) ", "format": "time_series", "instant": false, "interval": "", @@ -230,7 +292,7 @@ "refId": "A" } ], - "title": "Throughput | Write", + "title": "Bandwidth | Write", "type": "timeseries" }, { @@ -252,7 +314,7 @@ "axisSoftMin": 0, 
"barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -287,7 +349,38 @@ }, "unit": "percent" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "net3 - mlx5_19" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "net2 - mlx5_16" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -317,7 +410,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])/(rdma_vport_speed_mbps_total{pod_name!=\"\"}*1000000/8)", + "expr": "(rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\"} * 1000000 / 8) * 100", "format": "time_series", "instant": false, "interval": "", @@ -326,7 +419,7 @@ "refId": "A" } ], - "title": "Bandwidth rate | Read", + "title": "Rate of Bandwidth | Read", "type": "timeseries" }, { @@ -348,7 +441,7 @@ "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 0, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -383,7 +476,38 @@ }, "unit": "percent" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "net2 - mlx5_16" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "net3 - mlx5_19" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, @@ -413,7 +537,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])/(rdma_vport_speed_mbps_total{pod_name!=\"\"}*1000000/8)", + "expr": "(rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\"} * 1000000 / 8) * 100", "format": "time_series", "instant": false, "interval": "", @@ -422,7 +546,7 @@ "refId": "A" } ], - "title": "Bandwidth rate | Write", + "title": "Rate of Bandwidth | Write", "type": "timeseries" }, { @@ -985,7 +1109,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "irate(rdma_tx_vport_rdma_multicast_packets_total{pod_namespace!=\"\"}[1m])", + "expr": "irate(rdma_tx_vport_rdma_multicast_packets_total{pod_namespace!=\"\", pod_name=~\"$pod\"}[1m])", "format": "time_series", "instant": false, "interval": "", @@ -1081,7 +1205,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[1m]) * 8 / 1000000", + "expr": "rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_name=~\"$pod\"}[1m]) * 8 / 1000000", "format": "time_series", "instant": false, "interval": "", @@ -1191,7 +1315,7 @@ { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "description": "", "fieldConfig": { @@ -1266,7 +1390,7 @@ { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, @@ -1285,7 +1409,7 @@ { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "description": "", "fieldConfig": { 
@@ -1360,11 +1484,11 @@ { "datasource": { "type": "prometheus", - "uid": "husBV6iNz" + "uid": "${datasource}" }, "editorMode": "code", "exemplar": false, - "expr": "irate(rdma_duplicate_request_total{pod_namespace!=\"\"}[1m])", + "expr": "irate(rdma_duplicate_request_total{pod_namespace!=\"\", pod_name=~\"$pod\"}[1m])", "format": "time_series", "instant": false, "interval": "", @@ -1611,7 +1735,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -1704,7 +1829,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -1797,7 +1923,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -1890,7 +2017,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -1924,7 +2052,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rdma_duplicate_request_total{pod_namespace!=\"\"}", + "expr": "rate(rdma_duplicate_request_total{pod_namespace!=\"\",pod_name=~\"$pod\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1983,7 +2111,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2076,7 +2205,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2169,7 +2299,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2262,7 +2393,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2355,7 +2487,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2448,7 +2581,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2541,7 +2675,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2634,7 +2769,8 @@ "mode": "absolute", 
"steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2727,7 +2863,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2820,7 +2957,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -2913,7 +3051,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3006,7 +3145,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3099,7 +3239,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3192,7 +3333,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3285,7 +3427,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3378,7 +3521,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3471,7 +3615,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3564,7 +3709,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3657,7 +3803,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3750,7 +3897,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] } @@ -3795,99 +3943,6 @@ ], "title": "rdma_roce_adp_retrans", "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "axisSoftMin": 0, - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - 
"tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 0, - "y": 89 - }, - "id": 15, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "rate(rdma_rp_cnp_handled_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", - "format": "time_series", - "instant": false, - "interval": "", - "legendFormat": "{{ifname}} - {{pod_namespace}}/{{pod_name}}", - "range": true, - "refId": "A" - } - ], - "title": "rdma_rp_cnp_handled", - "type": "timeseries" } ], "refresh": "5s", @@ -3944,25 +3999,24 @@ { "current": { "selected": false, - "text": "rdma-test-gpu-tool-8zrh6", - "value": "rdma-test-gpu-tool-8zrh6" + "text": "10-20-1-50", + "value": "10-20-1-50" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", "hide": 0, - "includeAll": false, - "label": "pod", + "includeAll": true, "multi": false, - "name": "pod", + "name": "node", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", 
"refId": "StandardVariableQuery" }, - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, @@ -3971,24 +4025,25 @@ { "current": { "selected": false, - "text": "10-20-1-50", - "value": "10-20-1-50" + "text": "pytorch-sample-master-0", + "value": "pytorch-sample-master-0" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", "hide": 0, "includeAll": false, + "label": "pod", "multi": false, - "name": "node", + "name": "pod", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", "refId": "StandardVariableQuery" }, - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, @@ -3997,13 +4052,13 @@ ] }, "time": { - "from": "now-6h", + "from": "now-3h", "to": "now" }, "timepicker": {}, "timezone": "", - "title": "Spiderpool RDMA Dashboard | Pod", + "title": "Spiderpool RDMA | Pod", "uid": "DenUibiNk", - "version": 158, + "version": 17, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json new file mode 100644 index 000000000..0d6b78cdc --- /dev/null +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json @@ -0,0 +1,1012 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + 
"editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 12, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "panels": [], + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (owner_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\"}[1m])\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth per $kind | Read ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (owner_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\"}[1m])\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth per $kind | Write", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 30, + "panels": [], + "title": "Pods", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (pod_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth per Pod | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth per Pod | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + 
"overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 18 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (pod_name) (\n rate(rdma_rx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Packets per Pod | unicast | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": 
"rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 18 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Packets per Pod | unicast | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + 
"fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 18 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Packets per Pod | multicast | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-n792j" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + 
"h": 8, + "w": 6, + "x": 18, + "y": 18 + }, + "id": 35, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Packets per Pod | multicast | Write", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 2, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "DaemonSet", + "value": "DaemonSet" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(rdma_vport_speed_mbps_total{}, owner_kind)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "kind", + "options": [], + "query": { + "query": "label_values(rdma_vport_speed_mbps_total{}, owner_kind)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "huailou", + "value": "huailou" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": 
"label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\"}, owner_namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\"}, owner_namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "rdma-test-gpu-tool", + "value": "rdma-test-gpu-tool" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\"}, owner_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "name", + "options": [], + "query": { + "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\"}, owner_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spiderpool RDMA | AI Workload", + "uid": "AAT6f2ZNz", + "version": 40, + "weekStart": "" +} \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml index 3b2469f65..af672a82b 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml @@ -201,11 +201,11 @@ spec: {{- if .Values.multus.multusCNI.defaultCniCRName }} - name: MULTUS_CLUSTER_NETWORK value: {{ .Release.Namespace }}/{{ .Values.multus.multusCNI.defaultCniCRName }} - {{ end }} + {{- end }} {{- with .Values.spiderpoolAgent.extraEnv }} - {{- toYaml . | nindent 8 }} + {{- toYaml . 
| nindent 12 }} {{- end }} - {{- if or .Values.spiderpoolAgent.tuneSysctlConfig .Values.spiderpoolAgent.securityContext }} + {{- if or .Values.spiderpoolAgent.tuneSysctlConfig .Values.spiderpoolAgent.securityContext .Values.spiderpoolAgent.prometheus.enabledRdmaMetric }} securityContext: privileged: true {{- with .Values.spiderpoolAgent.securityContext }} @@ -216,6 +216,7 @@ spec: {{- if .Values.spiderpoolAgent.prometheus.enabledRdmaMetric }} - name: host-ns mountPath: /var/run/netns + mountPropagation: Bidirectional {{- end }} - name: config-path mountPath: /tmp/spiderpool/config-map diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml index f07c839bd..f01125de0 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml @@ -21,6 +21,5 @@ metadata: {{- end }} {{- end }} spec: - json: |- - {{ .Files.Get "files/grafana-rdma-cluster.json" | toJson | indent 4 }} + json: {{ .Files.Get "files/grafana-rdma-cluster.json" | toJson }} {{- end }} \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml index 85ae18ec3..5870b9738 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml @@ -21,6 +21,5 @@ metadata: {{- end }} {{- end }} spec: - json: |- - {{ .Files.Get "files/grafana-rdma-node.json" | toJson | indent 4 }} + json: {{ .Files.Get "files/grafana-rdma-node.json" | toJson }} {{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml 
b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml index 5efceaa69..7ef3afccb 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml @@ -21,6 +21,5 @@ metadata: {{- end }} {{- end }} spec: - json: |- - {{ .Files.Get "files/grafana-rdma-pod.json" | toJson | indent 4 }} + json: {{ .Files.Get "files/grafana-rdma-pod.json" | toJson }} {{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaWorkload.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaWorkload.yaml new file mode 100644 index 000000000..ca226760a --- /dev/null +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboardRdmaWorkload.yaml @@ -0,0 +1,25 @@ +{{- if .Values.grafanaDashboard.install -}} +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: {{ default "spiderpool" .Values.global.nameOverride }}-rdma-workload + namespace: {{ default .Release.Namespace .Values.grafanaDashboard.namespace }} + labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.labels "context" $ ) | nindent 4 }} + {{- end }} + {{- if or .Values.global.commonAnnotations .Values.grafanaDashboard.annotations }} + annotations: + {{- if .Values.global.commonAnnotations }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.annotations }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.annotations "context" $ ) | nindent 4 }} + {{- end }} + {{- end }} +spec: + json: {{ .Files.Get 
"files/grafana-rdma-workload.json" | toJson }} +{{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_binding.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_binding.yaml index f18317670..7ca687f78 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_binding.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_binding.yaml @@ -19,4 +19,4 @@ subjects: name: {{ .Values.spiderpoolInit.name | trunc 63 | trimSuffix "-" }} namespace: {{ .Release.Namespace }} {{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_rdma-exporter.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_rdma-exporter.yaml new file mode 100644 index 000000000..7fc4a90cc --- /dev/null +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/role_rdma-exporter.yaml @@ -0,0 +1,37 @@ +# for rdma metrics exporter, read rdma pod owner's info +# for example, the rdma pod owner is a job, the job's owner is a cronjob +{{- if .Values.spiderpoolAgent.prometheus.enabledRdmaMetric }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: spiderpool-rdma-metrics-exporter +rules: + - apiGroups: ["kubeflow.org"] + resources: ["mpijobs", "mxjobs", "notebooks", "paddlejobs", "pytorchjobs", "tfjobs", "xgboostjobs"] + verbs: ["get"] + - apiGroups: ["serving.baize.io"] + resources: ["inferences"] + verbs: ["get"] + - apiGroups: ["apps"] + resources: ["daemonsets", "deployments", "replicasets", "statefulsets"] + verbs: ["get"] + - apiGroups: ["batch"] + resources: ["cronjobs", "jobs"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods", "pods/status"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: spiderpool-rdma-metrics-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 
spiderpool-rdma-metrics-exporter +subjects: + - kind: ServiceAccount + name: {{ .Values.spiderpoolAgent.name | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} +{{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml index b56ce2ec3..1efa34623 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml @@ -15,9 +15,6 @@ metadata: {{- end }} {{- end }} spec: - namespaceSelector: - matchNames: - - {{ .Release.Namespace }} selector: matchLabels: {{- if .Values.global.commonLabels }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml index 65160f989..ab33cc0f8 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml @@ -369,7 +369,7 @@ spiderpoolAgent: digest: "" ## @param spiderpoolAgent.image.tag the image tag of spiderpoolAgent, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolAgent.image.imagePullSecrets the image imagePullSecrets of spiderpoolAgent imagePullSecrets: [] @@ -559,7 +559,7 @@ spiderpoolController: digest: "" ## @param spiderpoolController.image.tag the image tag of spiderpoolController, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolController.image.imagePullSecrets the image imagePullSecrets of spiderpoolController imagePullSecrets: [] @@ -805,7 +805,7 @@ spiderpoolInit: digest: "" ## @param spiderpoolInit.image.tag the image tag of spiderpoolInit, overrides the image tag whose default is the chart appVersion. 
- tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolInit.image.imagePullSecrets the image imagePullSecrets of spiderpoolInit imagePullSecrets: [] diff --git a/charts/spiderpool/spiderpool/values.yaml b/charts/spiderpool/spiderpool/values.yaml index b57394f56..330011bb3 100644 --- a/charts/spiderpool/spiderpool/values.yaml +++ b/charts/spiderpool/spiderpool/values.yaml @@ -73,7 +73,8 @@ spiderpool: ## @param grafanaDashboard.annotations the additional annotations of spiderpool grafanaDashboard annotations: {} ## @param grafanaDashboard.labels the additional label of spiderpool grafanaDashboard - labels: {} + labels: + operator.insight.io/managed-by: insight ## @section coordinator parameters ## coordinator: @@ -280,7 +281,7 @@ spiderpool: ## @param spiderpoolAgent.image.digest the image digest of spiderpoolAgent, which takes preference over tag digest: "" ## @param spiderpoolAgent.image.tag the image tag of spiderpoolAgent, overrides the image tag whose default is the chart appVersion. 
- tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolAgent.image.imagePullSecrets the image imagePullSecrets of spiderpoolAgent imagePullSecrets: [] # - name: "image-pull-secret" @@ -374,7 +375,8 @@ spiderpool: ## @param spiderpoolAgent.prometheus.serviceMonitor.annotations the additional annotations of spiderpoolAgent serviceMonitor annotations: {} ## @param spiderpoolAgent.prometheus.serviceMonitor.labels the additional label of spiderpoolAgent serviceMonitor - labels: {} + labels: + operator.insight.io/managed-by: insight ## @param spiderpoolAgent.prometheus.serviceMonitor.interval represents the interval of spiderpoolAgent serviceMonitor's scraping action interval: "10s" prometheusRule: @@ -422,7 +424,7 @@ spiderpool: ## @param spiderpoolController.image.digest the image digest of spiderpoolController, which takes preference over tag digest: "" ## @param spiderpoolController.image.tag the image tag of spiderpoolController, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolController.image.imagePullSecrets the image imagePullSecrets of spiderpoolController imagePullSecrets: [] # - name: "image-pull-secret" @@ -531,7 +533,6 @@ spiderpool: - kpanda-system - kubean-system - local-path-storage - - metax - mspider-system - nvidia-gpu-operator - skoala-system @@ -637,7 +638,7 @@ spiderpool: ## @param spiderpoolInit.image.digest the image digest of spiderpoolInit, which takes preference over tag digest: "" ## @param spiderpoolInit.image.tag the image tag of spiderpoolInit, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc2 + tag: v1.0.0-rc3 ## @param spiderpoolInit.image.imagePullSecrets the image imagePullSecrets of spiderpoolInit imagePullSecrets: [] # - name: "image-pull-secret"