diff --git a/infrastructure/kube/keep-prd/monitoring/README.adoc b/infrastructure/kube/keep-prd/monitoring/README.adoc new file mode 100644 index 0000000000..bc9f79b764 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/README.adoc @@ -0,0 +1,37 @@ +:icons: font + +ifdef::env-github[] +:tip-caption: :bulb: +:note-caption: :information_source: +:important-caption: :heavy_exclamation_mark: +:caution-caption: :fire: +:warning-caption: :warning: +endif::[] + +# Monitoring + +The monitoring stack has the following components: + +1. Prometheus +2. Trickster +3. Grafana + +The production monitoring is based on the configuration described in the link:../../keep-test/monitoring/README.adoc[keep-test monitoring documentation]. + +Resources are exposed publicly under the following URLs: + +[cols="^1s,2m"] +|=== +^h|Service +^h|Address + +|Public Dashboard +|link:https://public.monitoring.threshold.network[] + +|Grafana +|link:https://monitoring.threshold.network/grafana[] + +|Prometheus +|link:https://monitoring.threshold.network/prometheus[] + +|=== diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/config/dashboards.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/config/dashboards.yaml new file mode 100644 index 0000000000..54bf65f56f --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/config/dashboards.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: dashboards-provider + type: file + disableDeletion: true + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: "/var/lib/grafana/dashboards" + foldersFromFilesStructure: true diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/config/datasources.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/config/datasources.yaml new file mode 100644 index 0000000000..ef00731e62 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/config/datasources.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +datasources: + - name: Trickster + type: prometheus + 
access: proxy + editable: true + orgId: 1 + url: http://trickster:8480/prometheus + version: 1 + isDefault: true diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/config/grafana.ini b/infrastructure/kube/keep-prd/monitoring/grafana/config/grafana.ini new file mode 100644 index 0000000000..66e6511968 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/config/grafana.ini @@ -0,0 +1,9 @@ +[auth.google] +enabled = true +scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email +auth_url = https://accounts.google.com/o/oauth2/auth +token_url = https://accounts.google.com/o/oauth2/token +allow_sign_up = true + +[feature_toggles] +publicDashboards = true diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes-public.json b/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes-public.json new file mode 100644 index 0000000000..9f7958c7a6 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes-public.json @@ -0,0 +1,695 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 30 + }, + { + "color": "green", + "value": 80 + } + ] + } + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 11, + "x": 0, + "y": 0 + }, + "id": 8, + "interval": "1m", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "sum by(job) (sum by(chain_address) (up{job=\"keep-discovered-nodes\"}))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Nodes Up", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-red", + "value": null + }, + { + "color": "super-light-yellow", + "value": 30 + }, + { + "color": "super-light-green", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 16, + "w": 13, + "x": 11, + "y": 0 + }, + "id": 3, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { 
+ "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "min by(chain_address) (connected_bootstrap_count{job=\"keep-discovered-nodes\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "C" + } + ], + "title": "Connected Bootstraps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "light-yellow", + "value": 300 + }, + { + "color": "green", + "value": 900 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 28, + "w": 11, + "x": 0, + "y": 8 + }, + "id": 4, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + 
"exemplar": false, + "expr": "min by(chain_address) (tbtc_pre_params_count{job=\"keep-discovered-nodes\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "C" + } + ], + "title": "TBTC PreParams Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "super-light-yellow", + "value": 50 + }, + { + "color": "super-light-green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 20, + "w": 13, + "x": 11, + "y": 16 + }, + "id": 2, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "min by(chain_address) (connected_peers_count{job=\"keep-discovered-nodes\"})", + "hide": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "Discovered 
Keep Nodes" + } + ], + "title": "Connected Peers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 25, + "w": 11, + "x": 0, + "y": 36 + }, + "id": 10, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "chain_address" + } + ] + }, + "pluginVersion": "9.1.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "up{job=\"keep-discovered-nodes\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "Nodes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "client_info{job=\"keep-discovered-nodes\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "Client Info" + } + ], + "title": "Client Versions", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "chain_address" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": false, + "Time 2": true, + "Value": true, + "Value #A": true, + "Value #B": true, + "Value #Client Info": true, + "Value #Nodes": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "app": true, + "controller_revision_hash": true, + "id": true, + 
"instance": false, + "instance 1": false, + "instance 2": true, + "job": true, + "job 1": true, + "job 2": true, + "kubernetes_namespace": true, + "kubernetes_pod_name": true, + "kubernetes_pod_name_monitoring": true, + "network": true, + "network_id": true, + "network_id 1": true, + "network_id 2": true, + "statefulset_kubernetes_io_pod_name": true, + "type": true + }, + "indexByName": { + "Time 1": 3, + "Time 2": 8, + "Value #Client Info": 13, + "Value #Nodes": 7, + "__name__ 1": 4, + "__name__ 2": 9, + "chain_address": 0, + "instance 1": 1, + "instance 2": 10, + "job 1": 5, + "job 2": 11, + "network_id 1": 6, + "network_id 2": 12, + "version": 2 + }, + "renameByName": { + "chain_address": "Chain Address", + "instance 1": "Instance", + "version": "Client Version" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 25, + "w": 13, + "x": 11, + "y": 36 + }, + "id": 6, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "min by(chain_address) (up{job=\"keep-discovered-nodes\"})", + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "A" + } + ], + "title": "Uptime (experimental)", + "type": "state-timeline" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [ + 
"tbtc", + "keep", + "public" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Keep Nodes (Public)", + "uid": "hhDyYDI4z", + "version": 3, + "weekStart": "" +} diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes.json b/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes.json new file mode 100644 index 0000000000..a2003adc91 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/dashboards/keep/keep-nodes.json @@ -0,0 +1,774 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 30 + }, + { + "color": "green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 11, + "x": 0, + "y": 0 + }, + "id": 8, + "interval": "1m", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.8", + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(job) (sum by(chain_address) (up{job=\"keep-discovered-nodes\"}))", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Nodes Up", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "super-light-red", + "value": null + }, + { + "color": "super-light-yellow", + "value": 30 + }, + { + "color": "super-light-green", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 16, + "w": 13, + "x": 11, + "y": 0 + }, + "id": 3, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "min by(chain_address) (connected_bootstrap_count{job=\"keep-discovered-nodes\"})", + "format": "time_series", + 
"hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "C" + } + ], + "title": "Connected Bootstraps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 100 + }, + { + "color": "light-yellow", + "value": 300 + }, + { + "color": "green", + "value": 900 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 28, + "w": 11, + "x": 0, + "y": 8 + }, + "id": 4, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "min by(chain_address) (tbtc_pre_params_count{job=\"keep-discovered-nodes\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "C" + } + ], + "title": "TBTC PreParams Count", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "super-light-yellow", + "value": 50 + }, + { + "color": "super-light-green", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 20, + "w": 13, + "x": 11, + "y": 16 + }, + "id": 2, + "interval": "1m", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "min by(chain_address) (connected_peers_count{job=\"keep-discovered-nodes\"})", + "hide": false, + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "Discovered Keep Nodes" + } + ], + "title": "Connected Peers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": 
"auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 25, + "w": 11, + "x": 0, + "y": 36 + }, + "id": 10, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "chain_address" + } + ] + }, + "pluginVersion": "9.1.8", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "up{job=\"keep-discovered-nodes\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "Nodes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "client_info{job=\"keep-discovered-nodes\"}", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "Client Info" + } + ], + "title": "Client Versions", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "chain_address" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": false, + "Time 2": true, + "Value": true, + "Value #A": true, + "Value #B": true, + "Value #Client Info": true, + "Value #Nodes": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "app": true, + "controller_revision_hash": true, + "id": true, + "instance": false, + "instance 1": false, + "instance 2": true, + "job": true, + "job 1": true, + "job 2": true, + "kubernetes_namespace": true, + "kubernetes_pod_name": true, + "kubernetes_pod_name_monitoring": true, + "network": true, + "network_id": true, + "network_id 1": true, + "network_id 2": true, + 
"statefulset_kubernetes_io_pod_name": true, + "type": true + }, + "indexByName": { + "Time 1": 3, + "Time 2": 8, + "Value #Client Info": 13, + "Value #Nodes": 7, + "__name__ 1": 4, + "__name__ 2": 9, + "chain_address": 0, + "instance 1": 1, + "instance 2": 10, + "job 1": 5, + "job 2": 11, + "network_id 1": 6, + "network_id 2": 12, + "version": 2 + }, + "renameByName": { + "chain_address": "Chain Address", + "instance 1": "Instance", + "version": "Client Version" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 25, + "w": 13, + "x": 11, + "y": 36 + }, + "id": 6, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "min by(chain_address) (up{job=\"keep-discovered-nodes\"})", + "interval": "", + "legendFormat": "{{chain_address}}", + "range": true, + "refId": "A" + } + ], + "title": "Uptime (experimental)", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "description": "A number of running instances for each operator address.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 60, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "super-light-orange", + "value": null + }, + { + "color": "super-light-green", + "value": 1 + }, + { + "color": "super-light-red", + "value": 2 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 20, + "w": 11, + "x": 0, + "y": 61 + }, + "id": 12, + "options": { + "alignValue": "center", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P09205B1DD12FB1C6" + }, + "editorMode": "builder", + "expr": "count by(chain_address) (up{job=\"keep-discovered-nodes\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Instances", + "type": "state-timeline" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [ + "tbtc", + "keep" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Keep Nodes", + "uid": "tMgEvbnVk", + "version": 4, + "weekStart": "" +} diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/grafana-deployment.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-deployment.yaml new file mode 100644 index 0000000000..a9b9af9dd1 --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-deployment.yaml @@ -0,0 +1,99 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + runAsNonRoot: true + containers: + - name: grafana + image: grafana/grafana:9.1.8 + env: + - name: GF_SERVER_DOMAIN + value: 
monitoring.threshold.network + - name: GF_SERVER_ROOT_URL + value: "https://%(domain)s/grafana/" + - name: GF_SERVER_SERVE_FROM_SUB_PATH + value: "true" + - name: GF_FEATURE_TOGGLES_PUBLICDASHBOARDS + value: "true" + - name: GF_AUTH_GOOGLE_CLIENT_ID + valueFrom: + secretKeyRef: + name: grafana-auth-google + key: client_id + - name: GF_AUTH_GOOGLE_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: grafana-auth-google + key: client_secret + ports: + - name: grafana + containerPort: 3000 + readinessProbe: + httpGet: + path: /api/health + port: grafana + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 2 + livenessProbe: + initialDelaySeconds: 30 + tcpSocket: + port: grafana + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 250m + memory: 512Mi + volumeMounts: + - name: grafana-grafana-ini + mountPath: /etc/grafana/grafana.ini + subPath: grafana.ini + - name: grafana-config-datasources + mountPath: /etc/grafana/provisioning/datasources + - name: grafana-config-dashboards + mountPath: /etc/grafana/provisioning/dashboards + - name: grafana-storage + mountPath: /var/lib/grafana + - name: grafana-dashboards-keep + mountPath: /var/lib/grafana/dashboards/keep + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana-pvc + - name: grafana-dashboards-keep + configMap: + name: grafana-dashboards-keep + - name: grafana-config-datasources + configMap: + name: grafana-config + items: + - key: datasources.yaml + path: datasources.yaml + - name: grafana-config-dashboards + configMap: + name: grafana-config + items: + - key: dashboards.yaml + path: dashboards.yaml + - name: grafana-grafana-ini + configMap: + name: grafana-config + items: + - key: grafana.ini + path: grafana.ini diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/grafana-pvc.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-pvc.yaml new file mode 100644 index 0000000000..46b9de4205 --- 
/dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-pvc.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-pvc + namespace: monitoring + labels: + app: grafana +spec: + storageClassName: monitoring-storage + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/grafana-service.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-service.yaml new file mode 100644 index 0000000000..2db62dbeda --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/grafana-service.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + selector: + app: grafana + type: NodePort + ports: + - port: 3000 + targetPort: grafana diff --git a/infrastructure/kube/keep-prd/monitoring/grafana/kustomization.yaml b/infrastructure/kube/keep-prd/monitoring/grafana/kustomization.yaml new file mode 100644 index 0000000000..e1ca15444f --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/grafana/kustomization.yaml @@ -0,0 +1,26 @@ +resources: + - grafana-deployment.yaml + - grafana-pvc.yaml + - grafana-service.yaml + +namespace: monitoring + +commonLabels: + app: grafana + type: monitoring + +configMapGenerator: + - name: grafana-config + files: + - config/grafana.ini + - config/dashboards.yaml + - config/datasources.yaml + - name: grafana-dashboards-keep + files: + - dashboards/keep/keep-nodes-public.json + - dashboards/keep/keep-nodes.json + +generatorOptions: + disableNameSuffixHash: true + annotations: + note: generated diff --git a/infrastructure/kube/keep-prd/monitoring/monitoring-ingress.yaml b/infrastructure/kube/keep-prd/monitoring/monitoring-ingress.yaml new file mode 100644 index 0000000000..bfa25808cb --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/monitoring-ingress.yaml @@ -0,0 +1,50 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: monitoring + 
namespace: monitoring + annotations: + kubernetes.io/ingress.class: "gce" + # The static IP has to be created with the following command: + # `gcloud compute addresses create keep-monitoring-ingress --global` + kubernetes.io/ingress.global-static-ip-name: "keep-monitoring-ingress" + networking.gke.io/managed-certificates: monitoring-cert +spec: + defaultBackend: + service: + name: grafana + port: + number: 3000 + rules: + - http: + paths: + - path: "/grafana" + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 + - path: "/prometheus" + pathType: Prefix + backend: + service: + name: trickster + port: + number: 8480 + - path: "/trickster" + pathType: Prefix + backend: + service: + name: trickster + port: + number: 8480 +--- +apiVersion: networking.gke.io/v1 +kind: ManagedCertificate +metadata: + name: monitoring-cert + namespace: monitoring +spec: + domains: + - monitoring.threshold.network diff --git a/infrastructure/kube/keep-prd/monitoring/prometheus/config/config.yaml b/infrastructure/kube/keep-prd/monitoring/prometheus/config/config.yaml new file mode 100644 index 0000000000..9d6a7fdc2a --- /dev/null +++ b/infrastructure/kube/keep-prd/monitoring/prometheus/config/config.yaml @@ -0,0 +1,28 @@ +global: + scrape_interval: 1m + scrape_timeout: 10s + evaluation_interval: 1m +scrape_configs: + - job_name: keep-discovered-nodes + honor_timestamps: true + metrics_path: /metrics + scheme: http + follow_redirects: true + enable_http2: true + relabel_configs: + - source_labels: [__meta_chain_address] + separator: ; + regex: (.*) + target_label: chain_address + replacement: $1 + action: replace + - source_labels: [__meta_network_id] + separator: ; + regex: (.*) + target_label: network_id + replacement: $1 + action: replace + file_sd_configs: + - files: + - /etc/prometheus/sd/keep-sd.json + refresh_interval: 5m diff --git a/infrastructure/kube/keep-prd/monitoring/prometheus/kustomization.yaml 
b/infrastructure/kube/keep-prd/monitoring/prometheus/kustomization.yaml
new file mode 100644
index 0000000000..4cd5954af5
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/prometheus/kustomization.yaml
@@ -0,0 +1,22 @@
+# Kustomize configuration for Prometheus: sets the namespace and common labels
+# and generates the `prometheus-config` ConfigMap from config/config.yaml.
+resources:
+  - prometheus-deployment.yaml
+  - prometheus-pvc.yaml
+  - prometheus-service.yaml
+
+namespace: monitoring
+
+commonLabels:
+  app: prometheus
+  type: monitoring
+
+configMapGenerator:
+  - name: prometheus-config
+    files:
+      - config/config.yaml
+
+generatorOptions:
+  disableNameSuffixHash: true  # generated ConfigMap keeps its plain name
+  annotations:
+    note: generated
diff --git a/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-deployment.yaml b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-deployment.yaml
new file mode 100644
index 0000000000..0c96ff11ef
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-deployment.yaml
@@ -0,0 +1,93 @@
+---
+# Prometheus Deployment: a single replica running two containers that share
+# an emptyDir volume mounted at /etc/prometheus/sd/ - `prometheus` itself and
+# the `keep-sd` sidecar, whose output file is configured to land in that volume.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # presumably because the ReadWriteOnce data volume cannot be shared during a rollout
+  selector:
+    matchLabels:
+      app: prometheus
+      type: monitoring
+  template:  # pod labels are not set here; presumably added by kustomize commonLabels - confirm
+    spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
+        runAsNonRoot: true
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v2.39.1
+          args:
+            - --config.file=/etc/prometheus/config.yaml
+            - --storage.tsdb.path=/etc/prometheus/data
+            - --storage.tsdb.retention.time=1y
+            - --web.external-url=/prometheus/  # the probe paths below carry the same /prometheus prefix
+          ports:
+            - name: prometheus
+              containerPort: 9090
+          readinessProbe:
+            httpGet:
+              path: "/prometheus/-/ready"
+              port: prometheus
+            initialDelaySeconds: 10
+            periodSeconds: 30
+            timeoutSeconds: 2
+          livenessProbe:
+            httpGet:
+              path: "/prometheus/-/healthy"
+              port: prometheus
+            initialDelaySeconds: 10
+            periodSeconds: 30
+            timeoutSeconds: 2
+          resources:
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+            requests:
+              cpu: 500m
+              memory: 512Mi
+          volumeMounts:
+            - name: prometheus-config-volume
+              mountPath: /etc/prometheus/
+            - name: prometheus-storage-volume
+              mountPath: /etc/prometheus/data/
+            - name: prometheus-sd-volume
+              mountPath: /etc/prometheus/sd/
+          securityContext:
+            readOnlyRootFilesystem: true
+        - name: keep-sd
+          image: keepnetwork/keep-prometheus-sd  # NOTE(review): no tag, resolves to :latest - consider pinning a version
+          args:
+            - --output.file=/etc/prometheus/sd/keep-sd.json
+            - --source.address=bst-a01.tbtc.boar.network:9601
+            - --source.address=bst-b01.tbtc.boar.network:9601
+            - --refresh.interval=5m
+            - --log.json
+          resources:
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+            requests:
+              cpu: 250m
+              memory: 256Mi
+          volumeMounts:
+            - name: prometheus-sd-volume
+              mountPath: /etc/prometheus/sd/
+          securityContext:
+            readOnlyRootFilesystem: true
+      volumes:
+        - name: prometheus-config-volume
+          configMap:
+            name: prometheus-config
+        - name: prometheus-storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-pvc
+        - name: prometheus-sd-volume
+          emptyDir: {}
diff --git a/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-pvc.yaml b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-pvc.yaml
new file mode 100644
index 0000000000..6ca54ca443
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-pvc.yaml
@@ -0,0 +1,13 @@
+---
+# Persistent volume claim for Prometheus: 100Gi, ReadWriteOnce, `monitoring-storage` class.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: prometheus-pvc
+spec:
+  storageClassName: monitoring-storage
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 100Gi
diff --git a/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-service.yaml b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-service.yaml
new file mode 100644
index 0000000000..ef83e37517
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/prometheus/prometheus-service.yaml
@@ -0,0 +1,12 @@
+---
+# Service for Prometheus: port 9090, forwarded to the container port named `prometheus`.
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+spec:
+  selector:
+    app: prometheus
+  ports:
+    - port: 9090
+      targetPort: prometheus
diff --git a/infrastructure/kube/keep-prd/monitoring/storage-class.yaml b/infrastructure/kube/keep-prd/monitoring/storage-class.yaml
new file mode
100644
index 0000000000..bf375bd8c0
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/storage-class.yaml
@@ -0,0 +1,15 @@
+# Storage class named `monitoring-storage`: SSD persistent disks provisioned
+# by the GCE PD provisioner, with the Retain reclaim policy and volume expansion allowed.
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: monitoring-storage
+provisioner: kubernetes.io/gce-pd
+parameters:
+  type: pd-ssd
+  replication-type: none
+reclaimPolicy: Retain
+allowVolumeExpansion: true
+mountOptions:
+  - debug
+volumeBindingMode: Immediate  # volume is provisioned as soon as the claim is created
diff --git a/infrastructure/kube/keep-prd/monitoring/trickster/config/trickster.yaml b/infrastructure/kube/keep-prd/monitoring/trickster/config/trickster.yaml
new file mode 100644
index 0000000000..0c4b5797c5
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/trickster/config/trickster.yaml
@@ -0,0 +1,26 @@
+# Trickster Configuration File.
+#
+# A full configuration file example can be found here:
+# https://github.com/trickstercache/trickster/blob/main/examples/conf/example.full.yaml
+
+frontend:
+  listen_port: 8480
+
+backends:
+  default:
+    provider: prometheus
+    origin_url: http://prometheus:9090
+    is_default: true
+    # Health check against the Prometheus readiness endpoint, every 5000 ms.
+    healthcheck:
+      path: /prometheus/-/ready
+      upstream_path: /prometheus/-/ready
+      interval_ms: 5000
+      expected_body: "Prometheus Server is Ready.\n"
+
+metrics:
+  listen_port: 8481
+  listen_address: ""
+
+logging:
+  log_level: info
diff --git a/infrastructure/kube/keep-prd/monitoring/trickster/kustomization.yaml b/infrastructure/kube/keep-prd/monitoring/trickster/kustomization.yaml
new file mode 100644
index 0000000000..0ca82fb0a6
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/trickster/kustomization.yaml
@@ -0,0 +1,21 @@
+# Kustomize configuration for Trickster: sets the namespace and common labels
+# and generates the `trickster-config` ConfigMap from config/trickster.yaml.
+resources:
+  - trickster-deployment.yaml
+  - trickster-service.yaml
+
+namespace: monitoring
+
+commonLabels:
+  app: trickster
+  type: monitoring
+
+configMapGenerator:
+  - name: trickster-config
+    files:
+      - config/trickster.yaml
+
+generatorOptions:
+  disableNameSuffixHash: true  # generated ConfigMap keeps its plain name
+  annotations:
+    note: generated
diff --git a/infrastructure/kube/keep-prd/monitoring/trickster/trickster-deployment.yaml
b/infrastructure/kube/keep-prd/monitoring/trickster/trickster-deployment.yaml
new file mode 100644
index 0000000000..f63c615dad
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/trickster/trickster-deployment.yaml
@@ -0,0 +1,60 @@
+---
+# Trickster Deployment: a single replica exposing container ports 8480
+# (`trickster`) and 8481 (`metrics`), with the `trickster-config` ConfigMap
+# mounted at /etc/trickster.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: trickster
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: trickster
+      type: monitoring
+  template:  # pod labels are not set here; presumably added by kustomize commonLabels - confirm
+    spec:
+      securityContext:
+        runAsUser: 1000
+        runAsGroup: 1000
+        fsGroup: 1000
+        runAsNonRoot: true
+      containers:
+        - name: trickster
+          image: trickstercache/trickster:2
+          ports:
+            - name: trickster
+              containerPort: 8480
+            - name: metrics
+              containerPort: 8481
+          readinessProbe:
+            httpGet:
+              path: "/trickster/health/default"
+              port: metrics
+          livenessProbe:
+            httpGet:
+              path: "/trickster/ping"
+              port: trickster
+          resources:
+            limits:
+              cpu: 1000m
+              memory: 1Gi
+            requests:
+              cpu: 500m
+              memory: 512Mi
+          volumeMounts:
+            - name: trickster-config
+              mountPath: /etc/trickster
+          env:
+            - name: NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          securityContext:
+            readOnlyRootFilesystem: true
+      volumes:
+        - name: trickster-config
+          configMap:
+            name: trickster-config
+            items:
+              - key: trickster.yaml
+                path: trickster.yaml
diff --git a/infrastructure/kube/keep-prd/monitoring/trickster/trickster-service.yaml b/infrastructure/kube/keep-prd/monitoring/trickster/trickster-service.yaml
new file mode 100644
index 0000000000..cdcb0f9030
--- /dev/null
+++ b/infrastructure/kube/keep-prd/monitoring/trickster/trickster-service.yaml
@@ -0,0 +1,17 @@
+---
+# Service for Trickster: exposes the proxy port (8480) and the metrics port
+# (8481), each mapped to the container port of the same name.
+apiVersion: v1
+kind: Service
+metadata:
+  name: trickster
+spec:
+  selector:
+    app: trickster
+  ports:
+    - name: trickster
+      port: 8480
+      targetPort: trickster
+    - name: metrics
+      port: 8481
+      targetPort: metrics