diff --git a/charts/chainlink-cluster/README.md b/charts/chainlink-cluster/README.md index f7d4c45fa5f..46c337dc724 100644 --- a/charts/chainlink-cluster/README.md +++ b/charts/chainlink-cluster/README.md @@ -34,7 +34,7 @@ If you don't need a build use devspace deploy --skip-build ``` -Connect to your environment +Connect to your environment by replacing the container labelled `node-1` with your local repository files ``` devspace dev -p node make chainlink @@ -117,4 +117,23 @@ helm test cl-cluster ## Uninstall ``` helm uninstall cl-cluster +``` + +# Grafana dashboard +We are using the [Grabana](https://github.com/K-Phoen/grabana) library to create dashboards programmatically +``` +export GRAFANA_URL=... +export GRAFANA_TOKEN=... +export LOKI_DATA_SOURCE_NAME=Loki +export PROMETHEUS_DATA_SOURCE_NAME=Thanos +export DASHBOARD_FOLDER=CLClusterEphemeralDevspace +export DASHBOARD_NAME=ChainlinkCluster + +cd dashboard/cmd && go run dashboard_deploy.go +``` +Open Grafana folder `CLClusterEphemeralDevspace` and find dashboard `ChainlinkCluster` + +If you'd like to add more metrics or verify that all of them are added, you can get the full list using IDE search or `ripgrep`: +``` +rg -U ".*promauto.*\n.*Name: \"(.*)\"" ../.. 
> metrics.txt ``` \ No newline at end of file diff --git a/charts/chainlink-cluster/dashboard/cmd/dashboard_deploy.go b/charts/chainlink-cluster/dashboard/cmd/dashboard_deploy.go new file mode 100644 index 00000000000..c752794f53f --- /dev/null +++ b/charts/chainlink-cluster/dashboard/cmd/dashboard_deploy.go @@ -0,0 +1,42 @@ +package main + +import ( + "os" + + "github.com/smartcontractkit/chainlink/v2/dashboard/dashboard" +) + +func main() { + name := os.Getenv("DASHBOARD_NAME") + if name == "" { + panic("DASHBOARD_NAME must be provided") + } + ldsn := os.Getenv("LOKI_DATA_SOURCE_NAME") + if ldsn == "" { + panic("DATA_SOURCE_NAME must be provided") + } + pdsn := os.Getenv("PROMETHEUS_DATA_SOURCE_NAME") + if ldsn == "" { + panic("DATA_SOURCE_NAME must be provided") + } + dbf := os.Getenv("DASHBOARD_FOLDER") + if dbf == "" { + panic("DASHBOARD_FOLDER must be provided") + } + grafanaURL := os.Getenv("GRAFANA_URL") + if grafanaURL == "" { + panic("GRAFANA_URL must be provided") + } + grafanaToken := os.Getenv("GRAFANA_TOKEN") + if grafanaToken == "" { + panic("GRAFANA_TOKEN must be provided") + } + // if you'll use this dashboard base in other projects, you can add your own opts here to extend it + db, err := dashboard.NewCLClusterDashboard(name, ldsn, pdsn, dbf, grafanaURL, grafanaToken, nil) + if err != nil { + panic(err) + } + if err := db.Deploy(); err != nil { + panic(err) + } +} diff --git a/charts/chainlink-cluster/dashboard/dashboard.go b/charts/chainlink-cluster/dashboard/dashboard.go new file mode 100644 index 00000000000..7918b996dd0 --- /dev/null +++ b/charts/chainlink-cluster/dashboard/dashboard.go @@ -0,0 +1,392 @@ +package dashboard + +import ( + "context" + "fmt" + "net/http" + + "github.com/K-Phoen/grabana" + "github.com/K-Phoen/grabana/dashboard" + "github.com/K-Phoen/grabana/logs" + "github.com/K-Phoen/grabana/row" + "github.com/K-Phoen/grabana/stat" + "github.com/K-Phoen/grabana/target/prometheus" + "github.com/K-Phoen/grabana/timeseries" + 
"github.com/K-Phoen/grabana/timeseries/axis" + "github.com/K-Phoen/grabana/variable/interval" + "github.com/K-Phoen/grabana/variable/query" + "github.com/pkg/errors" +) + +const ( + ErrFailedToCreateDashboard = "failed to create dashboard" + ErrFailedToCreateFolder = "failed to create folder" +) + +// CLClusterDashboard is a dashboard for a Chainlink cluster +type CLClusterDashboard struct { + Name string + LokiDataSourceName string + PrometheusDataSourceName string + Folder string + GrafanaURL string + GrafanaToken string + opts []dashboard.Option + extendedOpts []dashboard.Option + builder dashboard.Builder +} + +// NewCLClusterDashboard returns a new dashboard for a Chainlink cluster, can be used as a base for more complex plugin based dashboards +func NewCLClusterDashboard(name, ldsn, pdsn, dbf, grafanaURL, grafanaToken string, opts []dashboard.Option) (*CLClusterDashboard, error) { + db := &CLClusterDashboard{ + Name: name, + Folder: dbf, + LokiDataSourceName: ldsn, + PrometheusDataSourceName: pdsn, + GrafanaURL: grafanaURL, + GrafanaToken: grafanaToken, + extendedOpts: opts, + } + if err := db.generate(); err != nil { + return db, err + } + return db, nil +} + +func (m *CLClusterDashboard) Opts() []dashboard.Option { + return m.opts +} + +// logsRowOption returns a row option for a node's logs with name and instance selector +func (m *CLClusterDashboard) logsRowOption(name, instanceSelector string) row.Option { + return row.WithLogs( + name, + logs.DataSource(m.LokiDataSourceName), + logs.Span(12), + logs.Height("300px"), + logs.Transparent(), + logs.WithLokiTarget(fmt.Sprintf(` + {namespace="${namespace}", app="app", instance="%s", container="node"} + `, instanceSelector)), + ) +} + +// timeseriesRowOption returns a row option for a timeseries with name, axis unit, query and legend template +func (m *CLClusterDashboard) timeseriesRowOption(name, axisUnit, query, legendTemplate string) row.Option { + var tsq timeseries.Option + if legendTemplate != "" { + tsq 
= timeseries.WithPrometheusTarget( + query, + prometheus.Legend(legendTemplate), + ) + } else { + tsq = timeseries.WithPrometheusTarget(query) + } + var au timeseries.Option + if axisUnit != "" { + au = timeseries.Axis( + axis.Unit(axisUnit), + ) + } else { + au = timeseries.Axis() + } + return row.WithTimeSeries( + name, + timeseries.Span(6), + timeseries.Height("300px"), + timeseries.DataSource(m.PrometheusDataSourceName), + au, + tsq, + ) +} + +// statRowOption returns a row option for a stat with name, prometheus target and legend template +func (m *CLClusterDashboard) statRowOption(name, target, legend string) row.Option { + return row.WithStat( + name, + stat.Transparent(), + stat.DataSource(m.PrometheusDataSourceName), + stat.Text(stat.TextValueAndName), + stat.Orientation(stat.OrientationVertical), + stat.TitleFontSize(12), + stat.ValueFontSize(20), + stat.Span(12), + stat.Height("100px"), + stat.WithPrometheusTarget(target, prometheus.Legend(legend)), + ) +} + +// generate generates the dashboard, adding extendedOpts to the default options +func (m *CLClusterDashboard) generate() error { + opts := []dashboard.Option{ + dashboard.AutoRefresh("10s"), + dashboard.Tags([]string{"generated"}), + dashboard.VariableAsQuery( + "namespace", + query.DataSource(m.LokiDataSourceName), + query.Multiple(), + query.IncludeAll(), + query.Request(fmt.Sprintf("label_values(%s)", "namespace")), + query.Sort(query.NumericalAsc), + ), + dashboard.VariableAsInterval( + "interval", + interval.Values([]string{"30s", "1m", "5m", "10m", "30m", "1h", "6h", "12h"}), + ), + dashboard.Row( + "Cluster health", + m.statRowOption( + "App Version", + `version{namespace="${namespace}"}`, + "{{pod}} - {{version}}", + ), + row.WithTimeSeries( + "Restarts", + timeseries.Span(12), + timeseries.Height("200px"), + timeseries.DataSource(m.PrometheusDataSourceName), + timeseries.WithPrometheusTarget( + `sum(increase(kube_pod_container_status_restarts_total{namespace=~"${namespace}"}[5m])) by 
(pod)`, + prometheus.Legend("{{pod}}"), + ), + ), + row.WithTimeSeries( + "Service Components Health", + timeseries.Span(12), + timeseries.Height("200px"), + timeseries.DataSource(m.PrometheusDataSourceName), + timeseries.WithPrometheusTarget( + `health{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - {{service_id}}"), + ), + ), + row.WithTimeSeries( + "Log Counters", + timeseries.Span(12), + timeseries.Height("200px"), + timeseries.DataSource(m.PrometheusDataSourceName), + timeseries.WithPrometheusTarget( + `log_panic_count{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - panic"), + ), + timeseries.WithPrometheusTarget( + `log_fatal_count{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - fatal"), + ), + timeseries.WithPrometheusTarget( + `log_critical_count{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - critical"), + ), + timeseries.WithPrometheusTarget( + `log_error_count{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - error"), + ), + ), + row.WithTimeSeries( + "ETH Balance", + timeseries.Span(12), + timeseries.Height("200px"), + timeseries.DataSource(m.PrometheusDataSourceName), + timeseries.WithPrometheusTarget( + `eth_balance{namespace="${namespace}"}`, + prometheus.Legend("{{pod}} - {{account}}"), + ), + ), + ), + // logs + dashboard.Row( + "Logs", + row.Collapse(), + m.logsRowOption("Node 1", "node-1"), + m.logsRowOption("Node 2", "node-2"), + m.logsRowOption("Node 3", "node-3"), + m.logsRowOption("Node 4", "node-4"), + ), + // DON report metrics + dashboard.Row("DON Report metrics", + row.Collapse(), + m.timeseriesRowOption( + "Plugin Query() count", + "Count", + `sum(rate(ocr2_reporting_plugin_query_count{namespace="${namespace}", app="app"}[$__rate_interval])) by (service)`, + "", + ), + m.timeseriesRowOption( + "Plugin Observation() time (95th)", + "Sec", + `histogram_quantile(0.95, sum(rate(ocr2_reporting_plugin_observation_time_bucket{namespace="${namespace}", app="app"}[$__rate_interval])) by (le, 
service)) / 1e9`, + "", + ), + m.timeseriesRowOption( + "Plugin ShouldAcceptReport() time (95th)", + "Sec", + `histogram_quantile(0.95, sum(rate(ocr2_reporting_plugin_should_accept_report_time_bucket{namespace="${namespace}", app="app"}[$__rate_interval])) by (le, service)) / 1e9`, + "", + ), + m.timeseriesRowOption( + "Plugin Report() time (95th)", + "Sec", + `histogram_quantile(0.95, sum(rate(ocr2_reporting_plugin_report_time_bucket{namespace="${namespace}", app="app"}[$__rate_interval])) by (le, service)) / 1e9`, + "", + ), + m.timeseriesRowOption( + "Plugin ShouldTransmitReport() time (95th)", + "Sec", + `histogram_quantile(0.95, sum(rate(ocr2_reporting_plugin_should_transmit_report_time_bucket{namespace="${namespace}", app="app"}[$__rate_interval])) by (le, service)) / 1e9`, + "", + ), + ), + dashboard.Row( + "DB Connection Metrics (App)", + row.Collapse(), + m.timeseriesRowOption( + "DB Connections MAX", + "Conn", + `db_conns_max{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "DB Connections Open", + "Conn", + `db_conns_open{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "DB Connections Used", + "Conn", + `db_conns_used{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "DB Connections Wait", + "Conn", + `db_conns_wait{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "DB Wait time", + "Sec", + `db_wait_time_seconds{namespace="${namespace}"}`, + "{{pod}}", + ), + ), + dashboard.Row( + "EVM Pool RPC Node Metrics (App)", + row.Collapse(), + m.timeseriesRowOption( + "EVM Pool RPC Node Calls Success", + "", + `evm_pool_rpc_node_calls_success{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Calls Total", + "", + `evm_pool_rpc_node_calls_total{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Dials Success", + "", + `evm_pool_rpc_node_dials_success{namespace="${namespace}"}`, + 
"{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Dials Total", + "", + `evm_pool_rpc_node_dials_total{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Highest Seen Block", + "", + `evm_pool_rpc_node_highest_seen_block{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Total Transitions to Alive", + "", + `evm_pool_rpc_node_num_transitions_to_alive{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Polls Success", + "", + `evm_pool_rpc_node_polls_success{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Polls Total", + "", + `evm_pool_rpc_node_polls_total{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node States", + "", + `evm_pool_rpc_node_states{namespace="${namespace}"}`, + "{{pod}} - {{evmChainID}} - {{state}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Verifies Total", + "", + `evm_pool_rpc_node_verifies{namespace="${namespace}"}`, + "{{pod}} - {{evmChainID}}", + ), + m.timeseriesRowOption( + "EVM Pool RPC Node Verifies Success", + "", + `evm_pool_rpc_node_verifies_success{namespace="${namespace}"}`, + "{{pod}} - {{evmChainID}}", + ), + ), + dashboard.Row( + "EVM Pool RPC Node Latencies (App)", + row.Collapse(), + m.timeseriesRowOption( + "EVM Pool RPC Node Calls Latency 0.95 quantile", + "ms", + `histogram_quantile(0.95, sum(rate(evm_pool_rpc_node_rpc_call_time_bucket{namespace="${namespace}"}[$__rate_interval])) by (le, rpcCallName)) / 1e6`, + "{{pod}}", + ), + ), + dashboard.Row( + "Pipeline Tasks Metrics (App)", + row.Collapse(), + m.timeseriesRowOption( + "Pipeline Runs Queued", + "", + `pipeline_runs_queued{namespace="${namespace}"}`, + "{{pod}}", + ), + m.timeseriesRowOption( + "Pipeline Runs Tasks Queued", + "", + `pipeline_task_runs_queued{namespace="${namespace}"}`, + "{{pod}}", + ), + ), + } + opts = append(opts, m.extendedOpts...) 
+ builder, err := dashboard.New( + "Chainlink Cluster Dashboard", + opts..., + ) + m.opts = opts + m.builder = builder + return err +} + +// Deploy deploys the dashboard to Grafana +func (m *CLClusterDashboard) Deploy() error { + ctx := context.Background() + client := grabana.NewClient(&http.Client{}, m.GrafanaURL, grabana.WithAPIToken(m.GrafanaToken)) + folder, err := client.FindOrCreateFolder(ctx, m.Folder) + if err != nil { + return errors.Wrap(err, ErrFailedToCreateFolder) + } + if _, err := client.UpsertDashboard(ctx, folder, m.builder); err != nil { + return errors.Wrap(err, ErrFailedToCreateDashboard) + } + return nil +} diff --git a/charts/chainlink-cluster/devspace.yaml b/charts/chainlink-cluster/devspace.yaml index 54b5f9f01e9..688660d918e 100644 --- a/charts/chainlink-cluster/devspace.yaml +++ b/charts/chainlink-cluster/devspace.yaml @@ -32,6 +32,7 @@ images: deployments: app: helm: + releaseName: "app" chart: name: cl-cluster path: . @@ -68,6 +69,7 @@ deployments: - name: node-4 image: ${DEVSPACE_IMAGE} version: latest + prometheusMonitor: "true" podAnnotations: { } nodeSelector: { } tolerations: [ ] diff --git a/charts/chainlink-cluster/go.mod b/charts/chainlink-cluster/go.mod new file mode 100644 index 00000000000..990ebbf713e --- /dev/null +++ b/charts/chainlink-cluster/go.mod @@ -0,0 +1,15 @@ +module github.com/smartcontractkit/chainlink/v2/dashboard + +go 1.21 + +require ( + github.com/K-Phoen/grabana v0.21.19 + github.com/pkg/errors v0.9.1 +) + +require ( + github.com/K-Phoen/sdk v0.12.3 // indirect + github.com/gosimple/slug v1.13.1 // indirect + github.com/gosimple/unidecode v1.0.1 // indirect + github.com/prometheus/common v0.39.0 // indirect +) diff --git a/charts/chainlink-cluster/go.sum b/charts/chainlink-cluster/go.sum new file mode 100644 index 00000000000..093a42d6081 --- /dev/null +++ b/charts/chainlink-cluster/go.sum @@ -0,0 +1,20 @@ +github.com/K-Phoen/grabana v0.21.19 h1:tJjRO8nN9JrFjLoQGtOB9P5ILoqENZZGAtt3nK+Ry2Y= 
+github.com/K-Phoen/grabana v0.21.19/go.mod h1:B7gxVxacQUgHWmgqduf4WPZoKYHO1mvZnRVCoyQiwdw= +github.com/K-Phoen/sdk v0.12.3 h1:ScutEQASc9VEKJCm3OjIMD82BIS9B2XtNg3gEf6Gs+M= +github.com/K-Phoen/sdk v0.12.3/go.mod h1:qmM0wO23CtoDux528MXPpYvS4XkRWkWX6rvX9Za8EVU= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gosimple/slug v1.13.1 h1:bQ+kpX9Qa6tHRaK+fZR0A0M2Kd7Pa5eHPPsb1JpHD+Q= +github.com/gosimple/slug v1.13.1/go.mod h1:UiRaFH+GEilHstLUmcBgWcI42viBN7mAb818JrYOeFQ= +github.com/gosimple/unidecode v1.0.1 h1:hZzFTMMqSswvf0LBJZCZgThIZrpDHFXux9KeGmn6T/o= +github.com/gosimple/unidecode v1.0.1/go.mod h1:CP0Cr1Y1kogOtx0bJblKzsVWrqYaqfNOnHzpgWw4Awc= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/common v0.39.0 h1:oOyhkDq05hPZKItWVBkJ6g6AtGxi+fy7F4JvUV8uhsI= +github.com/prometheus/common v0.39.0/go.mod h1:6XBZ7lYdLCbkAVhwRsWTZn+IN5AB9F/NXd5w0BbEX0Y= +github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= +github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/charts/chainlink-cluster/templates/chainlink-cm.yaml b/charts/chainlink-cluster/templates/chainlink-cm.yaml index fa9c2c2657d..736a3322048 100644 --- a/charts/chainlink-cluster/templates/chainlink-cm.yaml +++ b/charts/chainlink-cluster/templates/chainlink-cm.yaml @@ -29,10 +29,9 @@ data: [OCR] Enabled = true [P2P] - [P2P.V1] + [P2P.V2] Enabled = true - 
ListenIP = '0.0.0.0' - ListenPort = 6690 + ListenAddresses = ["0.0.0.0:6690"] [[EVM]] ChainID = '1337' MinContractPayment = '0' diff --git a/charts/chainlink-cluster/templates/chainlink-deployment.yaml b/charts/chainlink-cluster/templates/chainlink-deployment.yaml index 16665916f59..b434c9894b0 100644 --- a/charts/chainlink-cluster/templates/chainlink-deployment.yaml +++ b/charts/chainlink-cluster/templates/chainlink-deployment.yaml @@ -37,6 +37,8 @@ spec: {{- end }} annotations: prometheus.io/scrape: 'true' + app.kubernetes.io/managed-by: "Helm" + meta.helm.sh/release-namespace: "{{ $.Release.Namespace }}" {{- range $key, $value := $.Values.podAnnotations }} {{ $key }}: {{ $value | quote }} {{- end }} diff --git a/charts/chainlink-cluster/templates/chainlink-pod-monitor.yaml b/charts/chainlink-cluster/templates/chainlink-pod-monitor.yaml index 4a74ff6c454..2cd9c3df2b6 100644 --- a/charts/chainlink-cluster/templates/chainlink-pod-monitor.yaml +++ b/charts/chainlink-cluster/templates/chainlink-pod-monitor.yaml @@ -1,16 +1,18 @@ -{{- range $cfg := .Values.chainlink.nodes }} {{- if $.Values.prometheusMonitor }} apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: + name: {{ $.Release.Name }}-pod-monitor labels: release: grafana-agent spec: - selector: - matchLabels: - release: {{ $.Release.Name }}-{{ $cfg.name }} + namespaceSelector: + matchNames: + - "cl-cluster" podMetricsEndpoints: - port: access - {{- end }} ---- -{{- end }} \ No newline at end of file + selector: + matchLabels: + app: {{ $.Release.Name }} +{{- end }} +--- \ No newline at end of file diff --git a/sonar-project.properties b/sonar-project.properties index ea142ce17fe..c40b5f361e1 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -12,4 +12,4 @@ sonar.cpd.exclusions=**/contracts/**/*.sol, **/config.go, /core/services/ocr2/pl # Tests' root folder, inclusions (tests to check and count) and exclusions sonar.tests=. 
sonar.test.inclusions=**/*_test.go, **/*.test.ts -sonar.test.exclusions=**/integration-tests/**/* +sonar.test.exclusions=**/integration-tests/**/*, **/charts/chainlink-cluster/dashboard/cmd/*