diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac53c46a81..533ddfd269 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,10 @@ Main (unreleased)

 - Update Public preview `remotecfg` argument from `metadata` to `attributes`. (@erikbaranowski)

+### Enhancements
+
+- Added a success rate panel on the Prometheus Components dashboard. (@thampiotr)
+
 v1.2.0-rc.0
 -----------------

diff --git a/example/images/grizzly/Dockerfile b/example/images/grizzly/Dockerfile
index b98841fb67..25be31db0b 100644
--- a/example/images/grizzly/Dockerfile
+++ b/example/images/grizzly/Dockerfile
@@ -1,6 +1,3 @@
 FROM golang:1.22-alpine

-# NOTE(rfratto): Versions of grr newer than v0.2.1 don't work with our Grizzly
-# jsonnet files, reporting that the the Alloy dashboard folder "cannot be
-# found" after creating it.
-RUN go install github.com/grafana/grizzly/cmd/grr@v0.2.1
+RUN go install github.com/grafana/grizzly/cmd/grr@v0.4.3
diff --git a/example/images/jb/Dockerfile b/example/images/jb/Dockerfile
index 82543f33d8..786324a42e 100644
--- a/example/images/jb/Dockerfile
+++ b/example/images/jb/Dockerfile
@@ -1,3 +1,3 @@
 FROM golang:1.22-alpine

-RUN go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@c862f0670eb199b5024e31ff024f39b74d3b803a
+RUN go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1
diff --git a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet
index 9ce96c8b1e..b821bc914d 100644
--- a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet
+++ b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet
@@ -4,6 +4,7 @@ local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main
 {
   local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'],
+  local dashboardName = 'alloy-logs.json',

   grafanaDashboards+:
     if $._config.enableLokiLogs then {

@@ -28,9 +29,10 @@
       {
         logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links)
                + g.dashboard.withRefresh('10s')
-               + g.dashboard.withTagsMixin($._config.dashboardTag),
+               + g.dashboard.withTagsMixin($._config.dashboardTag)
+               + g.dashboard.withUid(std.md5(dashboardName)),
       },
     },
-    'alloy-logs.json': alloyLogs.dashboards.logs,
+    [dashboardName]: alloyLogs.dashboards.logs,
   } else {},
 }
diff --git a/operations/alloy-mixin/dashboards/cluster-node.libsonnet b/operations/alloy-mixin/dashboards/cluster-node.libsonnet
index 9abac50560..cc5046132d 100644
--- a/operations/alloy-mixin/dashboards/cluster-node.libsonnet
+++ b/operations/alloy-mixin/dashboards/cluster-node.libsonnet
@@ -14,7 +14,7 @@ local filename = 'alloy-cluster-node.json';
     [filename]:
       dashboard.new(name='Alloy / Cluster Node', tag=$._config.dashboardTag) +
       dashboard.withDocsLink(
-        url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode',
+        url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering',
         desc='Clustering documentation',
       ) +
       dashboard.withDashboardsLink(tag=$._config.dashboardTag) +
diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
index 9b07ec8d4f..06ad02b552 100644
--- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
+++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet
@@ -15,7 +15,7 @@ local cluster_node_filename = 'alloy-cluster-node.json';
     [filename]:
      dashboard.new(name='Alloy / Cluster Overview', tag=$._config.dashboardTag) +
       dashboard.withDocsLink(
-        url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode',
+        url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering',
         desc='Clustering documentation',
       ) +
       dashboard.withDashboardsLink(tag=$._config.dashboardTag) +
diff --git a/operations/alloy-mixin/dashboards/prometheus.libsonnet b/operations/alloy-mixin/dashboards/prometheus.libsonnet
index 79e26ee3d0..879e6afb9b 100644
--- a/operations/alloy-mixin/dashboards/prometheus.libsonnet
+++ b/operations/alloy-mixin/dashboards/prometheus.libsonnet
@@ -87,6 +87,72 @@ local filename = 'alloy-prometheus-remote-write.json';
 local remoteWritePanels(y_offset) = [
   panel.newRow(title='prometheus.remote_write', y=y_offset),

+  // Remote write success rate
+  (
+    panel.new(title='Remote write success rate in $cluster', type='timeseries') +
+    panel.withUnit('percentunit') +
+    panel.withDescription(|||
+      Percentage of samples sent by prometheus.remote_write that succeeded.
+
+      Low success rates can indicate a problem with Alloy or the remote storage.
+    |||) +
+    panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) +
+    panel.withQueries([
+      panel.newQuery(
+        expr=|||
+          (
+            1 -
+            (
+              sum(rate(prometheus_remote_storage_samples_failed_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]))
+            )
+            /
+            (
+              sum(rate(prometheus_remote_storage_samples_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]))
+            )
+          )
+        ||| % $._config,
+        legendFormat='% of samples successfully sent',
+      ),
+    ])
+  ),
+
+  // Write latency
+  (
+    panel.new(title='Write latency in $cluster', type='timeseries') +
+    panel.withUnit('s') +
+    panel.withDescription(|||
+      Latency of writes to the remote system made by
+      prometheus.remote_write.
+    |||) +
+    panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
+    panel.withQueries([
+      panel.newQuery(
+        expr= |||
+          histogram_quantile(0.99, sum by (le) (
+            rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
+          ))
+        ||| % $._config,
+        legendFormat='99th percentile',
+      ),
+      panel.newQuery(
+        expr= |||
+          histogram_quantile(0.50, sum by (le) (
+            rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
+          ))
+        ||| % $._config,
+        legendFormat='50th percentile',
+      ),
+      panel.newQuery(
+        expr= |||
+          sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) /
+          sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))
+        ||| % $._config,
+        legendFormat='Average',
+      ),
+    ])
+  ),
+
   // WAL delay
   (
     panel.new(title='WAL delay', type='timeseries') +
@@ -103,7 +169,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       the WAL delay continues to increase beyond that amount, try
       increasing the number of maximum shards.
     |||) +
-    panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) +
+    panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -127,7 +193,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       Rate of data containing samples and metadata sent by
       prometheus.remote_write.
     |||) +
-    panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) +
+    panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -141,42 +207,6 @@ local filename = 'alloy-prometheus-remote-write.json';
     ])
   ),

-  // Write latency
-  (
-    panel.new(title='Write latency', type='timeseries') +
-    panel.withUnit('s') +
-    panel.withDescription(|||
-      Latency of writes to the remote system made by
-      prometheus.remote_write.
-    |||) +
-    panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) +
-    panel.withQueries([
-      panel.newQuery(
-        expr= |||
-          histogram_quantile(0.99, sum by (le) (
-            rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
-          ))
-        ||| % $._config,
-        legendFormat='99th percentile',
-      ),
-      panel.newQuery(
-        expr= |||
-          histogram_quantile(0.50, sum by (le) (
-            rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
-          ))
-        ||| % $._config,
-        legendFormat='50th percentile',
-      ),
-      panel.newQuery(
-        expr= |||
-          sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) /
-          sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))
-        ||| % $._config,
-        legendFormat='Average',
-      ),
-    ])
-  ),
-
   // Shards
   (
     local minMaxOverride = {
@@ -220,7 +250,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       shards; filter to a specific URL to display more granular
       information.
     |||) +
-    panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) +
+    panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -257,7 +287,7 @@ local filename = 'alloy-prometheus-remote-write.json';
     panel.withDescription(|||
       Total outgoing samples sent by prometheus.remote_write.
     |||) +
-    panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -279,7 +309,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       Rate of samples which prometheus.remote_write could not send
       due to non-recoverable errors.
     |||) +
-    panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -301,7 +331,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       Rate of samples which prometheus.remote_write attempted to resend
       after receiving a recoverable error.
     |||) +
-    panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -331,7 +361,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       received a sample for.
      Active series are garbage collected whenever a truncation of the WAL occurs.
     |||) +
-    panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 0, y: 31 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -354,7 +384,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       received a sample for.
       Active series are garbage collected whenever a truncation of the WAL occurs.
     |||) +
-    panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 8, y: 31 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
@@ -377,7 +407,7 @@ local filename = 'alloy-prometheus-remote-write.json';
       received a sample for.
       Active series are garbage collected whenever a truncation of the WAL occurs.
     |||) +
-    panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) +
+    panel.withPosition({ x: 16, y: 31 + y_offset, w: 8, h: 10 }) +
     panel.withQueries([
       panel.newQuery(
         expr= |||
diff --git a/operations/alloy-mixin/grizzly.jsonnet b/operations/alloy-mixin/grizzly.jsonnet
index 5a6200260f..d71bedfd58 100644
--- a/operations/alloy-mixin/grizzly.jsonnet
+++ b/operations/alloy-mixin/grizzly.jsonnet
@@ -6,9 +6,12 @@
 // Then, run `grr watch . ./grizzly.jsonnet` from this directory to watch the
 // mixin and continually deploy all dashboards.
 //
+
+(import './grizzly/dashboards.jsonnet')
+
 // By default, only dashboards get deployed; not alerts or recording rules.
 // To deploy alerts and recording rules, set up the environment variables used
-// by cortextool to authenticate with a Prometheus or Alertmanager intance.
+// by cortextool to authenticate with a Prometheus or Alertmanager instance and
+// uncomment the line below.

-(import './grizzly/dashboards.jsonnet') +
-(import './grizzly/alerts.jsonnet')
+//+ (import './grizzly/alerts.jsonnet')