Skip to content

Commit

Permalink
mixin: Add SR to remote_write dashboard (#1100)
Browse files Browse the repository at this point in the history
  • Loading branch information
thampiotr authored Jun 21, 2024
1 parent d616a70 commit 78b1581
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 57 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ Main (unreleased)

- Update Public preview `remotecfg` argument from `metadata` to `attributes`. (@erikbaranowski)

### Enhancements

- Added a success rate panel on the Prometheus Components dashboard. (@thampiotr)

v1.2.0-rc.0
-----------------

Expand Down
5 changes: 1 addition & 4 deletions example/images/grizzly/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
FROM golang:1.22-alpine

# NOTE(rfratto): Versions of grr newer than v0.2.1 don't work with our Grizzly
# jsonnet files, reporting that the Alloy dashboard folder "cannot be
# found" after creating it.
RUN go install github.com/grafana/grizzly/cmd/grr@v0.2.1
RUN go install github.com/grafana/grizzly/cmd/[email protected]
2 changes: 1 addition & 1 deletion example/images/jb/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
FROM golang:1.22-alpine

RUN go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@c862f0670eb199b5024e31ff024f39b74d3b803a
RUN go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1
6 changes: 4 additions & 2 deletions operations/alloy-mixin/dashboards/alloy-logs.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main
{

local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'],
local dashboardName = 'alloy-logs.json',

grafanaDashboards+:
if $._config.enableLokiLogs then {
Expand All @@ -28,9 +29,10 @@ local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main
{
logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links)
+ g.dashboard.withRefresh('10s')
+ g.dashboard.withTagsMixin($._config.dashboardTag),
+ g.dashboard.withTagsMixin($._config.dashboardTag)
+ g.dashboard.withUid(std.md5(dashboardName)),
},
},
'alloy-logs.json': alloyLogs.dashboards.logs,
[dashboardName]: alloyLogs.dashboards.logs,
} else {},
}
2 changes: 1 addition & 1 deletion operations/alloy-mixin/dashboards/cluster-node.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ local filename = 'alloy-cluster-node.json';
[filename]:
dashboard.new(name='Alloy / Cluster Node', tag=$._config.dashboardTag) +
dashboard.withDocsLink(
url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode',
url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering',
desc='Clustering documentation',
) +
dashboard.withDashboardsLink(tag=$._config.dashboardTag) +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ local cluster_node_filename = 'alloy-cluster-node.json';
[filename]:
dashboard.new(name='Alloy / Cluster Overview', tag=$._config.dashboardTag) +
dashboard.withDocsLink(
url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode',
url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering',
desc='Clustering documentation',
) +
dashboard.withDashboardsLink(tag=$._config.dashboardTag) +
Expand Down
120 changes: 75 additions & 45 deletions operations/alloy-mixin/dashboards/prometheus.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,72 @@ local filename = 'alloy-prometheus-remote-write.json';
local remoteWritePanels(y_offset) = [
panel.newRow(title='prometheus.remote_write', y=y_offset),

// Remote write success rate
(
panel.new(title='Remote write success rate in $cluster', type='timeseries') +
panel.withUnit('percentunit') +
panel.withDescription(|||
Percentage of samples sent by prometheus.remote_write that succeeded.
Low success rates can indicate a problem with Alloy or the remote storage.
|||) +
panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr=|||
(
1 -
(
sum(rate(prometheus_remote_storage_samples_failed_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]))
)
/
(
sum(rate(prometheus_remote_storage_samples_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]))
)
)
||| % $._config,
legendFormat='% of samples successfully sent',
),
])
),

// Write latency
(
panel.new(title='Write latency in $cluster', type='timeseries') +
panel.withUnit('s') +
panel.withDescription(|||
Latency of writes to the remote system made by
prometheus.remote_write.
|||) +
panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
histogram_quantile(0.99, sum by (le) (
rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
))
||| % $._config,
legendFormat='99th percentile',
),
panel.newQuery(
expr= |||
histogram_quantile(0.50, sum by (le) (
rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
))
||| % $._config,
legendFormat='50th percentile',
),
panel.newQuery(
expr= |||
sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) /
sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))
||| % $._config,
legendFormat='Average',
),
])
),


// WAL delay
(
panel.new(title='WAL delay', type='timeseries') +
Expand All @@ -103,7 +169,7 @@ local filename = 'alloy-prometheus-remote-write.json';
the WAL delay continues to increase beyond that amount, try
increasing the number of maximum shards.
|||) +
panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) +
panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -127,7 +193,7 @@ local filename = 'alloy-prometheus-remote-write.json';
Rate of data containing samples and metadata sent by
prometheus.remote_write.
|||) +
panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) +
panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -141,42 +207,6 @@ local filename = 'alloy-prometheus-remote-write.json';
])
),

// Write latency
(
panel.new(title='Write latency', type='timeseries') +
panel.withUnit('s') +
panel.withDescription(|||
Latency of writes to the remote system made by
prometheus.remote_write.
|||) +
panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
histogram_quantile(0.99, sum by (le) (
rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
))
||| % $._config,
legendFormat='99th percentile',
),
panel.newQuery(
expr= |||
histogram_quantile(0.50, sum by (le) (
rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval])
))
||| % $._config,
legendFormat='50th percentile',
),
panel.newQuery(
expr= |||
sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) /
sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))
||| % $._config,
legendFormat='Average',
),
])
),

// Shards
(
local minMaxOverride = {
Expand Down Expand Up @@ -220,7 +250,7 @@ local filename = 'alloy-prometheus-remote-write.json';
shards; filter to a specific URL to display more granular
information.
|||) +
panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) +
panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand Down Expand Up @@ -257,7 +287,7 @@ local filename = 'alloy-prometheus-remote-write.json';
panel.withDescription(|||
Total outgoing samples sent by prometheus.remote_write.
|||) +
panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -279,7 +309,7 @@ local filename = 'alloy-prometheus-remote-write.json';
Rate of samples which prometheus.remote_write could not send due to
non-recoverable errors.
|||) +
panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -301,7 +331,7 @@ local filename = 'alloy-prometheus-remote-write.json';
Rate of samples which prometheus.remote_write attempted to resend
after receiving a recoverable error.
|||) +
panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand Down Expand Up @@ -331,7 +361,7 @@ local filename = 'alloy-prometheus-remote-write.json';
received a sample for. Active series are garbage collected whenever a
truncation of the WAL occurs.
|||) +
panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 0, y: 31 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -354,7 +384,7 @@ local filename = 'alloy-prometheus-remote-write.json';
received a sample for. Active series are garbage collected whenever a
truncation of the WAL occurs.
|||) +
panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 8, y: 31 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand All @@ -377,7 +407,7 @@ local filename = 'alloy-prometheus-remote-write.json';
received a sample for. Active series are garbage collected whenever a
truncation of the WAL occurs.
|||) +
panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) +
panel.withPosition({ x: 16, y: 31 + y_offset, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr= |||
Expand Down
9 changes: 6 additions & 3 deletions operations/alloy-mixin/grizzly.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
// Then, run `grr watch . ./grizzly.jsonnet` from this directory to watch the
// mixin and continually deploy all dashboards.
//

(import './grizzly/dashboards.jsonnet')

// By default, only dashboards get deployed; not alerts or recording rules.
// To deploy alerts and recording rules, set up the environment variables used
// by cortextool to authenticate with a Prometheus or Alertmanager instance.
// by cortextool to authenticate with a Prometheus or Alertmanager instance and
// uncomment the line below.

(import './grizzly/dashboards.jsonnet') +
(import './grizzly/alerts.jsonnet')
//+ (import './grizzly/alerts.jsonnet')

0 comments on commit 78b1581

Please sign in to comment.