Skip to content

Commit

Permalink
systemd.go: Added watchdog metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Jonathan Davies <[email protected]>
  • Loading branch information
jpds committed Jan 11, 2024
1 parent f925935 commit 027a5a0
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 18 deletions.
38 changes: 21 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,23 +83,27 @@ PromQL grouping queries (e.g. `count(systemd_unit_state) by (type)`)

Note that a number of unit types are filtered by default

| Metric name | Metric type | Status | Cardinality |
| ----------------------------------------- | ----------- | -------- | ------------------------------------------------------------------ |
| systemd_exporter_build_info | Gauge | UNSTABLE | 1 per systemd-exporter |
| systemd_unit_info | Gauge | UNSTABLE | 1 per service + 1 per mount |
| systemd_unit_state | Gauge | UNSTABLE | 5 per unit {state="activating/active/deactivating/failed/inactive} |
| systemd_unit_tasks_current | Gauge | UNSTABLE | 1 per service |
| systemd_unit_tasks_max | Gauge | UNSTABLE | 1 per service |
| systemd_unit_start_time_seconds | Gauge | UNSTABLE | 1 per service |
| systemd_service_restart_total | Gauge | UNSTABLE | 1 per service |
| systemd_service_ip_ingress_bytes | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_egress_bytes | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_ingress_packets_total | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_egress_packets_total | Counter | UNSTABLE | 1 per service |
| systemd_socket_accepted_connections_total | Counter | UNSTABLE | 1 per socket |
| systemd_socket_current_connections | Gauge | UNSTABLE | 1 per socket |
| systemd_socket_refused_connections_total | Gauge | UNSTABLE | 1 per socket |
| systemd_timer_last_trigger_seconds | Gauge | UNSTABLE | 1 per timer |
| Metric name | Metric type | Status | Cardinality |
| -------------------------------------------- | ----------- | -------- | ------------------------------------------------------------------ |
| systemd_exporter_build_info | Gauge | UNSTABLE | 1 per systemd-exporter |
| systemd_unit_info | Gauge | UNSTABLE | 1 per service + 1 per mount |
| systemd_unit_state | Gauge | UNSTABLE | 5 per unit {state="activating/active/deactivating/failed/inactive"} |
| systemd_unit_tasks_current | Gauge | UNSTABLE | 1 per service |
| systemd_unit_tasks_max | Gauge | UNSTABLE | 1 per service |
| systemd_unit_start_time_seconds | Gauge | UNSTABLE | 1 per service |
| systemd_service_restart_total | Gauge | UNSTABLE | 1 per service |
| systemd_service_ip_ingress_bytes | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_egress_bytes | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_ingress_packets_total | Counter | UNSTABLE | 1 per service |
| systemd_service_ip_egress_packets_total | Counter | UNSTABLE | 1 per service |
| systemd_socket_accepted_connections_total | Counter | UNSTABLE | 1 per socket |
| systemd_socket_current_connections | Gauge | UNSTABLE | 1 per socket |
| systemd_socket_refused_connections_total | Gauge | UNSTABLE | 1 per socket |
| systemd_timer_last_trigger_seconds | Gauge | UNSTABLE | 1 per timer |
| systemd_watchdog_enabled | Gauge | UNSTABLE | 1 (only 1 watchdog configurable) |
| systemd_watchdog_last_ping_monotonic_seconds | Gauge | UNSTABLE | 1 |
| systemd_watchdog_last_ping_time_seconds | Gauge | UNSTABLE | 1 |
| systemd_watchdog_runtime_seconds | Gauge | UNSTABLE | 1 |

## Configuration

Expand Down
105 changes: 104 additions & 1 deletion systemd/systemd.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
)

const (
	// namespace is passed as the namespace component to prometheus.BuildFQName
	// when metric names are assembled.
	namespace = "systemd"

	// watchdogSubsystem is passed as the subsystem component to
	// prometheus.BuildFQName for the watchdog metrics.
	watchdogSubsystem = "watchdog"
)

var (
unitInclude = kingpin.Flag("systemd.collector.unit-include", "Regexp of systemd units to include. Units must both match include and not match exclude to be included.").Default(".+").String()
Expand Down Expand Up @@ -81,6 +82,10 @@ type Collector struct {
ipEgressBytes *prometheus.Desc
ipIngressPackets *prometheus.Desc
ipEgressPackets *prometheus.Desc
watchdogEnabled *prometheus.Desc
watchdogLastPingMonotonic *prometheus.Desc
watchdogLastPingTimestamp *prometheus.Desc
watchdogRuntimeSeconds *prometheus.Desc

unitIncludePattern *regexp.Regexp
unitExcludePattern *regexp.Regexp
Expand Down Expand Up @@ -198,6 +203,26 @@ func NewCollector(logger log.Logger) (*Collector, error) {
"Service unit egress IP accounting in packets.",
[]string{"name"}, nil,
)
watchdogEnabled := prometheus.NewDesc(
prometheus.BuildFQName(namespace, watchdogSubsystem, "enabled"),
"systemd watchdog enabled",
nil, nil,
)
watchdogLastPingMonotonic := prometheus.NewDesc(
prometheus.BuildFQName(namespace, watchdogSubsystem, "last_ping_monotonic_seconds"),
"systemd watchdog last ping monotonic seconds",
[]string{"device"}, nil,
)
watchdogLastPingTimestamp := prometheus.NewDesc(
prometheus.BuildFQName(namespace, watchdogSubsystem, "last_ping_time_seconds"),
"systemd watchdog last ping time seconds",
[]string{"device"}, nil,
)
watchdogRuntimeSeconds := prometheus.NewDesc(
prometheus.BuildFQName(namespace, watchdogSubsystem, "runtime_seconds"),
"systemd watchdog runtime seconds",
[]string{"device"}, nil,
)
unitIncludePattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitInclude))
unitExcludePattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitExclude))

Expand Down Expand Up @@ -229,6 +254,10 @@ func NewCollector(logger log.Logger) (*Collector, error) {
ipEgressPackets: ipEgressPackets,
unitIncludePattern: unitIncludePattern,
unitExcludePattern: unitExcludePattern,
watchdogEnabled: watchdogEnabled,
watchdogLastPingMonotonic: watchdogLastPingMonotonic,
watchdogLastPingTimestamp: watchdogLastPingTimestamp,
watchdogRuntimeSeconds: watchdogRuntimeSeconds,
}, nil
}

Expand Down Expand Up @@ -259,7 +288,10 @@ func (c *Collector) Describe(desc chan<- *prometheus.Desc) {
desc <- c.ipEgressBytes
desc <- c.ipIngressPackets
desc <- c.ipEgressPackets

desc <- c.watchdogEnabled
desc <- c.watchdogLastPingMonotonic
desc <- c.watchdogLastPingTimestamp
desc <- c.watchdogRuntimeSeconds
}

func parseUnitType(unit dbus.UnitStatus) string {
Expand All @@ -280,6 +312,11 @@ func (c *Collector) collect(ch chan<- prometheus.Metric) error {
level.Debug(c.logger).Log("msg", "Failed to collect boot stage timestamps", "err", err)
}

err = c.collectWatchdogMetrics(conn, ch)
if err != nil {
level.Debug(c.logger).Log("msg", "Failed to collect watchdog metrics", "err", err)
}

allUnits, err := conn.ListUnitsContext(c.ctx)
if err != nil {
return errors.Wrap(err, "could not get list of systemd units from dbus")
Expand Down Expand Up @@ -688,3 +725,69 @@ func (c *Collector) filterUnits(units []dbus.UnitStatus, includePattern, exclude

return filtered
}

// collectWatchdogMetrics exports the systemd manager's watchdog state.
//
// The manager's WatchdogDevice property is read first. When it is empty, only
// the watchdogEnabled metric is sent (value 0) and nil is returned. Otherwise
// the two last-ping timestamps and the runtime watchdog timeout are read,
// converted from microseconds to seconds, and sent labelled with the device
// name, together with watchdogEnabled set to 1.
//
// For a configured watchdog nothing is sent unless all three values were read
// and parsed successfully; the first failure is returned with the name of the
// offending property attached.
func (c *Collector) collectWatchdogMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric) error {
	watchdogDevice, err := conn.GetManagerProperty("WatchdogDevice")
	if err != nil {
		return fmt.Errorf("reading manager property WatchdogDevice: %w", err)
	}

	// The property value arrives wrapped in double quotes; strip them so the
	// bare device path can be used as a label value.
	watchdogDeviceString := strings.TrimPrefix(strings.TrimSuffix(watchdogDevice, `"`), `"`)

	if len(watchdogDeviceString) == 0 {
		ch <- prometheus.MustNewConstMetric(
			c.watchdogEnabled, prometheus.GaugeValue,
			0)

		level.Debug(c.logger).Log("msg", "No watchdog configured, ignoring metrics")
		return nil
	}

	// Gather every value before sending anything, so a failure part-way
	// through never leaves a partial set of watchdog metrics on the channel.
	lastPingMonotonic, err := watchdogPropertySeconds(conn, "WatchdogLastPingTimestampMonotonic")
	if err != nil {
		return err
	}

	lastPingTimestamp, err := watchdogPropertySeconds(conn, "WatchdogLastPingTimestamp")
	if err != nil {
		return err
	}

	runtimeWatchdog, err := watchdogPropertySeconds(conn, "RuntimeWatchdogUSec")
	if err != nil {
		return err
	}

	ch <- prometheus.MustNewConstMetric(
		c.watchdogEnabled, prometheus.GaugeValue,
		1)

	ch <- prometheus.MustNewConstMetric(
		c.watchdogLastPingMonotonic, prometheus.GaugeValue,
		lastPingMonotonic, watchdogDeviceString)

	ch <- prometheus.MustNewConstMetric(
		c.watchdogLastPingTimestamp, prometheus.GaugeValue,
		lastPingTimestamp, watchdogDeviceString)

	ch <- prometheus.MustNewConstMetric(
		c.watchdogRuntimeSeconds, prometheus.GaugeValue,
		runtimeWatchdog, watchdogDeviceString)

	return nil
}

// watchdogPropertySeconds reads a microsecond-valued systemd manager property
// and returns it converted to seconds.
func watchdogPropertySeconds(conn *dbus.Conn, property string) (float64, error) {
	raw, err := conn.GetManagerProperty(property)
	if err != nil {
		return 0, fmt.Errorf("reading manager property %s: %w", property, err)
	}

	// GetManagerProperty returns the textual form of the D-Bus variant. For
	// uint64 values that text carries a type annotation ("@t 1234"), which
	// strconv cannot parse, so drop the annotation before converting.
	usec, err := strconv.ParseFloat(strings.TrimPrefix(raw, "@t "), 64)
	if err != nil {
		return 0, fmt.Errorf("parsing manager property %s value %q: %w", property, raw, err)
	}

	return usec / 1e6, nil
}

0 comments on commit 027a5a0

Please sign in to comment.