Skip to content

Commit

Permalink
Merge pull request mrlhansen#86 from mrlhansen/eventlog
Browse files Browse the repository at this point in the history
breaking changes: rewrote event log metrics
  • Loading branch information
mrlhansen authored Jul 19, 2024
2 parents e374d51 + e656d1c commit afbcd89
Show file tree
Hide file tree
Showing 15 changed files with 137 additions and 97 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ metrics:
system: true
sensors: true
power: true
sel: false
events: false
storage: false
memory: false
network: false
```
As shown in the above example, under `hosts` you can specify login information for individual hosts via their IP address; otherwise, the exporter will attempt to use the login information under `default`. The login user only needs read-only permissions. Under `metrics` you can select which kinds of metrics should be returned, as described in more detail below.

For a detailed description of the configuration, please see the [sample-config.yml](sample-config.yml) file.
**For a detailed description of the configuration, please see the [sample-config.yml](sample-config.yml) file.**

Because the metrics are collected on demand, it can take several minutes to scrape the metrics endpoint, depending on how many metric groups are selected in the configuration file. For this reason, you should carefully select the metrics of interest and make sure Prometheus is configured with a sufficiently high scrape timeout value.

Expand Down Expand Up @@ -126,10 +126,10 @@ idrac_power_control_interval_in_minutes{id="0",name="System Power Control"} 1
```

### System Event Log
On iDRAC only, the system event log can also be exported. This is not exactly an ordinary metric, but it is often convenient to be informed about new entries in the event log. The value of this metric is the unix timestamp for when the entry was created (as reported by iDRAC).
This is not exactly an ordinary metric, but it is often convenient to be informed about new entries in the event log. The value of this metric is the Unix timestamp at which the entry was created.

```text
idrac_sel_entry{id="1",message="The process of installing an operating system or hypervisor is successfully completed",component="BaseOSBoot/InstallationStatus",severity="OK"} 1631175352
idrac_events_log_entry{id="1",message="The process of installing an operating system or hypervisor is successfully completed",severity="OK"} 1631175352
```

### Storage
Expand Down
3 changes: 1 addition & 2 deletions charts/idrac-exporter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ idracConfig: |
address: 0.0.0.0 # Running in a container, this makes sense
port: 9348 # Listen port
timeout: 60 # HTTP timeout (in seconds) for Redfish API calls
retries: 10 # Number of retries before a target is marked as unreachable
hosts:
default:
username: IDRAC_USERNAME
Expand All @@ -95,7 +94,7 @@ idracConfig: |
system: true
sensors: true
power: true
sel: true # iDRAC only
events: true
storage: true
memory: true
network: true
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.20
require (
github.com/prometheus/client_golang v1.19.0
github.com/prometheus/common v0.48.0
github.com/xhit/go-str2duration/v2 v2.1.0
gopkg.in/yaml.v2 v2.4.0
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5E
github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
Expand Down
27 changes: 7 additions & 20 deletions grafana/idrac.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
"uid": "${datasource}"
},
"enable": true,
"expr": "idrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"expr": "idrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"iconColor": "purple",
"name": "SEL",
"tagKeys": "instance,component,severity",
"tagKeys": "instance,severity",
"textFormat": "{{ message }}",
"titleFormat": "{{id}}:{{ component }}",
"titleFormat": "{{ id }}",
"useValueForTime": "on"
}
]
Expand Down Expand Up @@ -2644,18 +2644,6 @@
}
]
},
{
"matcher": {
"id": "byName",
"options": "component"
},
"properties": [
{
"id": "custom.width",
"value": 217
}
]
},
{
"matcher": {
"id": "byName",
Expand Down Expand Up @@ -2699,10 +2687,10 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "idrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"expr": "idrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"format": "table",
"instant": true,
"legendFormat": "{{id}} {{component}}: {{message}}",
"legendFormat": "{{ id }}: {{ message }}",
"range": false,
"refId": "A"
}
Expand Down Expand Up @@ -2737,7 +2725,7 @@
"id": "filterFieldsByName",
"options": {
"include": {
"pattern": "message|id|component|severity|Value"
"pattern": "message|id|severity|Value"
}
}
},
Expand All @@ -2747,10 +2735,9 @@
"excludeByName": {},
"indexByName": {
"Value": 0,
"component": 4,
"id": 2,
"instance": 1,
"message": 5,
"message": 4,
"severity": 3
},
"renameByName": {
Expand Down
27 changes: 7 additions & 20 deletions grafana/idrac_overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
"uid": "${datasource}"
},
"enable": true,
"expr": "idrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"expr": "idrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"iconColor": "purple",
"name": "SEL",
"tagKeys": "instance,component,severity",
"tagKeys": "instance,severity",
"textFormat": "{{ message }}",
"titleFormat": "{{id}}:{{ component }}",
"titleFormat": "{{ id }}",
"useValueForTime": "on"
}
]
Expand Down Expand Up @@ -1828,18 +1828,6 @@
}
]
},
{
"matcher": {
"id": "byName",
"options": "component"
},
"properties": [
{
"id": "custom.width",
"value": 217
}
]
},
{
"matcher": {
"id": "byName",
Expand Down Expand Up @@ -1881,10 +1869,10 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "idrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_sel_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"expr": "idrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 < ${__to} and \nidrac_events_log_entry{job=~\"$job\", instance=~\"$instance\"} * 1000 > ${__from}",
"format": "table",
"instant": true,
"legendFormat": "{{id}} {{component}}: {{message}}",
"legendFormat": "{{ id }}: {{ message }}",
"range": false,
"refId": "A"
}
Expand Down Expand Up @@ -1919,7 +1907,7 @@
"id": "filterFieldsByName",
"options": {
"include": {
"pattern": "instance|message|id|component|severity|Value"
"pattern": "instance|message|id|severity|Value"
}
}
},
Expand All @@ -1929,10 +1917,9 @@
"excludeByName": {},
"indexByName": {
"Value": 0,
"component": 4,
"id": 2,
"instance": 1,
"message": 5,
"message": 4,
"severity": 3
},
"renameByName": {
Expand Down
3 changes: 1 addition & 2 deletions idrac.yml.template
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
address: 0.0.0.0 # Running in a container, this makes sense
port: 9348 # Listen port
timeout: 60 # HTTP timeout (in seconds) for Redfish API calls
retries: 10 # Number of retries before a target is marked as unreachable
hosts:
default:
username: "$IDRAC_USERNAME"
Expand All @@ -10,7 +9,7 @@ metrics:
system: true
sensors: true
power: true
sel: true
events: true
storage: true
memory: true
network: true
42 changes: 34 additions & 8 deletions internal/collector/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ type Client struct {
storagePath string
memoryPath string
networkPath string
eventPath string
}

func newHttpClient() *http.Client {
Expand Down Expand Up @@ -123,6 +124,16 @@ func (client *Client) findAllEndpoints() error {
client.vendor = H3C
}

// Path for event log
switch client.vendor {
case DELL:
client.eventPath = "/redfish/v1/Managers/iDRAC.Embedded.1/LogServices/Sel/Entries"
case LENOVO:
client.eventPath = "/redfish/v1/Systems/1/LogServices/PlatformLog/Entries"
case HPE:
client.eventPath = "/redfish/v1/Systems/1/LogServices/IML/Entries"
}

// Fix for Inspur bug
if client.vendor == INSPUR {
client.storagePath = strings.ReplaceAll(client.storagePath, "Storages", "Storage")
Expand All @@ -133,6 +144,7 @@ func (client *Client) findAllEndpoints() error {
if strings.Contains(root.Name, "HP RESTful") {
client.memoryPath = "/redfish/v1/Systems/1/Memory/"
client.storagePath = "/redfish/v1/Systems/1/SmartStorage/ArrayControllers/"
client.eventPath = ""
client.version = 4
}
}
Expand Down Expand Up @@ -287,23 +299,37 @@ func (client *Client) RefreshPower(mc *Collector, ch chan<- prometheus.Metric) e
return nil
}

func (client *Client) RefreshIdracSel(mc *Collector, ch chan<- prometheus.Metric) error {
if client.vendor != DELL {
func (client *Client) RefreshEventLog(mc *Collector, ch chan<- prometheus.Metric) error {
if client.eventPath == "" {
return nil
}

resp := IdracSelResponse{}
err := client.redfishGet(redfishRootPath+"/Managers/iDRAC.Embedded.1/Logs/Sel", &resp)
resp := EventLogResponse{}
err := client.redfishGet(client.eventPath, &resp)
if err != nil {
return err
}

level := config.Config.Event.SeverityLevel
maxage := config.Config.Event.MaxAgeSeconds

for _, e := range resp.Members {
st := string(e.SensorType)
if st == "" {
st = "Unknown"
t, err := time.Parse(time.RFC3339, e.Created)
if err != nil {
continue
}

d := time.Since(t)
if d.Seconds() > maxage {
continue
}

severity := health2value(e.Severity)
if severity < level {
continue
}
ch <- mc.NewSelEntry(e.Id, e.Message, st, e.Severity, e.Created)

ch <- mc.NewEventLogEntry(e.Id, e.Message, e.Severity, t)
}

return nil
Expand Down
14 changes: 7 additions & 7 deletions internal/collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ type Collector struct {
PowerControlInterval *prometheus.Desc

// System event log
SelEntry *prometheus.Desc
EventLogEntry *prometheus.Desc

// Disk drives
DriveInfo *prometheus.Desc
Expand Down Expand Up @@ -210,10 +210,10 @@ func NewCollector() *Collector {
"Interval for measurements of power control system",
[]string{"id", "name"}, nil,
),
SelEntry: prometheus.NewDesc(
prometheus.BuildFQName(prefix, "sel", "entry"),
EventLogEntry: prometheus.NewDesc(
prometheus.BuildFQName(prefix, "events", "log_entry"),
"Entry from the system event log",
[]string{"id", "message", "component", "severity"}, nil,
[]string{"id", "message", "severity"}, nil,
),
DriveInfo: prometheus.NewDesc(
prometheus.BuildFQName(prefix, "drive", "info"),
Expand Down Expand Up @@ -310,7 +310,7 @@ func (collector *Collector) Describe(ch chan<- *prometheus.Desc) {
ch <- collector.PowerControlMaxConsumedWatts
ch <- collector.PowerControlAvgConsumedWatts
ch <- collector.PowerControlInterval
ch <- collector.SelEntry
ch <- collector.EventLogEntry
ch <- collector.DriveInfo
ch <- collector.DriveHealth
ch <- collector.DriveCapacity
Expand Down Expand Up @@ -372,10 +372,10 @@ func (collector *Collector) Collect(ch chan<- prometheus.Metric) {
}()
}

if config.Config.Collect.SEL {
if config.Config.Collect.Events {
wg.Add(1)
go func() {
err := collector.client.RefreshIdracSel(collector, ch)
err := collector.client.RefreshEventLog(collector, ch)
if err != nil {
collector.errors.Add(1)
}
Expand Down
Loading

0 comments on commit afbcd89

Please sign in to comment.