Skip to content

Commit

Permalink
batchseal: Report nvme metrics to prometheus
Browse files Browse the repository at this point in the history
  • Loading branch information
magik6k committed Nov 11, 2024
1 parent 996974e commit 4c54d39
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 9 deletions.
2 changes: 1 addition & 1 deletion lib/supraffi/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ type HealthInfo struct {
// Critical warning flags
CriticalWarning byte

// Temperature information in celsius
// Temperature information in Celsius
Temperature float64
TemperatureSensors []float64
WarningTempTime time.Duration
Expand Down
117 changes: 115 additions & 2 deletions tasks/sealsupra/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,56 @@ import (
)

var (
phaseKey, _ = tag.NewKey("phase")
pre = "sealsupra_"
phaseKey, _ = tag.NewKey("phase")
nvmeDeviceKey, _ = tag.NewKey("nvme_device")
pre = "sealsupra_"
)

// SupraSealMeasures groups all SupraSeal metrics.
//
// It is a package-level value of an anonymous struct type whose fields are
// OpenCensus measures; callers reach a measure as a field, for example
// SupraSealMeasures.NVMeTemperature. Every measure name is built by
// prepending the package-level prefix pre to a metric-specific suffix.
var SupraSealMeasures = struct {
// Phase measures.
PhaseLockCount *stats.Int64Measure
PhaseWaitingCount *stats.Int64Measure
PhaseAvgDuration *stats.Float64Measure

// NVMe Health measures
NVMeTemperature *stats.Float64Measure
NVMeAvailableSpare *stats.Int64Measure
NVMePercentageUsed *stats.Int64Measure
NVMePowerCycles *stats.Int64Measure
NVMePowerOnHours *stats.Float64Measure
NVMeUnsafeShutdowns *stats.Int64Measure
NVMeMediaErrors *stats.Int64Measure
NVMeErrorLogEntries *stats.Int64Measure
NVMeCriticalWarning *stats.Int64Measure

// NVMe I/O measures: the two *Diff measures carry byte deltas, the
// remaining four carry floating-point rates.
NVMeBytesReadDiff *stats.Int64Measure
NVMeBytesWrittenDiff *stats.Int64Measure
NVMeReadIOPS *stats.Float64Measure
NVMeWriteIOPS *stats.Float64Measure
NVMeReadThroughput *stats.Float64Measure
NVMeWriteThroughput *stats.Float64Measure
}{
// Phase measures; only the average duration carries a non-dimensionless
// unit (seconds).
PhaseLockCount: stats.Int64(pre+"phase_lock_count", "Number of active locks in each phase", stats.UnitDimensionless),
PhaseWaitingCount: stats.Int64(pre+"phase_waiting_count", "Number of goroutines waiting for a phase lock", stats.UnitDimensionless),
PhaseAvgDuration: stats.Float64(pre+"phase_avg_duration", "Average duration of each phase in seconds", stats.UnitSeconds),

// NVMe Health measures
// All health measures are declared with stats.UnitDimensionless; where a
// physical unit applies it is conveyed only by the metric name or
// description (e.g. "_celsius", "_hours").
// NOTE(review): AvailableSpare and PercentageUsed are presumably
// percentages as in the NVMe SMART log — confirm against supraffi.
NVMeTemperature: stats.Float64(pre+"nvme_temperature_celsius", "NVMe Temperature in Celsius", stats.UnitDimensionless),
NVMeAvailableSpare: stats.Int64(pre+"nvme_available_spare", "NVMe Available Spare", stats.UnitDimensionless),
NVMePercentageUsed: stats.Int64(pre+"nvme_percentage_used", "NVMe Percentage Used", stats.UnitDimensionless),
NVMePowerCycles: stats.Int64(pre+"nvme_power_cycles", "NVMe Power Cycles", stats.UnitDimensionless),
NVMePowerOnHours: stats.Float64(pre+"nvme_power_on_hours", "NVMe Power On Hours", stats.UnitDimensionless),
NVMeUnsafeShutdowns: stats.Int64(pre+"nvme_unsafe_shutdowns", "NVMe Unsafe Shutdowns", stats.UnitDimensionless),
NVMeMediaErrors: stats.Int64(pre+"nvme_media_errors", "NVMe Media Errors", stats.UnitDimensionless),
NVMeErrorLogEntries: stats.Int64(pre+"nvme_error_log_entries", "NVMe Error Log Entries", stats.UnitDimensionless),
NVMeCriticalWarning: stats.Int64(pre+"nvme_critical_warning", "NVMe Critical Warning Flags", stats.UnitDimensionless),

// NVMe I/O measures.
// NOTE(review): the two throughput measures are declared with
// stats.UnitBytes although their descriptions say bytes/sec; the unit
// metadata therefore does not express the per-second rate. Confirm
// whether any exporter relies on the unit before changing it.
NVMeBytesReadDiff: stats.Int64(pre+"nvme_bytes_read_diff", "NVMe Bytes Read Diff", stats.UnitBytes),
NVMeBytesWrittenDiff: stats.Int64(pre+"nvme_bytes_written_diff", "NVMe Bytes Written Diff", stats.UnitBytes),
NVMeReadIOPS: stats.Float64(pre+"nvme_read_iops", "NVMe Read IOPS", stats.UnitDimensionless),
NVMeWriteIOPS: stats.Float64(pre+"nvme_write_iops", "NVMe Write IOPS", stats.UnitDimensionless),
NVMeReadThroughput: stats.Float64(pre+"nvme_read_throughput", "NVMe Read Throughput (bytes/sec)", stats.UnitBytes),
NVMeWriteThroughput: stats.Float64(pre+"nvme_write_throughput", "NVMe Write Throughput (bytes/sec)", stats.UnitBytes),
}

// init registers the views for SupraSeal metrics.
Expand All @@ -40,6 +77,82 @@ func init() {
Aggregation: view.LastValue(),
TagKeys: []tag.Key{phaseKey},
},
// NVMe Health views
&view.View{
Measure: SupraSealMeasures.NVMeTemperature,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeAvailableSpare,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePercentageUsed,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerCycles,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerOnHours,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeUnsafeShutdowns,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeMediaErrors,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeErrorLogEntries,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeCriticalWarning,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesReadDiff,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesWrittenDiff,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadIOPS,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteIOPS,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadThroughput,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteThroughput,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
)
if err != nil {
panic(err)
Expand Down
68 changes: 62 additions & 6 deletions tasks/sealsupra/task_supraseal.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"encoding/hex"
"encoding/json"
"fmt"
"go.opencensus.io/stats"
"go.opencensus.io/tag"
"os"
"path/filepath"
"time"
Expand Down Expand Up @@ -113,19 +115,73 @@ func NewSupraSeal(sectorSize string, batchSize, pipelines int, dualHashers bool,
log.Infow("nvme health page", "hp", hp)
}

go func() {
// TODO: put this into prometheus metrics instead of printing
// Initialize previous health infos slice
prevHealthInfos := make([]supraffi.HealthInfo, len(nvmeDevices))

for {
time.Sleep(30 * time.Second)
go func() {
const intervalSeconds = 30
ticker := time.NewTicker(time.Duration(intervalSeconds) * time.Second)
defer ticker.Stop()

hp, err := supraffi.GetHealthInfo()
for range ticker.C {
healthInfos, err := supraffi.GetHealthInfo()
if err != nil {
log.Errorw("health page get error", "error", err)
continue
}

log.Infow("nvme health page", "hp", hp)
for i, hi := range healthInfos {
if i >= len(nvmeDevices) {
log.Warnw("More health info entries than nvme devices", "index", i)
break
}
deviceName := nvmeDevices[i]

ctx, err := tag.New(
context.Background(),
tag.Insert(nvmeDeviceKey, deviceName),
)
if err != nil {
log.Errorw("Failed to create context with tags", "error", err)
continue
}

// Record the metrics
stats.Record(ctx, SupraSealMeasures.NVMeTemperature.M(hi.Temperature))
stats.Record(ctx, SupraSealMeasures.NVMeAvailableSpare.M(int64(hi.AvailableSpare)))
stats.Record(ctx, SupraSealMeasures.NVMePercentageUsed.M(int64(hi.PercentageUsed)))
stats.Record(ctx, SupraSealMeasures.NVMePowerCycles.M(int64(hi.PowerCycles)))
stats.Record(ctx, SupraSealMeasures.NVMePowerOnHours.M(hi.PowerOnHours.Hours()))
stats.Record(ctx, SupraSealMeasures.NVMeUnsafeShutdowns.M(int64(hi.UnsafeShutdowns)))
stats.Record(ctx, SupraSealMeasures.NVMeMediaErrors.M(int64(hi.MediaErrors)))
stats.Record(ctx, SupraSealMeasures.NVMeErrorLogEntries.M(int64(hi.ErrorLogEntries)))
stats.Record(ctx, SupraSealMeasures.NVMeCriticalWarning.M(int64(hi.CriticalWarning)))

// For counters, compute difference from previous values
if prevHealthInfos[i].DataUnitsRead != 0 {
dataUnitsReadBytesDiff := int64((hi.DataUnitsRead - prevHealthInfos[i].DataUnitsRead) * 512)
dataUnitsWrittenBytesDiff := int64((hi.DataUnitsWritten - prevHealthInfos[i].DataUnitsWritten) * 512)
hostReadCommandsDiff := hi.HostReadCommands - prevHealthInfos[i].HostReadCommands
hostWriteCommandsDiff := hi.HostWriteCommands - prevHealthInfos[i].HostWriteCommands

// Compute IOPS and Throughput
readIOPS := float64(hostReadCommandsDiff) / float64(intervalSeconds)
writeIOPS := float64(hostWriteCommandsDiff) / float64(intervalSeconds)
readThroughput := float64(dataUnitsReadBytesDiff) / float64(intervalSeconds)
writeThroughput := float64(dataUnitsWrittenBytesDiff) / float64(intervalSeconds)

// Record the diffs and computed metrics
stats.Record(ctx, SupraSealMeasures.NVMeBytesReadDiff.M(dataUnitsReadBytesDiff))
stats.Record(ctx, SupraSealMeasures.NVMeBytesWrittenDiff.M(dataUnitsWrittenBytesDiff))
stats.Record(ctx, SupraSealMeasures.NVMeReadIOPS.M(readIOPS))
stats.Record(ctx, SupraSealMeasures.NVMeWriteIOPS.M(writeIOPS))
stats.Record(ctx, SupraSealMeasures.NVMeReadThroughput.M(readThroughput))
stats.Record(ctx, SupraSealMeasures.NVMeWriteThroughput.M(writeThroughput))
}

// Update previous health info
prevHealthInfos[i] = hi
}
}
}()

Expand Down

0 comments on commit 4c54d39

Please sign in to comment.