Skip to content

Commit

Permalink
wip: Supraseal healthpage output (#325)
Browse files Browse the repository at this point in the history
* supraffi: healthpage api

* wire in health page getter

* batchseal: Report nvme metrics to prometheus

* make gen

* batchseal: Fix data units
  • Loading branch information
magik6k authored Nov 15, 2024
1 parent ce37e6e commit 5303e0a
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 3 deletions.
2 changes: 1 addition & 1 deletion extern/supra_seal
75 changes: 75 additions & 0 deletions lib/supraffi/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package supraffi

import "time"

// HealthInfo represents NVMe device health information in a more Go-friendly format.
//
// One value describes one controller. The cgo GetHealthInfo implementation
// builds it from the C nvme_health_info_t record, converting the raw counters
// that represent time spans into time.Duration values.
type HealthInfo struct {
// Critical warning flags: a bit set that HasWarning and GetWarnings decode
// using the Warning* constants.
CriticalWarning byte

// Temperature information in Celsius
// NOTE(review): the cgo conversion copies the raw readings unchanged and
// itself questions the unit ("celsius??") - confirm against the C layer.
Temperature float64
// Per-sensor readings; the cgo conversion drops sensors that report 0.
TemperatureSensors []float64
// Time spent above the warning / critical temperature limits. The cgo
// conversion treats the raw counters as minutes.
WarningTempTime time.Duration
CriticalTempTime time.Duration

// Reliability metrics
// NOTE(review): presumably percentages as in the NVMe SMART log - confirm.
AvailableSpare uint8
AvailableSpareThreshold uint8
PercentageUsed uint8

// Usage statistics
// NOTE(review): the NVMe SMART log reports data units in thousands of
// 512-byte units; confirm whether the C layer already rescales them before
// relying on the "512-byte units" wording below.
DataUnitsRead uint64 // in 512-byte units
DataUnitsWritten uint64 // in 512-byte units
HostReadCommands uint64
HostWriteCommands uint64
// The cgo conversion treats the raw counter as minutes.
ControllerBusyTime time.Duration

// Power and error statistics
PowerCycles uint64
// Stored as a Duration; the cgo conversion multiplies the raw hour count by
// time.Hour.
PowerOnHours time.Duration
UnsafeShutdowns uint64
MediaErrors uint64
ErrorLogEntries uint64
}

// Bit masks for HealthInfo.CriticalWarning. Each constant selects a single
// bit, starting from the least significant one; pass them to
// HealthInfo.HasWarning to test whether that condition is being reported.
const (
	WarningSpareSpace = 1 << iota
	WarningTemperature
	WarningReliability
	WarningReadOnly
	WarningVolatileMemory
	WarningPersistentMemory
)

// HasWarning reports whether any bit of flag is set in h.CriticalWarning.
// flag is normally one of the Warning* constants.
func (h *HealthInfo) HasWarning(flag byte) bool {
	masked := h.CriticalWarning & flag
	return masked != 0
}

// GetWarnings returns a human-readable description for every critical warning
// flag that is currently set, ordered from the lowest flag bit to the highest.
// The result is nil when no flag is set.
func (h *HealthInfo) GetWarnings() []string {
	// Flag/description pairs, listed in ascending bit order so the output
	// order is stable.
	table := [...]struct {
		flag byte
		text string
	}{
		{WarningSpareSpace, "available spare space has fallen below threshold"},
		{WarningTemperature, "temperature is above critical threshold"},
		{WarningReliability, "device reliability has been degraded"},
		{WarningReadOnly, "media has been placed in read only mode"},
		{WarningVolatileMemory, "volatile memory backup device has failed"},
		{WarningPersistentMemory, "persistent memory region has become read-only"},
	}

	var active []string
	for _, entry := range table {
		if !h.HasWarning(entry.flag) {
			continue
		}
		active = append(active, entry.text)
	}
	return active
}
4 changes: 4 additions & 0 deletions lib/supraffi/no_supraseal.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ func GenerateMultiString(paths []Path) (string, error) {
return buffer.String(), nil
}

// GetHealthInfo is the placeholder used when the supraseal build tag is not
// enabled (see the panic message). It never returns: calling it always
// panics, so neither result value is ever produced.
func GetHealthInfo() ([]HealthInfo, error) {
panic("GetHealthInfo: supraseal build tag not enabled")
}

// Pc2 performs the pc2 operation.
func Pc2(blockOffset uint64, numSectors int, outputDir string, sectorSize uint64) int {
panic("Pc2: supraseal build tag not enabled")
Expand Down
81 changes: 81 additions & 0 deletions lib/supraffi/seal.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,37 @@ package supraffi
#include <stdbool.h>
#include "supra_seal.h"
#include <stdlib.h>
// Per-controller NVMe health record as seen by cgo. The Go GetHealthInfo
// function in this file copies every field into a HealthInfo value.
// NOTE(review): presumably this layout must match the definition used by the
// C implementation of get_nvme_health_info - confirm before changing fields.
typedef struct nvme_health_info {
uint8_t critical_warning; // bit flags, decoded on the Go side via the Warning* constants
int16_t temperature; // NOTE(review): unit not established here; the Go side questions whether it is Celsius
uint8_t available_spare;
uint8_t available_spare_threshold;
uint8_t percentage_used;
uint64_t data_units_read;
uint64_t data_units_written;
uint64_t host_read_commands;
uint64_t host_write_commands;
uint64_t controller_busy_time; // scaled by time.Minute on the Go side
uint64_t power_cycles;
uint64_t power_on_hours; // scaled by time.Hour on the Go side
uint64_t unsafe_shutdowns;
uint64_t media_errors;
uint64_t num_error_info_log_entries;
uint32_t warning_temp_time; // scaled by time.Minute on the Go side
uint32_t critical_temp_time; // scaled by time.Minute on the Go side
int16_t temp_sensors[8]; // entries equal to 0 are dropped on the Go side
} nvme_health_info_t;
// Takes a caller-provided buffer of max_controllers records. The Go caller
// treats the return value as the number of records that were filled in and
// treats 0 as "no controllers found".
size_t get_nvme_health_info(nvme_health_info_t* health_infos, size_t max_controllers);
*/
import "C"
import (
"bytes"
"encoding/binary"
"fmt"
"time"
"unsafe"
)

Expand Down Expand Up @@ -137,6 +163,61 @@ func SupraSealInit(sectorSize uint64, configFile string) {
C.supra_seal_init(C.size_t(sectorSize), cConfigFile)
}

// GetHealthInfo queries the C layer for the health record of every NVMe
// controller it knows about and returns them as Go values, one per
// controller, in the order the C layer reported them.
//
// At most 64 controllers are requested. An error is returned when the C
// layer reports zero controllers.
func GetHealthInfo() ([]HealthInfo, error) {
	const maxControllers = 64

	// Scratch buffer that the C side fills in; it returns how many entries
	// it wrote.
	buf := make([]C.nvme_health_info_t, maxControllers)
	found := C.get_nvme_health_info(
		(*C.nvme_health_info_t)(unsafe.Pointer(&buf[0])),
		C.size_t(maxControllers),
	)
	if found == 0 {
		return nil, fmt.Errorf("no NVMe controllers found")
	}

	out := make([]HealthInfo, found)
	for idx := range out {
		out[idx] = healthInfoFromC(&buf[idx])
	}
	return out, nil
}

// healthInfoFromC converts one raw C health record into its Go form.
//
// NOTE(review): the original author flagged this mapping as "likely not
// entirely correct"; in particular the temperature unit is unconfirmed.
func healthInfoFromC(raw *C.nvme_health_info_t) HealthInfo {
	// Keep only the sensors that report a non-zero reading.
	readings := make([]float64, 0, 8)
	for _, reading := range raw.temp_sensors {
		if reading == 0 {
			continue
		}
		readings = append(readings, float64(reading))
	}

	var info HealthInfo

	info.CriticalWarning = byte(raw.critical_warning)

	info.Temperature = float64(raw.temperature) // celsius??
	info.TemperatureSensors = readings
	info.WarningTempTime = time.Duration(raw.warning_temp_time) * time.Minute
	info.CriticalTempTime = time.Duration(raw.critical_temp_time) * time.Minute

	info.AvailableSpare = uint8(raw.available_spare)
	info.AvailableSpareThreshold = uint8(raw.available_spare_threshold)
	info.PercentageUsed = uint8(raw.percentage_used)

	info.DataUnitsRead = uint64(raw.data_units_read)
	info.DataUnitsWritten = uint64(raw.data_units_written)
	info.HostReadCommands = uint64(raw.host_read_commands)
	info.HostWriteCommands = uint64(raw.host_write_commands)
	info.ControllerBusyTime = time.Duration(raw.controller_busy_time) * time.Minute

	info.PowerCycles = uint64(raw.power_cycles)
	info.PowerOnHours = time.Duration(raw.power_on_hours) * time.Hour
	info.UnsafeShutdowns = uint64(raw.unsafe_shutdowns)
	info.MediaErrors = uint64(raw.media_errors)
	info.ErrorLogEntries = uint64(raw.num_error_info_log_entries)

	return info
}

// Pc1 performs the pc1 operation.
func Pc1(blockOffset uint64, replicaIDs [][32]byte, parentsFilename string, sectorSize uint64) int {
flatReplicaIDs := make([]byte, len(replicaIDs)*32)
Expand Down
113 changes: 111 additions & 2 deletions tasks/sealsupra/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,52 @@ import (
)

var (
phaseKey, _ = tag.NewKey("phase")
pre = "sealsupra_"
phaseKey, _ = tag.NewKey("phase")
nvmeDeviceKey, _ = tag.NewKey("nvme_device")
pre = "sealsupra_"
)

// SupraSealMeasures collects every measure published by the SupraSeal tasks:
// the phase scheduling measures and the per-device NVMe health and I/O
// measures. All measure names carry the package-wide prefix.
var SupraSealMeasures = struct {
	// Phase scheduling
	PhaseLockCount    *stats.Int64Measure
	PhaseWaitingCount *stats.Int64Measure
	PhaseAvgDuration  *stats.Float64Measure

	// NVMe health
	NVMeTemperature     *stats.Float64Measure
	NVMeAvailableSpare  *stats.Int64Measure
	NVMePercentageUsed  *stats.Int64Measure
	NVMePowerCycles     *stats.Int64Measure
	NVMePowerOnHours    *stats.Float64Measure
	NVMeUnsafeShutdowns *stats.Int64Measure
	NVMeMediaErrors     *stats.Int64Measure
	NVMeErrorLogEntries *stats.Int64Measure
	NVMeCriticalWarning *stats.Int64Measure

	// NVMe I/O volume
	NVMeBytesRead    *stats.Int64Measure
	NVMeBytesWritten *stats.Int64Measure
	NVMeReadIO       *stats.Int64Measure
	NVMeWriteIO      *stats.Int64Measure
}{
	// Phase scheduling
	PhaseLockCount:    supraInt64Measure("phase_lock_count", "Number of active locks in each phase", stats.UnitDimensionless),
	PhaseWaitingCount: supraInt64Measure("phase_waiting_count", "Number of goroutines waiting for a phase lock", stats.UnitDimensionless),
	PhaseAvgDuration:  supraFloat64Measure("phase_avg_duration", "Average duration of each phase in seconds", stats.UnitSeconds),

	// NVMe health
	NVMeTemperature:     supraFloat64Measure("nvme_temperature_celsius", "NVMe Temperature in Celsius", stats.UnitDimensionless),
	NVMeAvailableSpare:  supraInt64Measure("nvme_available_spare", "NVMe Available Spare", stats.UnitDimensionless),
	NVMePercentageUsed:  supraInt64Measure("nvme_percentage_used", "NVMe Percentage Used", stats.UnitDimensionless),
	NVMePowerCycles:     supraInt64Measure("nvme_power_cycles", "NVMe Power Cycles", stats.UnitDimensionless),
	NVMePowerOnHours:    supraFloat64Measure("nvme_power_on_hours", "NVMe Power On Hours", stats.UnitDimensionless),
	NVMeUnsafeShutdowns: supraInt64Measure("nvme_unsafe_shutdowns", "NVMe Unsafe Shutdowns", stats.UnitDimensionless),
	NVMeMediaErrors:     supraInt64Measure("nvme_media_errors", "NVMe Media Errors", stats.UnitDimensionless),
	NVMeErrorLogEntries: supraInt64Measure("nvme_error_log_entries", "NVMe Error Log Entries", stats.UnitDimensionless),
	NVMeCriticalWarning: supraInt64Measure("nvme_critical_warning", "NVMe Critical Warning Flags", stats.UnitDimensionless),

	// NVMe I/O volume
	NVMeBytesRead:    supraInt64Measure("nvme_bytes_read", "NVMe Bytes Read", stats.UnitBytes),
	NVMeBytesWritten: supraInt64Measure("nvme_bytes_written", "NVMe Bytes Written", stats.UnitBytes),
	NVMeReadIO:       supraInt64Measure("nvme_read_io", "NVMe Read IOs", stats.UnitDimensionless),
	NVMeWriteIO:      supraInt64Measure("nvme_write_io", "NVMe Write IOs", stats.UnitDimensionless),
}

// supraInt64Measure creates an int64 measure whose name is the package prefix
// followed by name.
func supraInt64Measure(name, description, unit string) *stats.Int64Measure {
	return stats.Int64(pre+name, description, unit)
}

// supraFloat64Measure creates a float64 measure whose name is the package
// prefix followed by name.
func supraFloat64Measure(name, description, unit string) *stats.Float64Measure {
	return stats.Float64(pre+name, description, unit)
}

// init registers the views for SupraSeal metrics.
Expand All @@ -40,6 +73,82 @@ func init() {
Aggregation: view.LastValue(),
TagKeys: []tag.Key{phaseKey},
},
// NVMe Health views
&view.View{
Measure: SupraSealMeasures.NVMeTemperature,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeAvailableSpare,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePercentageUsed,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerCycles,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMePowerOnHours,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeUnsafeShutdowns,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeMediaErrors,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeErrorLogEntries,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeCriticalWarning,
Aggregation: view.LastValue(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesRead,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeBytesWritten,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeReadIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
&view.View{
Measure: SupraSealMeasures.NVMeWriteIO,
Aggregation: view.Sum(),
TagKeys: []tag.Key{nvmeDeviceKey},
},
)
if err != nil {
panic(err)
Expand Down
Loading

0 comments on commit 5303e0a

Please sign in to comment.